# coding=utf-8
from __future__ import print_function

import binascii
import hashlib
import os
def getFilesize(path, sizes=None, errors=None):
    """Recursively walk ``path`` and group every file found by its size.

    Args:
        path: File or directory to scan. Directories are descended into
            recursively; anything else is treated as a file.
        sizes: Dict mapping size in bytes -> list of paths, updated in
            place. Defaults to the module-level ``filesizes`` dict.
        errors: List that receives every path that could not be scanned.
            Defaults to the module-level ``errfile`` list.

    Returns:
        None. Results are accumulated in ``sizes`` and ``errors``.
    """
    # Fall back to the module-level accumulators so existing callers that
    # pass only ``path`` keep working unchanged.
    if sizes is None:
        sizes = filesizes
    if errors is None:
        errors = errfile
    try:
        if os.path.isdir(path):
            for entry in os.listdir(path):
                # os.path.join avoids the doubled separator ("./" + "/" + x)
                # and uses the platform's separator.
                getFilesize(os.path.join(path, entry), sizes, errors)
        else:
            sizes.setdefault(os.path.getsize(path), []).append(path)
    except OSError as e:
        # Best-effort scan: report the problem, remember the path, move on.
        print(e)
        errors.append(path)
def getSizeEq(filesizes):
    """Drop every size bucket that holds at most one file.

    Only sizes shared by two or more files can contain duplicates, so the
    remaining buckets are the candidates for content comparison.

    Args:
        filesizes: Dict mapping size -> list of paths. Modified in place.

    Returns:
        The same ``filesizes`` dict, now containing only buckets with more
        than one path.
    """
    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating it directly raises RuntimeError on Python 3.
    for size in list(filesizes):
        if len(filesizes[size]) <= 1:
            del filesizes[size]
    return filesizes
def getFileHash(files):
    """Return the MD5 hex digest of the file at path ``files``.

    The file is read in binary mode in fixed-size chunks, so arbitrarily
    large files can be hashed without loading them fully into memory.

    Args:
        files: Path of the file to hash.

    Returns:
        The 32-character lowercase hexadecimal MD5 digest.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    # ``with`` guarantees the handle is closed even if a read fails.
    with open(files, 'rb') as src:
        for chunk in iter(lambda: src.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()
def getCrc32(files):
    """Return the CRC-32 checksum of the file at path ``files``.

    The file is opened in binary mode: text mode would translate newlines
    (and stop at Ctrl-Z on Windows) and hands ``str`` instead of ``bytes``
    to ``binascii.crc32`` on Python 3. Data is processed in chunks using a
    running checksum.

    Args:
        files: Path of the file to checksum.

    Returns:
        The checksum as an unsigned 32-bit integer (0 for an empty file).

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    crc = 0
    with open(files, 'rb') as src:
        for chunk in iter(lambda: src.read(65536), b''):
            crc = binascii.crc32(chunk, crc)
    # Python 2 returns a signed value; mask so the result is the same
    # unsigned number on every Python version.
    return crc & 0xffffffff
def Eq(val1, val2):
    """Return True when ``val1 == val2``, otherwise False.

    The comparison result is coerced with ``bool`` so the function always
    returns a real boolean, whatever ``__eq__`` itself returns.
    """
    return bool(val1 == val2)
#######################
# Script body: scan ``path`` recursively, keep only files whose size is
# shared with at least one other file, fingerprint each such file once
# (MD5 + CRC-32), and report every group of files with equal fingerprints.
filesizes = {}  # size in bytes -> paths of all scanned files of that size
Eqlfiles = {}   # MD5 hex digest -> paths of files with identical content
errfile = []    # paths that could not be scanned or read
path = "./"     # default root directory to scan

getFilesize(path)
filesizes = getSizeEq(filesizes)

for key in filesizes:  # each size shared by two or more files
    # Fingerprint every candidate exactly once and bucket by fingerprint,
    # instead of re-hashing each earlier file for every later file.
    by_fingerprint = {}
    for name in filesizes[key]:
        try:
            fingerprint = (getFileHash(name), getCrc32(name))
        except (IOError, OSError) as e:
            # Same best-effort policy as the directory scan: report the
            # unreadable file, record it, and keep going.
            print(e)
            errfile.append(name)
            continue
        by_fingerprint.setdefault(fingerprint, []).append(name)
    for (md5, _crc), names in by_fingerprint.items():
        if len(names) > 1:  # only real duplicates are reported
            Eqlfiles.setdefault(md5, []).extend(names)

print(u'错误的文件:', errfile)
for key in Eqlfiles:
    print('--------------------------------------------------------------------')
    for i in Eqlfiles[key]:
        print(u'文件', i, key)
# Source: http://www.codesnippet.cn/detail/2707201614886.html