今天用wxPython做了一个GUI程序,实现查找指定目录内的相同文件,主要原理是计算文件的md5值(计算前先找出文件大小相同的文件,然后计算这些文件的md5值,而不是所有文件都计算,大大减少了md5的计算量),加入了多线程功能。
以下是其脚本版本(无需安装wxPython)
UNIQFile-script.py
'''
Find, and optionally remove, duplicate files below a directory.

Files are first grouped by size; only files that share their size with at
least one other file are hashed with MD5, which avoids hashing every file.
Files with the same size and the same MD5 digest are reported as duplicates.

The code runs on both Python 2 and Python 3.

Author:@DoNotSpyOnMe
Blog: http://www.cnblogs.com/aaronhoo
'''

import hashlib
import os
import threading

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3


def getFileSize(filePath):
    '''Return the size of filePath in bytes (raises OSError if unreadable).'''
    return os.path.getsize(filePath)


def CalcMD5(filepath):
    '''Return the MD5 hex digest of filepath, reading the file in one go.

    Suitable for ordinary files only: the whole content is held in memory.
    Use GetFileMd5 for large files.
    '''
    with open(filepath, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()


def GetFileMd5(filename, chunk_size=8 * 1024):
    '''Return the MD5 hex digest of filename, reading it chunk by chunk.

    Reading in chunks keeps memory usage flat for large files.

    filename   -- path of the file to hash.
    chunk_size -- number of bytes read per iteration (default 8 KiB).

    Returns None when filename is not a regular file.
    '''
    if not os.path.isfile(filename):
        return None
    digest = hashlib.md5()
    # "with" guarantees the handle is closed even if a read fails.
    with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def GetAllFiles(directory):
    '''Return the paths of all files below directory, shortest path first.'''
    files = [os.path.join(dirpath, name)
             for dirpath, _dirnames, filenames in os.walk(directory)
             for name in filenames]
    # Sort by path length.  The sort is stable and grouping keeps this order,
    # so the first (kept) member of every duplicate group is a shortest path.
    files.sort(key=len)
    return files


def _group_duplicates(files, key_func):
    '''Group files by key_func(path); keep only groups with 2+ members.

    Paths for which key_func returns None are skipped.  Returns a dict
    mapping each key to the list of paths sharing it, in input order.
    '''
    groups = {}
    for path in files:
        key = key_func(path)
        if key is None:
            continue
        groups.setdefault(key, []).append(path)
    return dict((key, paths) for key, paths in groups.items()
                if len(paths) > 1)


def _md5_with_progress(path):
    '''Announce path on stdout, then return its MD5 digest (or None).'''
    print('calculating the md5 value of file %s' % path)
    return GetFileMd5(path)


def _join_groups(groups):
    '''Convert {key: [paths]} into the public {key: 'p1;p2;...'} form.'''
    return dict((key, ';'.join(paths)) for key, paths in groups.items())


def findSameSizeFiles(files):
    '''Return {size: 'path1;path2;...'} for every size shared by 2+ files.'''
    return _join_groups(_group_duplicates(files, getFileSize))


def findSameMD5Files(files):
    '''Return {md5: 'path1;path2;...'} for every digest shared by 2+ files.

    Paths that are not regular files (GetFileMd5 returns None for them) are
    skipped instead of being grouped together under a None key.
    '''
    return _join_groups(_group_duplicates(files, _md5_with_progress))


def _find_duplicate_groups(files):
    '''Return {md5: [paths]} for files equal in both size and MD5 digest.

    Groups are kept as lists rather than ';'-joined strings so that paths
    containing ';' are never split into bogus file names.
    '''
    duplicates = {}
    for same_size in _group_duplicates(files, getFileSize).values():
        duplicates.update(_group_duplicates(same_size, _md5_with_progress))
    return duplicates


def _describe_groups(duplicates):
    '''Return the human-readable listing of all duplicate groups.'''
    # NOTE(review): the separators in the messages of this script are single
    # spaces in the published source; they may originally have been newlines
    # lost when the post was flattened -- confirm before changing them.
    return ''.join('md5 %s: %s' % (digest, ';'.join(paths)) + ' '
                   for digest, paths in duplicates.items())


def removeSameFile(mydir):
    '''Delete duplicate files below mydir, keeping one file per group.

    The first file of each group (a shortest path, see GetAllFiles) is kept
    and the others are removed with os.remove.  Progress and the final report
    are printed to stdout.  Any exception is printed rather than propagated,
    and the message built so far is still reported.
    '''
    msg = ''
    msgUniq = 'Result:No file is removed since they are all uniq.'
    try:
        files = GetAllFiles(mydir)
        print('%s files found in directory %s ' % (len(files), mydir))
        duplicates = _find_duplicate_groups(files)
        if not duplicates:
            msg = msgUniq
            return
        # List the duplicated files first ...
        msg = 'Duplicated files: ' + _describe_groups(duplicates) + ' '
        # ... then remove every member of a group except the first one.
        removeCount = 0
        for paths in duplicates.values():
            for path in paths[1:]:
                msg = msg + 'Removing file: %s' % path + ' '
                os.remove(path)
                removeCount = removeCount + 1
        msg = msg + '%s files are removed. ' % removeCount
    except Exception as e:
        # Top-level boundary of the interactive tool: report and carry on.
        print(e)
    finally:
        print(msg + ' ' + 'Operation finished.')


def listSameFile(mydir):
    '''Print the groups of duplicate files found below mydir.

    Nothing is modified on disk.  Any exception is printed rather than
    propagated.
    '''
    msg = ''
    msgUniq = 'Result:All files are uniq.'
    try:
        files = GetAllFiles(mydir)
        print('%s files found in directory %s ' % (len(files), mydir))
        duplicates = _find_duplicate_groups(files)
        if duplicates:
            msg = 'Duplicated files: ' + _describe_groups(duplicates)
        else:
            msg = msgUniq
    except Exception as e:
        # Top-level boundary of the interactive tool: report and carry on.
        print(e)
    finally:
        print(msg + ' ' + 'Operation finished.')


def main():
    '''Run the interactive prompt until the user chooses to quit.'''
    print('This program is designed for clearing the duplicated files and saving memory space.Select a directory and we will find or remove the duplicated files.')
    print('All rights are reserved by @DoNotSpyOnMe')
    print(' ')

    print("You have three options:")
    print("'f' for finding the duplicated files in the directory that you're required to enter later,or")
    print("'r' for finding and the removing the duplicated file,or")
    print("'q' to quit")
    while True:
        # Every prompt is normalised the same way, so 'F' is accepted on a
        # retry just as it is on the first attempt.
        option = _read_line('Please enter your option: ').strip().lower()
        while option not in ('f', 'r', 'q'):
            option = _read_line('Please enter your option: ').strip().lower()
        if option == 'q':
            return
        # The directory is used exactly as typed: lower-casing it would break
        # case-sensitive file systems, and os.path.isdir is the only check
        # needed on any platform.
        mydir = _read_line('Please enter the direcotry containing files: ').strip()
        while not os.path.isdir(mydir):
            mydir = _read_line('Please enter a valid direcotry containing files: ').strip()
        if option == 'f':
            listSameFile(mydir)
        else:
            removeSameFile(mydir)
        print('')


if __name__ == "__main__":
    main()