zoukankan      html  css  js  c++  java
  • python实现查看目录下重复的文件【转】

    转自:https://www.cnblogs.com/perfei/p/6138214.html

    该python 脚本有以下三个功能:

    1. 实现查看目录下重复的文件,输出文件按修改时间升序排列

    2. 将按修改时间排列比较旧的、可删除的文件列出来

    3. 按目录对重复文件进行统计,比如,目录/tmp  重复个数5,是指/tmp目录下有5个文件在其他地方也存在

    python脚本

    复制代码
    #!/usr/bin/env python
    #coding=utf-8
    '''
    Created on Nov 30, 2016
    
    @author: fangcheng
    '''
    from __future__ import print_function
    from operator import itemgetter  
    import os
    import time
    
    'tt为浮点型日期,换化为年月日时分秒格式时间'
    def timeYS(tt):
        '''
        Format a timestamp as a local-time "YYYY-MM-DD HH:MM:SS" string.

        tt: seconds since the epoch (float or int), e.g. a file mtime.
        '''
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(tt))
            
    class File(object):
        '''
        Find redundant (duplicate) files under a directory tree.

        Two files are treated as duplicates when they share the same base name
        and the same size in bytes; file contents are never compared.

        Attributes:
            allfilecount: number of regular files visited by the last scan.
            rddfilecount: number of duplicates found, not counting the first
                file seen for each name/size key.
            singlefiles: maps a 'name_size' key to the info tuple of the first
                file seen with that key.
            rddfiles: maps a 'name_size' key to the list of info tuples of all
                files sharing that key (only keys with at least two files).
            rdddirs: maps a directory path to the number of files directly
                inside it that belong to some duplicate group.
        '''
        # Class-level defaults are kept so that reading e.g. File.allfilecount
        # still works; every instance gets its own containers in __init__, so
        # these shared dicts are never mutated.
        allfilecount = 0
        rddfilecount = 0
        singlefiles = {}
        rddfiles = {}
        rdddirs = {}

        def __init__(self):
            '''
            Create a finder with empty, per-instance scan state.
            '''
            self._reset()

        def _reset(self):
            '''
            Drop every result of a previous scan, including directory counts.
            '''
            self.allfilecount = 0
            self.rddfilecount = 0
            self.singlefiles = {}
            self.rddfiles = {}
            self.rdddirs = {}

        @staticmethod
        def _format_size(size):
            '''
            Render a byte count as a whole number suffixed with B, K or M.
            '''
            if size <= 1024:
                return '{0}B'.format(size)
            # Floor division keeps whole numbers on Python 3 as well; plain
            # "/" would yield values such as 3.41796875K there.
            if size <= 1024 * 1024:
                return '{0}K'.format(size // 1024)
            return '{0}M'.format(size // 1024 // 1024)

        def _count_dir(self, dirpath):
            '''
            Add one duplicate to the counter of directory dirpath.
            '''
            self.rdddirs[dirpath] = self.rdddirs.get(dirpath, 0) + 1

        def _record(self, filepath, size, info):
            '''
            Register one file and update the duplicate bookkeeping.

            filepath: path of the file.
            size: its size in bytes (part of the duplicate key).
            info: the (filepath, mtime, size) tuple stored for reporting.
            '''
            filekey = '{0}_{1}'.format(os.path.basename(filepath), size)

            # "in" works on Python 2 and 3; dict.has_key() is Python 2 only.
            if filekey not in self.singlefiles:
                self.singlefiles[filekey] = info
                return

            self.rddfilecount += 1
            self._count_dir(os.path.dirname(filepath))

            if filekey in self.rddfiles:
                self.rddfiles[filekey].append(info)
            else:
                # First duplicate for this key: the file seen earlier joins the
                # group too, and its parent directory is counted as well.
                first = self.singlefiles[filekey]
                self.rddfiles[filekey] = [info, first]
                self._count_dir(os.path.dirname(first[0]))

        def getFileMsg(self, filepath):
            '''
            Return file information as the tuple (filepath, ftime, size).

            ftime is the modification time formatted by timeYS and size is a
            string such as '101B', '3K' or '55M'. An empty tuple is returned
            when filepath is not a regular file.
            '''
            if not os.path.isfile(filepath):
                return ()
            size = self._format_size(os.path.getsize(filepath))  # bytes -> text
            ftime = timeYS(os.path.getmtime(filepath))
            return (filepath, ftime, size)

        def setRedundanceFile(self, filepath):
            '''
            Recursively scan filepath and record files that look duplicated.

            Duplicates are decided by file name and size:
            1. Every file below the directory is visited.
            2. Name and size form a key in singlefiles whose value is the
               file information returned by getFileMsg.
            3. If the key already exists, the file is a duplicate and its
               information is appended to the list stored in rddfiles.
            4. The parent directory of every file in a duplicate group gets
               its counter in rdddirs increased by one.

            Paths that are neither directories nor regular files are ignored.
            Filesystem errors are printed and the scan continues.
            '''
            try:
                if os.path.isdir(filepath):
                    for name in os.listdir(filepath):
                        self.setRedundanceFile(os.path.join(filepath, name))
                elif os.path.isfile(filepath):
                    self.allfilecount += 1
                    size = os.path.getsize(filepath)
                    info = self.getFileMsg(filepath)
                    # info is empty if the file vanished between the checks.
                    if info:
                        self._record(filepath, size, info)
            except OSError as e:
                # Best effort: report unreadable entries and keep scanning.
                # Only OSError is caught so programming errors are not hidden.
                print(e)

        def showFileCount(self):
            '''
            Print the number of files visited by the last scan.
            '''
            print(self.allfilecount)

        def showRedundanceFile(self, filepath):
            '''
            Scan filepath and print every group of duplicate files.

            Duplicates are decided by file name and size. Within a group the
            files are listed by ascending modification time.
            '''
            self._reset()
            self.setRedundanceFile(filepath)

            print('the total file num:{0},the redundance file num(not including the first file):{1}'.format(self.allfilecount,self.rddfilecount))
            print('-----------------------------------------')
            for group in self.rddfiles.values():
                # The formatted timestamp sorts chronologically as text.
                for info in sorted(group, key=itemgetter(1)):
                    print(info)
                print('')
            print('------------------------------------------')

        def showCanRemoveFile(self, filepath):
            '''
            Scan filepath and print the paths of the older duplicates.

            Duplicates are decided by file name and size. In every group the
            most recently modified file is kept and all others are printed.
            '''
            self._reset()
            self.setRedundanceFile(filepath)

            rmlist = []
            for group in self.rddfiles.values():
                ordered = sorted(group, key=itemgetter(1))
                ordered.pop()  # the newest file is the one to keep
                rmlist.extend(ordered)
            for info in rmlist:
                print(info[0])

        def rdddirstat(self):
            '''
            Print the number of duplicate files per directory.

            Example: "/tmp 5" means that 5 files located in /tmp also exist
            somewhere else in the scanned tree.
            '''
            if self.rdddirs:
                print('The redundance file statistics by dirs:')
                for dirpath, count in self.rdddirs.items():
                    print('{0} {1}'.format(dirpath, count))
            else:
                print('There are no redundance files')
            
    if __name__ == '__main__':
        # Scan the directory the script is started from; put a fixed path
        # such as '/scripts' here to scan another location.
        scan_root = os.getcwd()
        finder = File()

        # Print every group of duplicate files.
        finder.showRedundanceFile(scan_root)
        # Optional: print the older duplicates that could be removed.
        # finder.showCanRemoveFile(scan_root)
        # Print the number of duplicate files per directory.
        finder.rdddirstat()
    复制代码

    脚本添加执行权限后,可直接在服务器上执行
    chmod +x findrdd.py

    linux上执行示例
    复制代码
    [root@bak scripts]# ./findrdd.py 
    the total file num:33,the redundance file num(not including the first file):5
    -----------------------------------------
    ('/scripts/bkapp.sh', '2016-03-09 16:31:03', '3K')
    ('/scripts/esgcc/bkapp.sh', '2016-03-10 11:06:06', '3K')
    
    ('/scripts/show_rollbak.txt', '2016-03-09 10:50:02', '2K')
    ('/scripts/esgcc/show_rollbak.txt', '2016-03-10 11:06:06', '2K')
    
    ('/scripts/esgcc/deploy.sh', '2016-03-10 11:36:19', '8K')
    ('/scripts/deploy.sh', '2016-03-11 11:42:04', '8K')
    
    ('/scripts/rollback.sh', '2016-03-10 10:22:33', '10K')
    ('/scripts/esgcc/rollback.sh', '2016-03-10 11:06:06', '10K')
    
    ('/scripts/show_deploy.txt', '2016-03-09 10:50:02', '2K')
    ('/scripts/esgcc/show_deploy.txt', '2016-03-10 11:06:06', '2K')
    
    ------------------------------------------
    The redundance file statistics by dirs:
    /scripts 5
    /scripts/esgcc 5
    复制代码

    windows上执行示例(需要安装python):

    复制代码
    C:\Users\fei\Desktop\tmp>python findrdd.py
    the total file num:42,the redundance file num(not including the first file):10
    -----------------------------------------
    ('C:\Users\fei\Desktop\tmp\build\build\src\application\application.css', '2016-11-22 13:11:51', '101B')
    ('C:\Users\fei\Desktop\tmp\build\project\src\application\application.css', '2016-11-22 13:11:51', '101B')
    ('C:\Users\fei\Desktop\tmp\build\build\classes\application\application.css', '2016-11-22 13:11:53', '101B')
    
    ('C:\Users\fei\Desktop\tmp\build\project\src\login\Login.java', '2016-11-22 13:11:51', '3K')
    ('C:\Users\fei\Desktop\tmp\build\build\src\login\Login.java', '2016-11-22 13:11:52', '3K')
    
    ('C:\Users\fei\Desktop\tmp\build\dist\LoginCSS.jar', '2016-11-22 13:11:53', '55K')
    ('C:\Users\fei\Desktop\tmp\build\deploy\LoginCSS.jar', '2016-11-22 13:11:54', '55K')
    
    ('C:\Users\fei\Desktop\tmp\build\project\src\login\background.jpg', '2016-11-22 13:11:51', '51K')
    ('C:\Users\fei\Desktop\tmp\build\build\src\login\background.jpg', '2016-11-22 13:11:52', '51K')
    ('C:\Users\fei\Desktop\tmp\build\build\classes\login\background.jpg', '2016-11-22 13:11:53', '51K')
    
    ('C:\Users\fei\Desktop\tmp\build\project\src\application\Main.java', '2016-11-22 13:11:50', '633B')
    ('C:\Users\fei\Desktop\tmp\build\build\src\application\Main.java', '2016-11-22 13:11:51', '633B')
    
    ('C:\Users\fei\Desktop\tmp\build\project\src\login\Test.java', '2016-11-22 13:11:51', '443B')
    ('C:\Users\fei\Desktop\tmp\build\build\src\login\Test.java', '2016-11-22 13:11:52', '443B')
    
    ('C:\Users\fei\Desktop\tmp\build\project\src\login\Login.css', '2016-11-22 13:11:51', '2K')
    ('C:\Users\fei\Desktop\tmp\build\build\src\login\Login.css', '2016-11-22 13:11:52', '2K')
    ('C:\Users\fei\Desktop\tmp\build\build\classes\login\Login.css', '2016-11-22 13:11:53', '2K')
    
    ------------------------------------------
    The redundance file statistics by dirs:
    C:\Users\fei\Desktop\tmp\build\build\src\application 2
    C:\Users\fei\Desktop\tmp\build\deploy 1
    C:\Users\fei\Desktop\tmp\build\build\classes\application 1
    C:\Users\fei\Desktop\tmp\build\project\src\login 4
    C:\Users\fei\Desktop\tmp\build\dist 1
    C:\Users\fei\Desktop\tmp\build\build\classes\login 2
    C:\Users\fei\Desktop\tmp\build\project\src\application 2
    复制代码
    上面的输出结果不包含第二个方法(输出可删除文件列表)的内容,因为脚本中对该方法的调用已被注释掉。该删除方式仅供参考:是否按“最新修改的文件就是有效文件、其他文件皆可不要”的规则进行筛选,需自行决定。
    【作者】张昺华
    【大饼教你学系列】https://edu.csdn.net/course/detail/10393
    【新浪微博】 张昺华--sky
    【twitter】 @sky2030_
    【微信公众号】 张昺华
    本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接,否则保留追究法律责任的权利.
  • 相关阅读:
    Java_适配器模式
    linux常用命令整理
    (转)使用隐藏的iframe 隐藏form提交,仿AJax无刷新提交,可以实现无刷新上传文件
    mysql添加并返回主键
    学习RMI
    关于bcprov-jdk16
    JavaScript在页面中的引用方法
    通过CFX发布WebService(一)
    字符串和json数据的转换
    MD5 加密与解密
  • 原文地址:https://www.cnblogs.com/sky-heaven/p/15075073.html
Copyright © 2011-2022 走看看