zoukankan      html  css  js  c++  java
  • python 自定义 遍历文件夹下所有文件

    1. 两种方式返回的列表顺序不一样,内容一样
    import os
    
    # Method 1: manual recursion (each sub-directory is finished before the next entry)
    def get_process_files(root_dir):
        """Return the paths of all files under ``root_dir``, recursively.

        Entries are visited in ``os.listdir`` order. When an entry is a
        directory, its whole subtree is collected before the next entry of the
        current directory is looked at.

        Args:
            root_dir: Directory to scan; it is made absolute with
                ``os.path.abspath`` before listing.

        Returns:
            A list of absolute file paths (empty when no files are found).
        """
        cur_dir = os.path.abspath(root_dir)
        process_list = []
        for name in os.listdir(cur_dir):
            # os.path.join supplies the platform separator, so no hand-written
            # backslash literal is needed.
            fullfile = os.path.join(cur_dir, name)
            if os.path.isfile(fullfile):
                process_list.append(fullfile)
            elif os.path.isdir(fullfile):
                # An empty sub-directory simply contributes nothing.
                process_list.extend(get_process_files(fullfile))
        return process_list
    
    # Demo call for method 1; the raw string keeps the backslash in the Windows path literal.
    print(get_process_files(r'C:\test'))
    
    
    # Method 2: os.walk (all files of one directory are returned before the next directory is visited)
    def get_process_files(path):
        """Collect the path of every file that ``os.walk(path)`` reports.

        Args:
            path: Directory handed to ``os.walk``.

        Returns:
            A list with one ``os.path.join(directory, file_name)`` entry per
            file, in the order ``os.walk`` yields them.
        """
        collected = []
        # Each os.walk step yields (directory, sub-directory names, file names).
        for folder, _subdirs, names in os.walk(path):
            collected.extend(os.path.join(folder, name) for name in names)
        return collected
    
    # Demo call for method 2; the raw string keeps the backslash in the Windows path literal.
    print(get_process_files(r'C:\test'))
    
    
    2. 查找文件内容工具(txt,doc,pdf)
    
    import os
    import re
    import sys
    from docx import Document
    import PyPDF2
    
    
    
    # File extensions that are eligible for searching.
    FileTypeList = [
        ".xml",
        ".tpl",
        ".txt",
        ".doc",
        ".docx",
        ".pdf",
    ]
    # Substrings a file name must contain to be searched; an empty list means
    # "search every file". Example: FileNameFilterList = ['1.txt']
    FileNameFilterList = []
    # Paths of the files queued for searching.
    fileNameList = []
    
    
    # Does the folder exist?
    def isFolderExist(dir):
        """Return True only when ``dir`` is an existing directory.

        ``os.path.isdir`` is used rather than ``os.path.exists`` so that the
        path of a regular file is rejected as well; a check based on mere
        existence would let a file path through to code that lists the
        directory's contents.

        Args:
            dir: Path to test.

        Returns:
            True if ``dir`` exists and is a directory, otherwise False.
        """
        return os.path.isdir(dir)
    
    
    # Restrict the search to specific file names
    def isFileNameContainStr(fileNameFilterStrList, filename):
        """Tell whether ``filename`` passes the file-name filter.

        Args:
            fileNameFilterStrList: Substrings to look for; an empty list
                disables the filter.
            filename: File name to test.

        Returns:
            True when the filter list is empty or at least one of its entries
            occurs inside ``filename``; False otherwise.
        """
        return len(fileNameFilterStrList) == 0 or any(
            fragment in filename for fragment in fileNameFilterStrList
        )
    
    
    # Restrict the search to specific file types
    def isFileNameContainType(typeList, filename):
        """Tell whether the extension of ``filename`` is one of ``typeList``.

        The comparison ignores case, so ``REPORT.TXT`` matches ``".txt"``.

        Args:
            typeList: Accepted extensions including the leading dot
                (e.g. ``".txt"``); an empty list accepts every file.
            filename: File name to test.

        Returns:
            True when ``typeList`` is empty or the extension matches one of
            its entries; False otherwise.
        """
        if len(typeList) == 0:  # no type given: accept everything
            return True
        # The extension is computed once, outside the loop.
        extension = os.path.splitext(filename)[1].lower()
        return any(extension == wanted.lower() for wanted in typeList)
    
    
    # Return the file type
    def file_type(filename):
        """Return the extension of ``filename`` (leading dot included) in upper case.

        A name without an extension yields an empty string.
        """
        _stem, extension = os.path.splitext(filename)
        return extension.upper()
    
    
    # Build the list of files to search, keeping only those that pass the filters
    def listFile(path, fileNameFilterList, typeList):
        """Recursively append matching files under ``path`` to the global ``fileNameList``.

        Args:
            path: Directory to scan.
            fileNameFilterList: File-name filter handed to
                ``isFileNameContainStr``.
            typeList: Extension filter handed to ``isFileNameContainType``.

        Returns:
            False when ``isFolderExist(path)`` is false, True otherwise.
        """
        if not isFolderExist(path):
            return False
        for filename in os.listdir(path):
            # os.path.join supplies the platform separator, so no hand-written
            # backslash literal is needed.
            fullpath = os.path.join(path, filename)
            if os.path.isdir(fullpath):
                # Hand the caller's name filter down (not the module-level
                # FileNameFilterList) so sub-directories are filtered the same
                # way as the top-level directory.
                listFile(fullpath, fileNameFilterList, typeList)
            elif os.path.isfile(fullpath):
                if (isFileNameContainStr(fileNameFilterList, filename)
                        and isFileNameContainType(typeList, filename)):
                    fileNameList.append(fullpath)
        return True
    
    
    
    # FindStrList        = ['test2','t2']            # 文件内容匹配列表
    # # 搜索txt
    # def findFromFile(filename, strlist):
    #     file = open(filename)
    #     count = 0
    #     for line in file:
    #         #if '$' in line:
    #         #    continue
    #         count = count+1
    #         isContained = True
    #         for str in strlist:
    #             if str not in line:
    #                 isContained = False
    #                 break
    #         if isContained == True:
    #             print (f'{filename}; line:{count},{line}')
    #     file.close()
    
    # # 搜索过滤后 fileNameList 列表
    # def findFromDir(strlist):
    #     for name in fileNameList:
    #         findFromFile(name, strlist)
    
    
    # Search a text file (case-sensitive)
    def find_txt(filename, str):
        """Print every line of a text file that contains ``str``.

        Each hit is printed as ``<filename>; line:<n> (<stripped line>)`` with
        1-based line numbers. The match is case-sensitive.

        Args:
            filename: Path of the text file to scan.
            str: Substring to look for.
        """
        # ``with`` closes the file even if reading fails part-way through;
        # errors="replace" keeps one undecodable byte from aborting the scan.
        with open(filename, errors="replace") as file:
            for count, line in enumerate(file, 1):
                if str in line:
                    print(f'{filename}; line:{count} ({line.strip()})')
    
    
    # Search a Word document (case-sensitive)
    def find_doc(filename, str):
        """Print every paragraph of a Word document that contains ``str``.

        The file is opened with ``Document(filename)``. Each hit is printed as
        ``<filename>; line:<n> (<paragraph text>)``, where ``n`` is the 1-based
        position of the paragraph in ``document.paragraphs``.

        Args:
            filename: Path of the document to scan.
            str: Substring to look for.
        """
        document = Document(filename)
        texts = [paragraph.text for paragraph in document.paragraphs]
        for count, text in enumerate(texts, 1):
            if str in text:
                print(f'{filename}; line:{count} ({text})')
    
    
    # Search a PDF
    def find_pdf(filename, str):
        """Print every extracted text line of a PDF that contains ``str``.

        Each hit is printed as ``<filename>; page:<p>,line:<i> (<line>)`` where
        ``p`` is the 1-based page number and ``i`` is the 0-based position of
        the line within that page's extracted text (split on ``'\\n'``).

        Args:
            filename: Path of the PDF file to scan.
            str: Substring to look for.
        """
        # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
        # legacy PyPDF2 names; newer releases renamed them - confirm against
        # the installed version.
        # ``with`` closes the file once all pages have been read.
        with open(filename, 'rb') as file:
            fileReader = PyPDF2.PdfFileReader(file)
            for i in range(fileReader.numPages):
                # Extract the page text once and reuse it for both checks.
                text = fileReader.getPage(i).extractText()
                if str not in text:
                    continue
                # enumerate gives each line its own position, so duplicated
                # lines are not all reported at the index of the first copy.
                for index, line in enumerate(text.split('\n')):
                    if str in line:
                        print(f'{filename}; page:{i+1},line:{index} ({line})')
    
    
    
    # Search every file in the filtered fileNameList
    def findFromDir(str):
        """Search each path in the global ``fileNameList`` for ``str``.

        The upper-cased extension returned by ``file_type`` selects the
        searcher: ``find_txt`` for .TXT/.XML/.TPL, ``find_doc`` for
        .DOC/.DOCX and ``find_pdf`` for .PDF. Files with any other extension
        are skipped.

        Args:
            str: Substring to look for.
        """
        for name in fileNameList:
            kind = file_type(name)
            # .TPL is part of FileTypeList, so it is searched as plain text
            # instead of being collected and then silently ignored.
            if kind in ('.TXT', '.XML', '.TPL'):
                find_txt(name, str)
            elif kind in ('.DOC', '.DOCX'):
                # NOTE(review): python-docx presumably only reads .docx
                # packages; legacy .doc files may raise here - confirm.
                find_doc(name, str)
            elif kind == '.PDF':
                find_pdf(name, str)
    
    
    # Interactive input
    def askInput():
        """Prompt for the folder path and the search text.

        The folder is asked for first and the search text second; a blank line
        is printed afterwards.

        Returns:
            A ``(text, path)`` tuple - note the search text comes first.
        """
        folder = input('What is folder path ? ')
        needle = input('Searching for what? ')
        print()
        return needle, folder
    
    
    
    if __name__ == "__main__":
        # Script entry point: ask what to search for and where, collect the
        # matching files, then search them.
        # Hard-coded alternative to the prompt:
        #   DIR = "C:\test"; FindStrList = 'test'
        FindStrList, DIR = askInput()
        if listFile(DIR, FileNameFilterList, FileTypeList):
            findFromDir(FindStrList)
            print ("FIND END")
        else:
            # listFile returned False: report it and stop.
            print ("FILE PATH ERROR")
            sys.exit()
    
    
  • 相关阅读:
    获取最近6个月的年月(yyyyMM,不包括当月)
    checkbox与<c:forEach>在开发中遇到的问题记录
    MyBatis开发-->增删改
    MyBatis开发-->接口方式编程
    MyBatis开发-->入门
    android-async-http框架之与网络进行数据交互
    android-async-http框架之与服务器进行数据交互
    jQuery截取{}里的字符串及获取json里的值
    SSH整合之三:添加Hibernate环境且使之与Spring进行整合
    angular源码剖析之Provider系列--QProvider
  • 原文地址:https://www.cnblogs.com/amize/p/14228504.html
Copyright © 2011-2022 走看看