- 两种方式列表顺序不一样,内容一样
import os
# 方法1(递归完一个目录)
def get_process_files(root_dir):
    """Return the absolute paths of all regular files under ``root_dir``.

    Entries are visited in ``os.listdir`` order.  A sub-directory is expanded
    recursively at the point where it is encountered, so its files appear in
    the result before the remaining entries of its parent directory.

    Args:
        root_dir: Directory to scan; it is made absolute with
            ``os.path.abspath`` before listing.

    Returns:
        A list of file paths (empty when the tree contains no files).
    """
    cur_dir = os.path.abspath(root_dir)
    process_list = []
    for name in os.listdir(cur_dir):
        # os.path.join inserts the platform's separator instead of relying on
        # a hard-coded backslash.
        fullfile = os.path.join(cur_dir, name)
        if os.path.isfile(fullfile):
            # Plain file (not a directory): record it, nothing to recurse into.
            process_list.append(fullfile)
        elif os.path.isdir(fullfile):
            process_list.extend(get_process_files(fullfile))
    return process_list
# Demo: list every file found under C:\test with the recursive version.
print(get_process_files(r'C:\test'))
# 方法2 (一个目录下所有文件返回完,才遍历另一个目录)
def get_process_files(path):
    """Return the path of every file below ``path`` using ``os.walk``.

    All files of one directory are emitted together before the walk moves on
    to the next directory.

    Args:
        path: Top directory handed to ``os.walk``.

    Returns:
        A list of paths, each built by joining the directory reported by
        ``os.walk`` with the file name (empty when no files are found).
    """
    process_list = []
    for root, _dirs, files in os.walk(path):
        # To restrict the result to one file type, filter ``files`` here
        # (for example keep only names containing '.txt').
        process_list.extend(os.path.join(root, name) for name in files)
    return process_list
# Demo: list every file found under C:\test with the os.walk version.
print(get_process_files(r'C:\test'))
- 查找文件内容工具(txt,doc,pdf)
import os
import re
import sys
from docx import Document
import PyPDF2
# File extensions (leading dot included) that are eligible for searching.
FileTypeList = [
    ".xml",
    ".tpl",
    ".txt",
    ".doc",
    ".docx",
    ".pdf",
]
# Substrings a file name must contain to be searched; an empty list means
# every file name is accepted.  Example: FileNameFilterList = ['1.txt']
FileNameFilterList = []
# Paths of the files that passed the filters and are waiting to be searched.
fileNameList = []
# 文件夹存在?
def isFolderExist(dir):
    """Return True when ``dir`` names an existing directory.

    ``os.path.isdir`` is used rather than ``os.path.exists`` so that a path
    pointing at a regular file is rejected here instead of failing later when
    the caller tries to list it.

    Args:
        dir: Path to check (the name shadows the builtin but is kept so
            keyword callers continue to work).

    Returns:
        True for an existing directory, False otherwise.
    """
    return os.path.isdir(dir)
# 指定文件名搜索
def isFileNameContainStr(fileNameFilterStrList, filename):
    """Tell whether ``filename`` passes the file-name filter.

    Args:
        fileNameFilterStrList: Substrings to look for; an empty list means
            "no filter" and accepts every name.
        filename: File name to test.

    Returns:
        True when the filter is empty or at least one substring occurs in
        ``filename``; False otherwise.
    """
    if not fileNameFilterStrList:
        # No filter given: every file is searched.
        return True
    return any(filterStr in filename for filterStr in fileNameFilterStrList)
# 指定文件类型
def isFileNameContainType(typeList, filename):
    """Tell whether the extension of ``filename`` is one of ``typeList``.

    The comparison ignores case, so ``REPORT.TXT`` matches ``".txt"``.

    Args:
        typeList: Accepted extensions including the leading dot; an empty
            list means "no filter" and accepts every file.
        filename: File name whose extension (per ``os.path.splitext``) is
            checked.

    Returns:
        True when the filter is empty or the extension matches one entry;
        False otherwise.
    """
    if not typeList:
        # No type restriction: every file is searched.
        return True
    extension = os.path.splitext(filename)[1].lower()
    return any(extension == wanted.lower() for wanted in typeList)
# 返回文件类型
def file_type(filename):
    """Return the extension of ``filename`` in upper case.

    Args:
        filename: File name or path.

    Returns:
        The extension as split off by ``os.path.splitext`` (leading dot
        included), upper-cased; an empty string when there is no extension.
    """
    _, extension = os.path.splitext(filename)
    return extension.upper()
# 待搜索文件列表,满足过滤条件后
def listFile(path, fileNameFilterList, typeList):
    """Recursively collect the files under ``path`` that pass both filters.

    Matching file paths are appended to the module-level ``fileNameList``.

    Args:
        path: Directory to scan.
        fileNameFilterList: Name substrings handed to
            ``isFileNameContainStr``.
        typeList: Extensions handed to ``isFileNameContainType``.

    Returns:
        False when ``path`` fails the ``isFolderExist`` check, True once the
        directory has been scanned.  The result of recursive calls is not
        inspected.
    """
    if not isFolderExist(path):
        return False
    for filename in os.listdir(path):
        # os.path.join inserts the platform's separator instead of relying on
        # a hard-coded backslash.
        fullpath = os.path.join(path, filename)
        if os.path.isdir(fullpath):
            # Recurse with the caller's filter (not the module-level default)
            # so sub-directories are filtered the same way as the top level.
            listFile(fullpath, fileNameFilterList, typeList)
        elif os.path.isfile(fullpath):
            if (isFileNameContainStr(fileNameFilterList, filename)
                    and isFileNameContainType(typeList, filename)):
                fileNameList.append(fullpath)
    return True
# FindStrList = ['test2','t2'] # 文件内容匹配列表
# # 搜索txt
# def findFromFile(filename, strlist):
# file = open(filename)
# count = 0
# for line in file:
# #if '$' in line:
# # continue
# count = count+1
# isContained = True
# for str in strlist:
# if str not in line:
# isContained = False
# break
# if isContained == True:
# print (f'{filename}; line:{count},{line}')
# file.close()
# # 搜索过滤后 fileNameList 列表
# def findFromDir(strlist):
# for name in fileNameList:
# findFromFile(name, strlist)
# 搜索txt ,区分大小写
def find_txt(filename, str):
    """Print every line of a text file that contains ``str`` (case-sensitive).

    Each hit is printed as ``<filename>; line:<n> (<stripped line>)`` where
    ``n`` is the 1-based line number.

    Args:
        filename: Path of the text file to scan.
        str: Substring to look for (the name shadows the builtin but is kept
            so keyword callers continue to work).
    """
    # The context manager closes the file even if printing or decoding fails;
    # errors="replace" keeps the scan going when the file holds bytes that
    # are invalid in the default text encoding.
    with open(filename, errors="replace") as file:
        for count, line in enumerate(file, start=1):
            if str in line:
                line = line.strip()
                print(f'{filename}; line:{count} ({line})')
# 搜索word ,区分大小写
def find_doc(filename, str):
    """Print every paragraph of a Word document that contains ``str``.

    The match is case-sensitive.  Each hit is printed as
    ``<filename>; line:<n> (<paragraph text>)`` where ``n`` is the 1-based
    position of the paragraph in ``document.paragraphs``.

    Args:
        filename: Path of the document, opened with ``Document``.
        str: Substring to look for (the name shadows the builtin but is kept
            so keyword callers continue to work).
    """
    document = Document(filename)
    for count, paragraph in enumerate(document.paragraphs, start=1):
        line = paragraph.text
        if str in line:
            print(f'{filename}; line:{count} ({line})')
# 搜索pdf
def find_pdf(filename, str):
    """Print every extracted text line of a PDF that contains ``str``.

    Each hit is printed as ``<filename>; page:<p>,line:<i> (<line>)`` where
    ``p`` is the 1-based page number and ``i`` is the 0-based index of the
    line within the page text split on newlines.

    Args:
        filename: Path of the PDF file.
        str: Substring to look for (the name shadows the builtin but is kept
            so keyword callers continue to work).
    """
    # NOTE(review): PdfFileReader / numPages / getPage / extractText look like
    # the pre-3.0 PyPDF2 names; newer releases are believed to use
    # PdfReader / pages / extract_text -- confirm the installed version.
    # The file stays open for the whole loop because pages are read from it.
    with open(filename, 'rb') as file:
        fileReader = PyPDF2.PdfFileReader(file)
        for i in range(fileReader.numPages):
            # Extract the page text once and reuse it for both checks.
            text = fileReader.getPage(i).extractText()
            if str not in text:
                continue
            # enumerate gives each line its own index, so repeated identical
            # lines are not all reported at the first occurrence.
            for index, line in enumerate(text.split('\n')):
                if str in line:
                    print(f'{filename}; page:{i+1},line:{index} ({line})')
# 搜索过滤后 fileNameList 列表
def findFromDir(str):
    """Search every file in the module-level ``fileNameList`` for ``str``.

    The finder is chosen from the upper-cased extension returned by
    ``file_type``: text-like files go to ``find_txt``, Word documents to
    ``find_doc`` and PDFs to ``find_pdf``.  Other extensions are skipped.

    Args:
        str: Substring to look for (the name shadows the builtin but is kept
            so keyword callers continue to work).
    """
    for name in fileNameList:
        kind = file_type(name)
        # '.TPL' is listed in FileTypeList, so such files are collected; they
        # are treated as plain text here so they are actually searched.
        if kind in ('.TXT', '.XML', '.TPL'):
            find_txt(name, str)
        # NOTE(review): find_doc uses python-docx, which presumably opens
        # only .docx packages; legacy .doc files may raise -- verify.
        elif kind in ('.DOC', '.DOCX'):
            find_doc(name, str)
        elif kind == '.PDF':
            find_pdf(name, str)
# 交互输入
def askInput():
    """Prompt for the folder to scan and the text to search for.

    The folder is asked first, then the search text; a blank line is printed
    afterwards.

    Returns:
        A ``(text, path)`` tuple -- note the search text comes first.
    """
    path = input('What is folder path ? ')
    text = input('Searching for what? ')
    print()
    return text, path
if __name__ == "__main__":
    # Interactive entry point.  Example answers: folder "C:\test", text 'test'.
    FindStrList, DIR = askInput()
    # Collect the candidate files first; listFile returns False when the
    # folder check fails.
    if not listFile(DIR, FileNameFilterList, FileTypeList):
        print ("FILE PATH ERROR")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)
    findFromDir(FindStrList)
    print ("FIND END")