# -*- coding: utf-8 -*-
"""Mirror the sub-directory tree of a source folder under a destination drive.

Every directory found below the source root is re-created under DEST_ROOT
(with the source drive component dropped); every regular file is only printed.
Paths are split on backslashes, so Windows-style paths are assumed.
"""
import errno
import os

# Prefix under which the mirrored directory tree is created.
DEST_ROOT = 'E:\\'
# Folder whose tree is mirrored when the file is run as a script.
SOURCE_ROOT = 'F:\\目标2'


def fileName(path):
    """Return *path* without its first backslash-separated component.

    Each remaining component is followed by a backslash, so the result always
    ends with one (or is empty when there is nothing after the first
    component), e.g. ``'F:\\a\\b'`` -> ``'a\\b\\'``.
    """
    # Split once and drop the leading component (the drive for absolute paths).
    return ''.join(part + '\\' for part in path.split('\\')[1:])


def mkdir_p(path):
    """Create *path* and any missing parents; do nothing if it already exists.

    Raises:
        OSError: for any failure other than "*path* already exists as a
            directory".
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # Only "already there, and it is a directory" is tolerated.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise


def fileTraverse(filepath):
    """Recursively walk *filepath*, mirroring directories and printing files.

    For each sub-directory the matching directory is created under DEST_ROOT
    before descending into it. Regular files are not copied, only printed.
    """
    for entry in os.listdir(filepath):
        fi_d = os.path.join(filepath, entry)
        if os.path.isdir(fi_d):
            # Re-create the directory (minus the source drive) under DEST_ROOT.
            mkdir_p(DEST_ROOT + fileName(fi_d))
            fileTraverse(fi_d)
        else:
            # fi_d already contains filepath; joining it with filepath again
            # would duplicate the prefix for relative roots.
            print(fi_d)


if __name__ == '__main__':
    root = SOURCE_ROOT
    # The directory name contains non-ASCII characters: on Python 2 the
    # literal is a byte string and must be decoded before hitting the
    # filesystem API. On Python 3 it is already text.
    if isinstance(root, bytes):
        root = root.decode('utf-8')
    fileTraverse(root)
# -*- coding: utf-8 -*-
"""Part-of-speech tag every file under a source folder with jieba.

The source tree is walked recursively. Each sub-directory is re-created under
DEST_ROOT (with the source drive component dropped) and each file is
segmented line by line, the ``word/flag`` pairs being written to a file of the
same name in the mirrored location. Paths are split on backslashes, so
Windows-style paths are assumed.
"""
import errno
import os

# Prefix under which the tagged copies are written.
DEST_ROOT = 'E:\\'
# Folder whose files are processed when the file is run as a script.
SOURCE_ROOT = 'F:\\source'


def fileName(filePath):
    """Return *filePath* without its first backslash-separated component.

    Each remaining component is followed by a backslash, so the result always
    ends with one (or is empty when there is nothing after the first
    component), e.g. ``'F:\\a\\b'`` -> ``'a\\b\\'``.
    """
    # Split once and drop the leading component (the drive for absolute paths).
    return ''.join(part + '\\' for part in filePath.split('\\')[1:])


def mkdir_p(path):
    """Create *path* and any missing parents; do nothing if it already exists.

    Raises:
        OSError: for any failure other than "*path* already exists as a
            directory".
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # Only "already there, and it is a directory" is tolerated.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise


def splitSentence(inputFile):
    """Segment *inputFile* with jieba and write ``word/flag`` pairs to its mirror.

    The output file is ``DEST_ROOT + fileName(dirname) + basename``. Each input
    line is stripped, decoded as UTF-8 (undecodable bytes are dropped),
    tagged, and written as the concatenation of ``word/flag`` for every token,
    followed by a single space.
    """
    # Imported here so the path helpers stay usable without jieba installed.
    import jieba.posseg as pseg

    source_dir = os.path.dirname(inputFile)
    base_name = os.path.basename(inputFile)
    output_dir = DEST_ROOT + fileName(source_dir)

    # fileTraverse only creates directories for sub-folders it descends into,
    # so the mirror of the root folder itself may not exist yet.
    if not os.path.isdir(output_dir):
        mkdir_p(output_dir)

    # Binary mode on both ends: the bytes are decoded/encoded explicitly
    # below, which behaves the same on Python 2 and Python 3.
    with open(inputFile, 'rb') as fin, open(output_dir + base_name, 'wb') as fout:
        for raw_line in fin:
            # Strip surrounding whitespace, then work on text.
            line = raw_line.strip().decode('utf-8', 'ignore')
            # Dropping undecodable bytes can expose spaces at the edges.
            line = line.strip(' ')
            tagged = ''.join(
                pair.word + '/' + pair.flag for pair in pseg.cut(line)
            )
            fout.write(tagged.encode('utf-8'))
            # NOTE(review): lines are separated by a single space, and tokens
            # are not separated at all; a newline / per-token separator may
            # have been intended - confirm against the expected output format.
            fout.write(b' ')


def fileTraverse(filePath):
    """Recursively walk *filePath*, mirroring directories and tagging files.

    For each sub-directory the matching directory is created under DEST_ROOT
    before descending into it; every other entry is passed to splitSentence.
    """
    for entry in os.listdir(filePath):
        fi_d = os.path.join(filePath, entry)
        if os.path.isdir(fi_d):
            # Re-create the directory (minus the source drive) under DEST_ROOT.
            mkdir_p(DEST_ROOT + fileName(fi_d))
            fileTraverse(fi_d)
        else:
            splitSentence(fi_d)


if __name__ == '__main__':
    root = SOURCE_ROOT
    # Directory names may contain non-ASCII characters: on Python 2 the
    # literal is a byte string and is decoded before hitting the filesystem
    # API. On Python 3 it is already text.
    if isinstance(root, bytes):
        root = root.decode('utf-8')
    fileTraverse(root)