参考资料：Python 中的 encode() 和 decode() 函数
查看文件编码 + 查看文件扩展名
import os
import sys
import codecs
import chardet


def GetFileEncodingFormat(file):
    """Return the encoding name chardet detects for *file*.

    Args:
        file: Path of the file to inspect.

    Returns:
        The value of the ``"encoding"`` entry in chardet's result; this is
        ``None`` when chardet cannot determine an encoding.
    """
    # Detection must see the bytes exactly as stored on disk. Decoding them
    # in text mode first (and re-encoding) would make chardet analyse the
    # re-encoded text instead of the file's real contents.
    with open(file, 'rb') as fileHandle:
        rawBytes = fileHandle.read()
    return chardet.detect(rawBytes)["encoding"]


def GetFileExtension(file):
    """Return the extension of *file* including the leading dot ('' if none)."""
    return os.path.splitext(os.path.basename(file))[1]


def _SameCodec(first, second):
    """Return True when both encoding names refer to the same Python codec."""
    try:
        return codecs.lookup(first).name == codecs.lookup(second).name
    except LookupError:
        # Unknown to Python: fall back to a case-insensitive name comparison.
        return first.lower() == second.lower()


def CovertFileCodeFormat(file, out_encode):
    """Re-encode a ``.c`` / ``.h`` file in place to *out_encode*.

    Files with any other extension, files whose detected encoding already
    matches *out_encode*, and files whose encoding cannot be detected are
    left untouched. Characters that cannot be decoded or encoded are dropped
    (``errors='ignore'``). I/O errors are printed and swallowed.

    Args:
        file: Path of the file to convert.
        out_encode: Name of the target encoding, e.g. ``'utf-8'``.
    """
    # Check the extension first so files we never convert are not read.
    if GetFileExtension(file) not in ('.c', '.h'):
        return
    try:
        encoding = GetFileEncodingFormat(file)
        # Without a detected encoding the content cannot be decoded; skip the
        # file rather than opening it for writing, which would truncate it.
        if encoding is None or _SameCodec(encoding, out_encode):
            return
        with codecs.open(file, 'r', encoding, errors='ignore') as fileHandle:
            fileContext = fileHandle.read()
        with codecs.open(file, 'w', out_encode, errors='ignore') as fileHandle:
            fileHandle.write(fileContext)
        print("convert:" + file + " sucess")
    except IOError as err:
        print("I/O error: {0}".format(err))


def ProcessDir(path, out_encode=None):
    """Convert every file below *path* (recursively) to *out_encode*.

    Args:
        path: Directory to walk.
        out_encode: Target encoding; defaults to ``sys.argv[2]`` when omitted.
    """
    if out_encode is None:
        out_encode = sys.argv[2]
    for root, _dirs, files in os.walk(path):
        for name in files:
            CovertFileCodeFormat(os.path.join(root, name), out_encode)


def main():
    """Convert the file or directory named by ``sys.argv[1]`` to ``sys.argv[2]``."""
    path = sys.argv[1]
    out_encode = sys.argv[2]
    if os.path.isfile(path):
        CovertFileCodeFormat(path, out_encode)
    elif os.path.isdir(path):
        ProcessDir(path, out_encode)
查看文件编码+扩展名
# Show the detected encoding and the extension of one sample file.
# The path separators were missing from this literal; restored here.
filepath = r'C:\Users\Administrator\Desktop\njhcfx_1205\zjtpymplan_1204.csv'
print(GetFileEncodingFormat(filepath))
print(GetFileExtension(filepath))


def ProcessDir(path):
    """Print the detected encoding of every file below *path* (recursively).

    This variant only reports encodings; it does not convert anything.
    """
    for root, _dirs, files in os.walk(path):
        for name in files:
            print(GetFileEncodingFormat(os.path.join(root, name)))


def main():
    """Print the detected encoding(s) for the path given in ``sys.argv[1]``.

    A single file prints one encoding; a directory prints one per file.
    Any other kind of path is ignored.
    """
    path = sys.argv[1]
    if os.path.isfile(path):
        # Report single files as well, matching the directory branch.
        print(GetFileEncodingFormat(path))
    elif os.path.isdir(path):
        ProcessDir(path)
文件编码转换
def GB18030ToUTF8(path, new_path, chunksize):
    """Re-encode every CSV file below *path* from GB18030 to UTF-8 with BOM.

    Each file found by walking *path* recursively is parsed as a GB18030 CSV
    in chunks of *chunksize* rows, with all columns read as strings, and
    written to *new_path* under the same file name using ``utf_8_sig``.

    Args:
        path: Directory containing the GB18030-encoded CSV files.
        new_path: Directory that receives the converted files; created when
            the first file is written.
        chunksize: Number of rows to read and write per chunk.
    """
    # Imported locally: the file's visible top-level imports do not bring
    # pandas into scope.
    import pandas as pd

    for root, _dirs, files in os.walk(path):
        for name in files:
            # To convert only selected files, filter on `name` here.
            source = os.path.join(root, name)
            # NOTE: output is flattened into new_path, so files with the same
            # name in different sub-directories overwrite each other.
            target = os.path.join(new_path, name)
            chunks = pd.read_csv(source, chunksize=chunksize,
                                 encoding='gb18030', engine='python',
                                 dtype=str, na_values='')
            try:
                os.makedirs(new_path, exist_ok=True)
                for index, chunk in enumerate(chunks):
                    first = index == 0
                    # The first chunk creates the file and writes the header;
                    # later chunks are appended without repeating it.
                    chunk.to_csv(target, encoding='utf_8_sig', index=False,
                                 header=first, mode='w' if first else 'a')
            finally:
                chunks.close()


# Demo run. The path separators were missing from this literal; restored here.
chunksize = 1000000
path = r'C:\Users\Administrator\Desktop\njhcfx_1205'
GB18030ToUTF8(path, path + '1', chunksize)
# To check that a converted file is readable:
# filepath = r'C:\Users\Administrator\Desktop\njhcfx_12051\zjtpjl_1204.csv'
# aa = pd.read_csv(filepath, encoding='utf_8_sig', dtype=str)
#content = open(filepath).read().decode("gb18030")
#open("C:\Users\Administrator\Desktop\njhcfx_1205\zjtpymplan_1205.txt","w").write(content.encode("utf8"))