参考:http://my.oschina.net/waterbear/blog/149852
chardet 模块能够检测文本的编码。
核心代码:
import chardet

# detect() returns a dict; its 'encoding' entry is the guessed codec name.
# `content` is the data to inspect (presumably the file's raw bytes).
chardet.detect(content)['encoding']
实现目录java文件转码:
# -*- coding: utf-8 -*-
"""Re-encode every ``.java`` file below the current directory as UTF-8.

Each converted file is first copied to ``<name>.bak``; ``process_bak_files``
can later restore or delete those backups.  The code runs unchanged on
Python 2.7 and Python 3.
"""
import codecs
import os
import shutil
import re

import chardet


def convert_encoding(filename, target_encoding):
    """Rewrite ``filename`` in ``target_encoding``, keeping a ``.bak`` copy.

    The source encoding is guessed by chardet from the file's raw bytes and
    printed together with the file name.

    Raises:
        ValueError: chardet reported no encoding for the file.
        UnicodeError, LookupError: the guessed encoding cannot decode the
            file or is unknown to Python.
    """
    # Read raw bytes: chardet inspects undecoded data, and binary mode keeps
    # the original line endings untouched.
    with open(filename, 'rb') as source:
        content = source.read()

    source_encoding = chardet.detect(content)['encoding']
    print('%s %s' % (source_encoding, filename))
    if source_encoding is None:
        raise ValueError('cannot detect the encoding of %s' % filename)
    text = content.decode(source_encoding)

    # Back up only once the file is known to be decodable, so a failed
    # conversion does not leave a stray .bak file behind.
    shutil.copyfile(filename, filename + '.bak')
    with codecs.open(filename, 'w', encoding=target_encoding) as target:
        target.write(text)


def main():
    """Convert every ``.java`` file under the working directory to UTF-8.

    A file that fails to convert is reported and skipped, so one bad file
    does not stop the walk.
    """
    for root, dirs, files in os.walk(os.getcwd()):
        for f in files:
            if not f.lower().endswith('.java'):
                continue
            filename = os.path.join(root, f)
            try:
                convert_encoding(filename, 'utf-8')
            except Exception as e:
                # Best-effort batch job: report the failure (with its
                # cause) and carry on with the remaining files.
                print('%s: %r' % (filename, e))


def process_bak_files(action='restore'):
    """Restore or delete the ``.java.bak`` backups under the working directory.

    Args:
        action: ``'restore'`` moves each backup over the file it was copied
            from; ``'clear'`` deletes the backup.  Any other value leaves
            the files untouched.
    """
    for root, dirs, files in os.walk(os.getcwd()):
        for f in files:
            if not f.lower().endswith('.java.bak'):
                continue
            source = os.path.join(root, f)
            # Strip only the trailing ".bak" so the original spelling of the
            # extension (e.g. "Foo.JAVA") is restored exactly.
            target = os.path.join(
                root, re.sub(r'\.bak$', '', f, flags=re.IGNORECASE))
            try:
                if action == 'restore':
                    shutil.move(source, target)
                elif action == 'clear':
                    os.remove(source)
            except Exception as e:
                # Report and continue with the remaining backups.
                print('%s: %r' % (source, e))


if __name__ == '__main__':
    # process_bak_files(action='clear')
    main()
另,参考:Python 的中文编码处理
http://in355hz.iteye.com/blog/1860787
def charset_from_lang(lang, fallback='utf-8'):
    """Return the codec named by a ``LANG`` value such as ``zh_CN.UTF-8``.

    The charset is the text after the first ``.``, without any ``@modifier``
    suffix.  ``fallback`` is returned when ``lang`` has no charset part
    (``''``, ``'C'``, ``'en_US'``) or names a codec Python does not know.
    """
    charset = lang.partition('.')[2].partition('@')[0]
    if not charset:
        return fallback
    try:
        codecs.lookup(charset)
    except LookupError:
        return fallback
    return charset


# Check the encoding of the standard output stream.
print(sys.stdout.encoding)

# Whatever happens, write output in the current Linux locale's charset.
if sys.stdout.encoding is None:
    enc = charset_from_lang(os.environ.get('LANG', ''))
    # Replace sys.stdout.  The codec writer emits bytes, so wrap the
    # underlying byte stream when the stream object exposes one as .buffer.
    sys.stdout = codecs.getwriter(enc)(
        getattr(sys.stdout, 'buffer', sys.stdout))

# Make sys.getdefaultencoding() return 'utf-8'.  reload() and
# setdefaultencoding() exist only on Python 2, hence the version guard.
if sys.version_info[0] == 2:
    reload(sys)  # setdefaultencoding is only callable after reload(sys)
    sys.setdefaultencoding('utf-8')  # set 'utf-8'