zoukankan      html  css  js  c++  java
  • 词频统计

    import os
    import codecs
    import chardet
    
    
    # Shared state: the reader functions append single characters to
    # word_lst, and the counting step fills word_dict (character -> count).
    word_lst, word_dict = [], {}

    # Path of the file to analyse and path of the report to write.
    infile = input("请输入统计文件名:")
    outfile = input("请输入输出结果文件名:")

    # Characters to leave out of the count, and how many of the most frequent
    # entries to report. n is kept as the raw string typed by the user.
    exclude_str = input("请输入过滤字符:")
    n = input("统计前多少位:")
    
    # infile: path of the UTF-8 text file; it is re-encoded in place.
    def convertUTF8ToANSI(infile):
        """Re-encode the UTF-8 file at *infile* to the platform's default encoding.

        The file is overwritten in place. On Windows the target is the ANSI
        code page ('mbcs'); elsewhere it is the locale's preferred encoding,
        which is what a plain ``open(path, 'r')`` decodes with by default.

        Args:
            infile: Path of the file to convert.

        Raises:
            UnicodeDecodeError: The file is not valid UTF-8.
            UnicodeEncodeError: Some character has no representation in the
                target encoding.
            OSError: The file cannot be read or written.
        """
        import locale
        import sys

        # Read raw bytes (no newline translation) and decode explicitly.
        # 'utf-8-sig' accepts files with or without a BOM and drops the BOM,
        # so it never reaches the target encoder as a stray U+FEFF.
        with open(infile, 'rb') as src:
            text = src.read().decode('utf-8-sig')

        # The 'mbcs' codec only exists on Windows; asking for it anywhere else
        # raises LookupError, so use the locale encoding there instead.
        if sys.platform == 'win32':
            target = 'mbcs'
        else:
            target = locale.getpreferredencoding(False)

        # Encode before reopening the file for writing: if the conversion
        # fails, the original file is left untouched.
        payload = text.encode(target)

        # Binary mode so the encoded bytes are stored exactly as produced.
        with open(infile, 'wb') as dst:
            dst.write(payload)
    
    # Read a .docx document.
    def ReadWord():
        """Append every character of the .docx file at ``infile`` to ``word_lst``.

        Uses the module-level ``infile`` path and extends the module-level
        ``word_lst`` in place, one entry per character of each paragraph's
        text. The accumulated list is then printed.

        Requires the third-party ``docx`` module (python-docx).
        """
        # Imported here because nothing else brings `docx` into scope; without
        # this the first call fails with NameError.
        import docx

        document = docx.Document(infile)  # the argument is the file path
        for para in document.paragraphs:
            # para.text is the paragraph's content; extending a list with a
            # string adds its characters one at a time.
            word_lst.extend(para.text)
        print(word_lst)  # echo everything collected so far (nothing is returned)
    
    def ReadTxt():
        """Append every character of the text file at ``infile`` to ``word_lst``.

        The file is first passed to ``convertUTF8ToANSI``. If that conversion
        fails, a message is printed and the file is read as it is. The file is
        then read with the default text encoding, and each character (line
        breaks included) is added to the module-level ``word_lst``.
        """
        try:
            convertUTF8ToANSI(infile)
        except (UnicodeError, LookupError, OSError):
            # Best effort: not UTF-8, not representable in the target
            # encoding, codec unavailable, or the file could not be rewritten.
            # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
            # propagate.
            print("编码格式错误")

        # Open the file here, after the in-place conversion, instead of
        # relying on a handle that some other part of the script opened.
        with open(infile, "r") as handle:
            for line in handle:
                # Add each character of the line to the list.
                word_lst.extend(line)
               
    
        
    # Main script: read the input, count characters, then print and save the
    # frequency table. `fileIn` keeps its original name because code outside
    # this block may refer to it.
    with open(infile, "r") as fileIn, open(outfile, 'w') as fileOut:
        # Choose the reader from the text after the last dot, ignoring case so
        # that "A.DOCX" and "a.docx" are treated alike. Other extensions are
        # not read, which leaves the table empty.
        extension = infile.rsplit('.', 1)[-1].lower()
        if extension == "docx":
            ReadWord()
        elif extension == "txt":
            ReadTxt()

        # Drop whitespace-only entries, then count occurrences of every
        # character that is not listed in the exclusion string.
        word_lst = [s for s in (ch.strip() for ch in word_lst) if s]
        for char in word_lst:
            if char not in exclude_str:
                word_dict[char] = word_dict.get(char, 0) + 1

        # Sort by count (x[1]), highest first; x[0] would sort by character.
        lstWords = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)

        # An empty answer means "report everything"; otherwise keep the first
        # n entries. A non-numeric answer raises ValueError.
        if n == '':
            shown = lstWords
        else:
            n = int(n)
            shown = lstWords[:n]

        # Print the ranked table and write "character, count" lines to the
        # output file. Each record is a single-line literal ending in \n.
        print('字符\t字频')
        print('=============')
        for rank, (char, count) in enumerate(shown, start=1):
            print('%d\t%s\t%d' % (rank, char, count))
            fileOut.write('%s, %d\n' % (char, count))
  • 相关阅读:
    Tuesday / Wednesday = Increased Response
    脚本语言
    py2exe
    脚本语言
    访问者模式
    C调用lua脚本的效率测试
    Python编码规范
    py2exe
    Python编码规范
    访问者模式
  • 原文地址:https://www.cnblogs.com/jestin/p/12911418.html
Copyright © 2011-2022 走看看