zoukankan      html  css  js  c++  java
  • 词频统计

    import os
    import codecs
    import chardet
    
    
    # Accumulators shared by the reader functions and the counting code below:
    # the readers append every character they see to word_lst, and word_dict
    # maps a character to the number of times it was counted.
    word_lst = []
    word_dict = {}
    
    # Path of the file to analyse and path of the file that receives the results.
    infile=input("请输入统计文件名:")
    outfile=input("请输入输出结果文件名:")
    
    
    # exclude_str: characters to leave out of the count.
    # n: how many of the most frequent entries to report; kept as the raw input
    # string here (an empty answer is treated later as "report everything").
    exclude_str = input("请输入过滤字符:")
    n = input("统计前多少位:")
    
    def convertUTF8ToANSI(infile):
        """Re-encode a UTF-8 text file in place using the system "ANSI" encoding.

        The file at ``infile`` is read and decoded as UTF-8, then overwritten
        with the same text encoded in the platform's legacy encoding: the
        Windows ``mbcs`` codec when it is available, otherwise the locale's
        preferred encoding.

        Args:
            infile: Path of the file to convert. The file itself is
                overwritten; no separate output file is produced.

        Raises:
            OSError: If the file cannot be read or written.
            UnicodeDecodeError: If the file content is not valid UTF-8.
            UnicodeEncodeError: If the text cannot be represented in the
                target encoding.
        """
        # Read raw bytes and decode explicitly so line endings pass through
        # untouched, and so the handle is closed even if reading fails.
        with open(infile, 'rb') as f:
            utfstr = f.read().decode('utf8')

        # 'mbcs' is only registered on Windows; elsewhere use the locale's
        # preferred encoding instead of failing with LookupError.
        try:
            codecs.lookup('mbcs')
            ansi_encoding = 'mbcs'
        except LookupError:
            import locale
            ansi_encoding = locale.getpreferredencoding(False)

        # Encode BEFORE reopening the file for writing: if encoding fails the
        # original file must not have been truncated yet.
        outansestr = utfstr.encode(ansi_encoding)

        # Write the re-encoded bytes back in binary mode.
        with open(infile, 'wb') as f:
            f.write(outansestr)
    
    def ReadWord():
        """Append every character of the .docx document at ``infile`` to ``word_lst``.

        Reads the module-level ``infile`` path, walks the document's
        paragraphs in order and extends the module-level ``word_lst`` with
        each character of each paragraph's text. The accumulated list is then
        printed; nothing is returned.
        """
        # Imported here because the file never imports docx at module level,
        # which made the original call fail with NameError.
        import docx

        fword = docx.Document(infile)  # the argument is the document path
        for para in fword.paragraphs:
            # para.text is the paragraph's content; extending with a string
            # adds it one character at a time.
            word_lst.extend(para.text)
        print(word_lst)
    
    def ReadTxt():
        """Append every character of the text input to ``word_lst``.

        First tries to re-encode the file at the module-level ``infile`` path
        with ``convertUTF8ToANSI``. If that fails, a message is printed and
        reading continues with the file as it is. The characters are then read
        from the already-open module-level ``fileIn`` handle.
        """
        try:
            convertUTF8ToANSI(infile)
        except (OSError, UnicodeError, LookupError):
            # Only the failures the conversion can legitimately produce are
            # tolerated (I/O error, undecodable/unencodable text, unknown
            # codec); anything else, including Ctrl-C, propagates.
            print("编码格式错误")

        # Add every character of every line to the shared list.
        for line in fileIn:
            word_lst.extend(line)
               
    
        
    # Main script: collect the characters of the input file, count them, and
    # report the most frequent ones on stdout and in the output file.
    with open(infile, "r") as fileIn, open(outfile, 'w') as fileOut:
        # Pick the reader from the file extension (text after the last dot).
        extension = infile.split('.')[-1]
        if extension == "docx":
            ReadWord()
        elif extension == "txt":
            ReadTxt()

        # Drop whitespace-only characters, then tally the rest, skipping any
        # character that occurs in the user-supplied filter string.
        word_lst = [x.strip() for x in word_lst if x.strip() != '']
        for char in word_lst:
            if char not in exclude_str:
                word_dict[char] = word_dict.get(char, 0) + 1

        # Sort by frequency, highest first (x[1] is the count; x[0] would
        # sort by the character itself).
        lstWords = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)

        # A blank answer means "report everything"; otherwise keep the first
        # n entries. A non-numeric answer still raises ValueError.
        requested = n.strip()
        limit = int(requested) if requested else None

        # Print a ranked table and write "char, count" lines to the output.
        print('字符\t字频')
        print('=============')
        for rank, entry in enumerate(lstWords[:limit], 1):
            print(str(rank) + '\t' + '%s\t%d' % entry)
            fileOut.write('%s, %d\n' % entry)
  • 相关阅读:
    hdu 4027 Can you answer these queries? 线段树
    ZOJ1610 Count the Colors 线段树
    poj 2528 Mayor's posters 离散化 线段树
    hdu 1599 find the mincost route floyd求最小环
    POJ 2686 Traveling by Stagecoach 状压DP
    POJ 1990 MooFest 树状数组
    POJ 2955 Brackets 区间DP
    lightoj 1422 Halloween Costumes 区间DP
    模板 有源汇上下界最小流 loj117
    模板 有源汇上下界最大流 loj116
  • 原文地址:https://www.cnblogs.com/jestin/p/12911418.html
Copyright © 2011-2022 走看看