zoukankan      html  css  js  c++  java
  • Python 统计文本中单词的个数

    1. 读文件，通过正则匹配

     def statisticWord(input_path=r'D:\test\test.txt', output_path=r'D:\test\output.txt'):
         """Count HTML character references in a text file and write a report.

         Every token matching a numeric reference (``&#123;``) or a named
         reference (``&amp;``) is counted.  The report is written with one
         ``token:count`` line per distinct token, most frequent first; tokens
         with equal counts keep the order in which they first appeared.

         Args:
             input_path: UTF-8 text file to scan.
             output_path: File to (over)write with the report, as UTF-8.

         Raises:
             OSError: If either file cannot be opened.
         """
         # Local imports keep this snippet runnable on its own.
         import re
         from collections import Counter

         # Raw string so the regex escapes survive; compiled once, not per line.
         entity_pattern = re.compile(r'&#\d+;|&\w+;')

         counts = Counter()
         with open(input_path, encoding='utf-8') as a_file:
             for line in a_file:
                 counts.update(entity_pattern.findall(line))

         # The input is closed before the output is opened.
         with open(output_path, encoding='utf-8', mode='w') as b_file:
             # most_common() sorts by count descending and is stable for ties.
             for word, count in counts.most_common():
                 b_file.write("%-15s:%15s\n" % (word, count))

    2. 通过命令行参数

    def statisticWord2(argv=None):
        """Print how often each word occurs in the files named on the command line.

        Each line is split on whitespace, and leading/trailing whitespace,
        punctuation and digits are stripped from every token.  Tokens shorter
        than two characters after stripping are ignored.  Results are printed
        in sorted word order as ``'<word>' occurs <n> times``.

        Args:
            argv: Argument list in ``sys.argv`` form (program name first).
                Defaults to ``sys.argv`` when omitted.

        Raises:
            SystemExit: After printing the usage line, when no filename is
                given or the first argument is ``-h`` / ``--help``.
            OSError: If a named file cannot be opened.
        """
        # Local imports keep this snippet runnable on its own.
        import string
        import sys
        from collections import Counter

        if argv is None:
            argv = sys.argv

        # Guard clause: nothing to process, or help explicitly requested.
        if len(argv) < 2 or argv[1] in {"-h", "--help"}:
            print("usage: filename_1 filename_2 ... filename_n")
            sys.exit()

        # string.punctuation already contains both quote characters, so no
        # extra quote suffix is needed here.
        strip_chars = string.whitespace + string.punctuation + string.digits

        counts = Counter()
        for filename in argv[1:]:
            # Context manager closes each file promptly.
            with open(filename) as source:
                for line in source:
                    for word in line.split():
                        # Strips any run of strip_chars from both ends only.
                        word = word.strip(strip_chars)
                        if len(word) >= 2:
                            counts[word] += 1

        for word in sorted(counts):
            print("'{0}' occurs {1} times".format(word, counts[word]))
  • 相关阅读:
    处理图片
    打死都不放手
    美白
    词云
    太阳花绘制
    测评软件Lemon教程
    --解释?说明:--
    T1 数字配对 题解
    P1100 高低位交换
    P1143 进制转换
  • 原文地址:https://www.cnblogs.com/zyf7630/p/3209976.html
Copyright © 2011-2022 走看看