zoukankan      html  css  js  c++  java
  • 统计词频

    
    
    import re
    from collections import Counter
    
    # Sample text whose word frequencies are counted below.
    string = """   Lorem ipsum dolor sit amet, consectetur
        adipiscing elit. Nunc ut elit id mi ultricies
        adipiscing. Nulla facilisi. Praesent pulvinar,
        sapien vel feugiat vestibulum, nulla dui pretium orci,
        non ultricies elit lacus quis ante. Lorem ipsum dolor
        sit amet, consectetur adipiscing elit. Aliquam
        pretium ullamcorper urna quis iaculis. Etiam ac massa
        sed turpis tempor luctus. Curabitur sed nibh eu elit
        mollis congue. Praesent ipsum diam, consectetur vitae
        ornare a, aliquam a nunc. In id magna pellentesque
        tellus posuere adipiscing. Sed non mi metus, at lacinia
        augue. Sed magna nisi, ornare in mollis in, mollis
        sed nunc. Etiam at justo in leo congue mollis.
        Nullam in neque eget metus hendrerit scelerisque
        eu non enim. Ut malesuada lacus eu nulla bibendum
        id euismod urna sodales.  """

    # Extract every run of word characters. The pattern must be r'\w+':
    # without the backslash it would only match runs of the literal
    # letter "w", and this text contains none, so nothing would be found.
    words = re.findall(r'\w+', string)

    # Normalise case so that e.g. "Sed" and "sed" are counted together.
    lower_words = [word.lower() for word in words]

    # Map each distinct lower-cased word to its number of occurrences.
    word_counts = Counter(lower_words)

    # Parenthesised form works as a statement on Python 2 and as a
    # function call on Python 3.
    print(word_counts)
    
    # Counter({'elit': 5, 'sed': 5, 'in': 5, 'adipiscing': 4, 'mollis': 4, 'eu': 3, 
    # 'id': 3, 'nunc': 3, 'consectetur': 3, 'non': 3, 'ipsum': 3, 'nulla': 3, 'pretium':
    # 2, 'lacus': 2, 'ornare': 2, 'at': 2, 'praesent': 2, 'quis': 2, 'sit': 2, 'congue': 2, 'amet': 2, 
    # 'etiam': 2, 'urna': 2, 'a': 2, 'magna': 2, 'lorem': 2, 'aliquam': 2, 'ut': 2, 'ultricies': 2, 'mi': 2, 
    # 'dolor': 2, 'metus': 2, 'ac': 1, 'bibendum': 1, 'posuere': 1, 'enim': 1, 'ante': 1, 'sodales': 1, 'tellus': 1,
    # 'vitae': 1, 'dui': 1, 'diam': 1, 'pellentesque': 1, 'massa': 1, 'vel': 1, 'nullam': 1, 'feugiat': 1, 'luctus': 1, 
    # 'pulvinar': 1, 'iaculis': 1, 'hendrerit': 1, 'orci': 1, 'turpis': 1, 'nibh': 1, 'scelerisque': 1, 'ullamcorper': 1,
    # 'eget': 1, 'neque': 1, 'euismod': 1, 'curabitur': 1, 'leo': 1, 'sapien': 1, 'facilisi': 1, 'vestibulum': 1, 'nisi': 1, 
    # 'justo': 1, 'augue': 1, 'tempor': 1, 'lacinia': 1, 'malesuada': 1})
    
    
    
    
    
  • 相关阅读:
    Core Data入门
    web前端开发与iOS终端开发的异同
    Blocks编程
    ARC下循环引用的问题
    小项目
    error: /Users/**/Documents/workspace/***/clean_right_normal.png: No such file or directory
    修复NSTextAlignmentCenter引起的警告
    Implicit conversion loses integer precision: 'long long' to 'NSUInteger' (aka 'unsigned int')
    手势相关函数
    implicit declaration of function setxattr is invalid in c99
  • 原文地址:https://www.cnblogs.com/tingshuo123/p/6917817.html
Copyright © 2011-2022 走看看