zoukankan      html  css  js  c++  java
  • 【python】入门学习(十)

    #入门学习系列的内容均是在学习《Python编程入门(第3版)》时的学习笔记

    统计一个文本文档的信息,并输出出现频率最高的10个单词

    #text.py
    # Characters that survive normalization: the lowercase letters a-z, the
    # space, the hyphen and the apostrophe.
    # Every element needs its own comma: two adjacent string literals with no
    # comma between them are concatenated by Python ('p' 'q' would silently
    # become the single element 'pq', dropping both letters from the set).
    keep = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
            'q','r','s','t','u','v','w','x','y','z',' ','-',"'"}
    # Normalize a piece of text
    def normalize(s):
        """Return s lowercased, with every character not in keep removed."""
        lowered = s.lower()
        return ''.join(ch for ch in lowered if ch in keep)
    
    # Gather basic information about a text file
    def file_stats(fname):
        """Print statistics for the given file.

        Reads the whole file named fname and prints its number of characters,
        its number of newline characters, and the number of
        whitespace-separated words left after normalize() is applied.
        """
        # The context manager closes the file even if read() raises.
        with open(fname, 'r') as f:
            s = f.read()
        num_chars = len(s)
        # Counts newline characters, so a last line without a trailing
        # newline is not included.
        num_lines = s.count('\n')
        num_words = len(normalize(s).split())
        print("The file %s has:" % fname)
        print("  %s characters" % num_chars)
        print("  %s lines" % num_lines)
        print("  %s words" % num_words)
    
    # Convert a string into a word-frequency dictionary
    def make_freq_dict(s):
        """Map each word of s to the number of times it occurs.

        s is first passed through normalize() and then split on whitespace.
        """
        counts = {}
        for word in normalize(s).split():
            counts[word] = counts.get(word, 0) + 1
        return counts
    
    # Gather basic information about a text file, plus its most frequent words
    def file_stats2(fname):
        """Print statistics for the given file and its 10 most frequent words.

        Prints the number of characters, newline characters and words in the
        file named fname, followed by a ranked list of at most 10
        (count, word) entries taken from make_freq_dict().
        """
        # The context manager closes the file even if read() raises.
        with open(fname, 'r') as f:
            s = f.read()
        num_chars = len(s)
        # Counts newline characters, so a last line without a trailing
        # newline is not included.
        num_lines = s.count('\n')
        d = make_freq_dict(s)
        num_words = sum(d.values())
        # (count, word) tuples order by count first and by word on ties;
        # reverse=True puts the highest count first.
        lst = sorted(((d[w], w) for w in d), reverse=True)
        print("The file %s has:" % fname)
        print("  %s characters" % num_chars)
        print("  %s lines" % num_lines)
        print("  %s words" % num_words)
        print("\nThe top 10 most frequent words are:")
        # Slice to 10 so the listing matches its "top 10" header.
        for i, (count, word) in enumerate(lst[:10], 1):
            print('%2s. %4s %s' % (i, count, word))
    >>> file_stats2('a.txt')
    The file a.txt has:
      12927 characters
      297 lines
      1645 words
    
    The top 10 most frequent words are:
     1.   62 to
     2.   62 the
     3.   47 is
     4.   42 a
     5.   41 of
     6.   40 it
     7.   36 that
     8.   35 and
     9.   32 as
    10.   24 so

    进一步完善的代码:

    #text.py
    # Characters that survive normalization: the lowercase letters a-z, the
    # space, the hyphen and the apostrophe.
    # Built from a single string so that no element can be lost to a missing
    # comma (adjacent literals such as 'p' 'q' are concatenated into 'pq').
    keep = set("abcdefghijklmnopqrstuvwxyz -'")
    # Normalize a piece of text
    def normalize(s):
        """Lowercase s and drop every character that is not in keep."""
        kept_chars = [ch for ch in s.lower() if ch in keep]
        return ''.join(kept_chars)
    
    # Gather basic information about a text file
    def file_stats(fname):
        """Print statistics for the given file.

        Reads the whole file named fname and prints its number of characters,
        its number of newline characters, and the number of
        whitespace-separated words left after normalize() is applied.
        """
        # The context manager closes the file even if read() raises.
        with open(fname, 'r') as f:
            s = f.read()
        num_chars = len(s)
        # Counts newline characters, so a last line without a trailing
        # newline is not included.
        num_lines = s.count('\n')
        num_words = len(normalize(s).split())
        print("The file %s has:" % fname)
        print("  %s characters" % num_chars)
        print("  %s lines" % num_lines)
        print("  %s words" % num_words)
    
    # Convert a string into a word-frequency dictionary
    def make_freq_dict(s):
        """Count how often each word occurs in s.

        The words are the whitespace-separated pieces of normalize(s); the
        result maps each distinct word to its number of occurrences.
        """
        freq = {}
        for token in normalize(s).split():
            freq[token] = freq.get(token, 0) + 1
        return freq
    
    # Gather basic information about a text file, plus its most frequent words
    def file_stats2(fname):
        """Print statistics for the given file and its 10 most frequent words.

        Prints, for the file named fname: the number of characters, newline
        characters and words; how many words occur exactly once; how many
        distinct words there are; the average length of the distinct words;
        and a ranked list of at most 10 (count, word) entries taken from
        make_freq_dict().
        """
        # The context manager closes the file even if read() raises.
        with open(fname, 'r') as f:
            s = f.read()
        num_chars = len(s)
        # Counts newline characters, so a last line without a trailing
        # newline is not included.
        num_lines = s.count('\n')
        d = make_freq_dict(s)
        # One dictionary key per distinct word.
        num_different_words = len(d)
        num_words = sum(d.values())
        # Average over distinct words; a file with no words reports 0.0
        # instead of raising ZeroDivisionError.
        if num_different_words:
            words_average_length = sum(len(w) for w in d) / num_different_words
        else:
            words_average_length = 0.0
        # Each such word has a count of 1, so counting them equals summing
        # their counts.
        num_once = sum(1 for w in d if d[w] == 1)
        # (count, word) tuples order by count first and by word on ties;
        # reverse=True puts the highest count first.
        lst = sorted(((d[w], w) for w in d), reverse=True)
        print("The file %s has:" % fname)
        print("  %s characters" % num_chars)
        print("  %s lines" % num_lines)
        print("  %s words" % num_words)
        print("  %s words appreance one time" % num_once)
        print("  %s different words" % num_different_words)
        print("  %s average length" % words_average_length)
        print("\nThe top 10 most frequent words are:")
        for i, (count, word) in enumerate(lst[:10], 1):
            print('%2s. %4s %s' % (i, count, word))
    
    def main():
        """Print the statistics report for the sample file 'a.txt'."""
        sample_file = 'a.txt'
        file_stats2(sample_file)

    # Run the report only when the file is executed as a script.
    if __name__ == '__main__':
        main()
    >>> ================================ RESTART ================================
    >>> 
    The file a.txt has:
      12927 characters
      297 lines
      1645 words
      515 words appreance one time
      699 different words
      6.539341917024321 average length
    
    The top 10 most frequent words are:
     1.   62 to
     2.   62 the
     3.   47 is
     4.   42 a
     5.   41 of
     6.   40 it
     7.   36 that
     8.   35 and
     9.   32 as
    10.   24 so
  • 相关阅读:
    复合优先于继承
    在共有类中使用访问方法而非共有域
    在共有类中使用访问方法而非共有域
    复合优先于继承
    矩阵求导(转)
    machine learning
    矩阵求导(转)
    使可变性最小化
    machine learning
    使可变性最小化
  • 原文地址:https://www.cnblogs.com/dplearning/p/3956242.html
Copyright © 2011-2022 走看看