zoukankan      html  css  js  c++  java
  • 10.15作业

    1.英文

    # English word-frequency script: read a text file, split it into
    # lower-case words, drop stop words, and print the most frequent words.
    from collections import Counter

    # Every character listed here is treated as a word separator.
    SEPARATORS = " .!@#%&*;:',.?/_“’”"

    # Tokens excluded from the ranking (function words and stray fragments).
    NO_MEAN = frozenset({'is', 'and', 'a', 'this', 'the', 'in', 'at', 'on',
                         'to', 's', 'his', '3', '1983'})


    def tokenize(text, separators=SEPARATORS):
        """Return the lower-cased words of *text*.

        Each character in *separators* is replaced by a space in a single
        pass before the text is lower-cased and split on whitespace.
        """
        table = str.maketrans(dict.fromkeys(separators, ' '))
        return text.translate(table).lower().split()


    def rank_words(words, stop_words=NO_MEAN):
        """Return ``(word, count)`` pairs sorted by descending count.

        Words found in *stop_words* are ignored. Counting is done in one
        pass over *words*; words with equal counts keep the order in which
        they first appear, so the result is the same on every run.
        """
        counts = Counter(word for word in words if word not in stop_words)
        return counts.most_common()


    def main(path='steve.txt', top_n=20):
        """Print the tokens, vocabulary, counts and top *top_n* words of *path*."""
        # Read the whole text.
        with open(path, 'r', encoding='utf-8') as f:
            novel = f.read()

        # Clean the data, convert to lower case, and split into words.
        strnovel = tokenize(novel)
        print(strnovel, len(strnovel))

        # Distinct words with the stop words removed.
        strset = set(strnovel) - NO_MEAN
        print(strset, len(strset))

        # Count occurrences and sort by frequency, most frequent first.
        wordlist = rank_words(strnovel)
        strdict = dict(wordlist)
        print(strdict, len(strdict))
        print(wordlist)

        # Print the top entries; slicing copes with texts that contain
        # fewer than top_n distinct words.
        for entry in wordlist[:top_n]:
            print(entry)


    if __name__ == '__main__':
        main()
    

     

    2.中文小说

    #-*- coding:utf-8 -*-
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud
    import jieba
    
    
    
    # Chinese novel script: segment the text with jieba, count words of two
    # or more characters, print the most frequent ones, and draw a word cloud.

    # ASCII punctuation replaced by spaces before segmentation.
    SEPARATORS = " ,.?;:'!*#-_"

    # Raw string so the backslashes stay literal: in a normal string literal
    # "\N" starts a named-character escape and the file fails to compile.
    # The path must begin directly with the drive letter (no invisible
    # formatting character pasted in front of it).
    FONT_PATH = r"C:\Windows\Fonts\NotoSansHans-Black_0.otf"


    def clean_text(text, separators=SEPARATORS):
        """Return *text* with every character of *separators* turned into a space."""
        return text.translate(str.maketrans(dict.fromkeys(separators, ' ')))


    def count_words(tokens):
        """Return a ``{word: count}`` dict for the tokens worth keeping.

        Tokens that hold fewer than two non-whitespace-trimmed characters
        (single characters, runs of spaces or line breaks) are skipped.
        """
        data = {}
        for word in tokens:
            if len(word.strip()) < 2:
                continue
            data[word] = data.get(word, 0) + 1
        return data


    def rank_words(data):
        """Return the ``(word, count)`` pairs of *data*, highest count first."""
        return sorted(data.items(), key=lambda pair: pair[1], reverse=True)


    def main(path='doupo.txt', top_n=20):
        """Print the top *top_n* words of *path* and save a word cloud image."""
        with open(path, 'r', encoding='utf-8') as f:
            doupo = f.read()

        # Segment the cleaned text into words.
        word_list = jieba.cut(clean_text(doupo))
        print(type(word_list))

        # Word-frequency analysis.
        data = count_words(word_list)
        result = rank_words(data)
        # Slicing copes with texts that yield fewer than top_n words.
        for entry in result[:top_n]:
            print(entry)

        if not data:
            # Nothing was counted, so there is nothing to draw.
            return

        wc = WordCloud(background_color="black",  # background colour
                       # mask=...,  # optional background image
                       max_words=2000,  # maximum number of words shown
                       # stopwords=...,  # optional stop words
                       # A font with Chinese glyphs is required; the default
                       # word-cloud font (DroidSansMono) cannot render Chinese.
                       font_path=FONT_PATH,
                       max_font_size=40,  # largest font size
                       random_state=30,  # seed for layout and colours
                       )
        # Pass the counts themselves so word size reflects frequency; a
        # string containing each distinct word once would discard the counts.
        mywc = wc.generate_from_frequencies(data)

        # Show the word cloud.
        plt.imshow(mywc)
        plt.axis("off")
        plt.show()
        wc.to_file('myword.jpg')  # save the image file


    if __name__ == '__main__':
        main()
    

      

  • 相关阅读:
    C/C++中的abort、atexit、exit和_Exit
    从QQ聊天看交流的有效性
    HDU 1711 Number Sequence (KMP)
    pintos操作系统thread部分的实现
    搜狗面试的经典题(C++map按值排序,class struct的差别)
    SQL学习之使用order by 依照指定顺序排序或自己定义顺序排序
    选中多个或全中订单逻辑
    HDU 4415 Assassin's Creed(贪心)
    ashx文件中使用session提示“未将对象引用设置到对象的实例”
    .NET运行机制
  • 原文地址:https://www.cnblogs.com/czx98/p/9790099.html
Copyright © 2011-2022 走看看