zoukankan      html  css  js  c++  java
  • 10.15作业

    1.英文

    # Word-frequency analysis of an English text file: split the text into
    # lowercase words, drop stop words, count occurrences and print the 20
    # most frequent words.
    from collections import Counter

    # Characters treated as word separators (replaced by spaces before splitting).
    SEPARATORS = " .!@#%&*;:',?/_“’”"
    # Tokens excluded from the frequency ranking.
    STOP_WORDS = {'is', 'and', 'a', 'this', 'the', 'in', 'at', 'on', 'to', 's', 'his', '3', '1983'}
    # Number of entries printed in the final ranking.
    TOP_N = 20


    def tokenize(text, separators=SEPARATORS):
        """Return the lowercase words of *text*.

        Every character in *separators* is replaced by a space in a single
        pass, the text is lowercased, and the result is split on whitespace.
        """
        table = str.maketrans(dict.fromkeys(separators, ' '))
        return text.translate(table).lower().split()


    def count_words(words, stop_words=STOP_WORDS):
        """Return a Counter of *words*, ignoring anything in *stop_words*.

        One pass over the word list; the previous version called
        ``list.count`` once per unique word, which is quadratic.
        """
        return Counter(word for word in words if word not in stop_words)


    def rank_words(counts, limit=None):
        """Return (word, count) pairs sorted by count, most frequent first.

        *limit* caps the number of pairs returned; ``None`` returns all of
        them. Fewer than *limit* pairs are returned when there are not
        enough distinct words, instead of raising IndexError.
        """
        return counts.most_common(limit)


    def main(path='steve.txt'):
        """Read *path*, print the intermediate results and the top words."""
        # Read the source text.
        with open(path, 'r', encoding='utf-8') as f:
            novel = f.read()

        # Clean the data, lowercase it and split it into words.
        strnovel = tokenize(novel)
        print(strnovel, len(strnovel))

        # Unique words with the stop words removed.
        strset = set(strnovel) - STOP_WORDS
        print(strset, len(strset))

        # Number of occurrences of each remaining word.
        strdict = count_words(strnovel)
        print(dict(strdict), len(strdict))

        # Sort by frequency, descending.
        wordlist = rank_words(strdict)
        print(wordlist)

        # Print the top entries; slicing is safe when there are fewer than TOP_N.
        for item in wordlist[:TOP_N]:
            print(item)


    if __name__ == '__main__':
        main()
    

     

    2.中文小说

    #-*- coding:utf-8 -*-
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud
    import jieba
    
    
    
    # Word-frequency analysis of a Chinese novel: segment the text with jieba,
    # count the multi-character tokens, print the 20 most frequent ones and
    # render a word cloud sized by those frequencies.

    # Characters replaced by spaces before segmentation.
    SEPARATORS = " ,.?;:'!*#-_"
    # Number of entries printed in the ranking.
    TOP_N = 20
    # Font with CJK glyphs (the default word-cloud font cannot render Chinese).
    # Raw string so the backslashes stay literal: in a normal string "\N..."
    # is a malformed named-unicode escape and the file fails to compile.
    FONT_PATH = r"C:\Windows\Fonts\NotoSansHans-Black_0.otf"


    def clean_text(text, separators=SEPARATORS):
        """Return *text* with every character in *separators* replaced by a space."""
        table = str.maketrans(dict.fromkeys(separators, ' '))
        return text.translate(table)


    def count_tokens(tokens):
        """Return a dict mapping each token to its number of occurrences.

        Tokens that are a single character, or that consist only of
        whitespace, are skipped.
        """
        data = {}
        for word in tokens:
            if len(word.strip()) < 2:
                continue
            data[word] = data.get(word, 0) + 1
        return data


    def rank_tokens(data, limit=None):
        """Return (token, count) pairs sorted by count, most frequent first.

        *limit* caps the number of pairs returned; ``None`` returns all of
        them. Fewer than *limit* pairs are returned when *data* is small,
        instead of raising IndexError.
        """
        result = sorted(data.items(), key=lambda item: item[1], reverse=True)
        return result if limit is None else result[:limit]


    def main(path='doupo.txt'):
        """Read *path*, print the top tokens, then show and save the word cloud."""
        with open(path, 'r', encoding='utf-8') as f:
            doupo = f.read()

        # Segment the cleaned text into tokens.
        wordList = jieba.cut(clean_text(doupo))
        print(type(wordList))

        # Frequency analysis.
        data = count_tokens(wordList)
        for item in rank_tokens(data, TOP_N):
            print(item)

        wc = WordCloud(
            background_color="black",  # background colour
            max_words=2000,            # maximum number of words drawn
            font_path=FONT_PATH,
            max_font_size=40,          # largest font size used
            random_state=30,           # seed for the random layout/colouring
        )
        # Build the cloud from the counted frequencies. Joining the unique
        # words into one string, as before, presented every word exactly once
        # and discarded the counts computed above.
        mywc = wc.generate_from_frequencies(data)

        # Display the word cloud.
        plt.imshow(mywc)
        plt.axis("off")
        plt.show()
        wc.to_file('myword.jpg')  # save the image file


    if __name__ == '__main__':
        main()
    

      

  • 相关阅读:
    ant实践总结
    Ubuntu安装JDK和Eclipse
    Xp下安装Ubuntu
    [Hadoop] 传智播客新版Hadoop视频教程 段海涛老师Hadoop八天完全攻克Hadoop视频教程 Hadoop开发
    Ctypes简单应用以及PyQt5等简单使用过程中遇到的部分问题
    python初探:关于闭包
    利用Nginx+Frp搭建HTTP正向代理
    C#发送邮件时提示:“不允许使用邮箱名称。服务器响应为:”的错误解决办法
    提倡IT从业人员终身学习
    打算在博客园安家了
  • 原文地址:https://www.cnblogs.com/czx98/p/9790099.html
Copyright © 2011-2022 走看看