zoukankan      html  css  js  c++  java
  • 爬虫大作业(虎扑足球新闻)

    import requests
    from bs4 import BeautifulSoup
    import jieba
    from PIL import Image,ImageSequence
    import numpy as np
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud,ImageColorGenerator
    def changeTitleToDict():
        """Count how often each jieba token occurs in ``yingchao.txt``.

        The file is read as UTF-8, segmented with ``jieba.cut``, and every
        token that is not in the small punctuation/whitespace set below is
        mapped to its number of occurrences. The resulting dict is printed
        and returned.

        Returns:
            dict: token -> occurrence count.

        Raises:
            FileNotFoundError: if ``yingchao.txt`` does not exist.
        """
        from collections import Counter

        # The context manager guarantees the handle is closed after reading.
        with open('yingchao.txt', 'r', encoding='utf-8') as f:
            text = f.read()

        # Tokens that should not be counted as words.
        symbol = {"/", "(", ")", " ", ";", "!", "、", ":"}

        # One pass over the token stream instead of list.count() per token.
        counts = Counter(jieba.cut(text))
        title_dict = {
            word: count for word, count in counts.items() if word not in symbol
        }
        print(title_dict)
        return title_dict
    # Crawl list pages 1-9 of the Hupu soccer tag 496 and append every
    # headline (<span class="n1">) to yingchao.txt.
    # The output file is opened once for the whole crawl instead of being
    # reopened for every single headline.
    with open('yingchao.txt', 'a', encoding='utf-8') as out_file:
        for page in range(1, 10):
            hupu = 'https://voice.hupu.com/soccer/tag/496-%s.html' % page
            # A timeout keeps an unresponsive server from hanging the script.
            reslist = requests.get(hupu, timeout=10)
            reslist.encoding = 'utf-8'
            soup_list = BeautifulSoup(reslist.text, 'html.parser')
            for news in soup_list.find_all('span', class_='n1'):
                print(news.text)
                out_file.write(news.text)
    
    title_dict = changeTitleToDict()
    # Absolute path to the SimHei TrueType font on Windows, passed to
    # WordCloud as font_path.
    font = r'C:\Windows\Fonts\simhei.ttf'
    # NOTE(review): only the dict keys are joined here, so the counts computed
    # by changeTitleToDict() do not reach WordCloud -- confirm whether
    # generate_from_frequencies(title_dict) was intended.
    content = ' '.join(title_dict.keys())

    # Build the word cloud, using 1.jpg as the shape mask.
    with Image.open('1.jpg') as mask_img:
        image = np.array(mask_img)
    wordcloud = WordCloud(
        background_color='white',
        font_path=font,
        mask=image,
        width=1000,
        height=860,
        margin=2,
    ).generate(content)

    # Recolor the words using the colors of 2.jpg.
    with Image.open('2.jpg') as color_img:
        image2 = np.array(color_img)
    image_colors = ImageColorGenerator(image2)
    wordcloud.recolor(color_func=image_colors)

    # Show the generated word cloud, then save it to 3.jpg.
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    wordcloud.to_file('3.jpg')
    

    背景图

    字体颜色图

    词云图

  • 相关阅读:
    T-SQL逻辑查询的各阶段
    linqtosql(GroupBy/Having/Count/Sum/Min/Max/Avg操作符)
    jquery获取select下拉框的值以及change时间绑定
    英汉翻译功能的简单实现
    jQuery zTree默认加载一级几点
    Linq第一篇
    vs2015+win10搭开发环境
    asp.net mvc 中使用静态页
    .net中的异步操作
    (转载)TFS2013安装+配置
  • 原文地址:https://www.cnblogs.com/Lorz/p/8969234.html
Copyright © 2011-2022 走看看