zoukankan      html  css  js  c++  java
  • 爬虫大作业

    import jieba.analyse
    from PIL import Image,ImageSequence
    import numpy as np
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud,ImageColorGenerator
    import requests
    from urllib import parse
    from bs4 import BeautifulSoup
    
    def getWord():
        lyric = ''
        # 打开文档,进行编译,防止错误
        f = open('youku.txt', 'r', encoding='utf-8')
        # 将文档里面的数据进行单个读取,便于生成词云
        for i in f:
            lyric += f.read()
        #     进行分析
        result = jieba.analyse.textrank(lyric, topK=50, withWeight=True)
        keywords = dict()
        for i in result:
            keywords[i[0]] = i[1]
        print(keywords)
    
        # 获取词云生成所需要的模板图片
        image = Image.open('789.jpg')
        graph = np.array(image)
        # 进行词云的设置
        wc = WordCloud(font_path='./fonts/simhei.ttf', background_color='White', max_words=50, mask=graph)
        wc.generate_from_frequencies(keywords)
        image_color = ImageColorGenerator(graph)
        plt.imshow(wc)
        plt.imshow(wc.recolor(color_func=image_color))
        plt.axis("off")
        plt.show()
        wc.to_file('dream.png')
    
    name = 'youku'
    unique = parse.quote(name)
    print(unique)
    url = 'http://list.youku.com/category/show/c_96_g_%E7%A7%91%E5%B9%BB_s_1_d_1.html?spm=a2hmv.20009921.m_86982.5~5~5!3~1~3!5~A'
    print(url)
    
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    titles = soup.select(".info-list .title a")
    for i in range(0,len(titles)):
        title = titles[i].text
        f = open('youku.txt', 'a', encoding='utf-8')
        f.write(title)
        f.write("
    ")
        f.close()
        # print(title)
    getWord()

  • 相关阅读:
    String ,StringBuilder, StringBuffer
    apt-get方式删除软件
    curl命令的使用
    maven自动部署测试Web应用
    几个重要的maven命令
    linux中默认jdk的配置
    登录注册的页面制作
    运用php做投票题,例题
    复选框式查询 例题租房子
    会话用法 和留言板例题
  • 原文地址:https://www.cnblogs.com/darkhate/p/8922674.html
Copyright © 2011-2022 走看看