zoukankan      html  css  js  c++  java
  • 爬虫大作业

    import jieba.analyse
    from PIL import Image,ImageSequence
    import numpy as np
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud,ImageColorGenerator
    import requests
    from urllib import parse
    from bs4 import BeautifulSoup
    
    def getWord():
        lyric = ''
        # 打开文档,进行编译,防止错误
        f = open('youku.txt', 'r', encoding='utf-8')
        # 将文档里面的数据进行单个读取,便于生成词云
        for i in f:
            lyric += f.read()
        #     进行分析
        result = jieba.analyse.textrank(lyric, topK=50, withWeight=True)
        keywords = dict()
        for i in result:
            keywords[i[0]] = i[1]
        print(keywords)
    
        # 获取词云生成所需要的模板图片
        image = Image.open('789.jpg')
        graph = np.array(image)
        # 进行词云的设置
        wc = WordCloud(font_path='./fonts/simhei.ttf', background_color='White', max_words=50, mask=graph)
        wc.generate_from_frequencies(keywords)
        image_color = ImageColorGenerator(graph)
        plt.imshow(wc)
        plt.imshow(wc.recolor(color_func=image_color))
        plt.axis("off")
        plt.show()
        wc.to_file('dream.png')
    
    name = 'youku'
    unique = parse.quote(name)
    print(unique)
    url = 'http://list.youku.com/category/show/c_96_g_%E7%A7%91%E5%B9%BB_s_1_d_1.html?spm=a2hmv.20009921.m_86982.5~5~5!3~1~3!5~A'
    print(url)
    
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    titles = soup.select(".info-list .title a")
    for i in range(0,len(titles)):
        title = titles[i].text
        f = open('youku.txt', 'a', encoding='utf-8')
        f.write(title)
        f.write("
    ")
        f.close()
        # print(title)
    getWord()

  • 相关阅读:
    css基础1
    js基础
    定位与浮动
    最新学习
    前端初学第一天
    js2
    js1
    html加css
    js三元表达式
    java script的学习
  • 原文地址:https://www.cnblogs.com/darkhate/p/8922674.html
Copyright © 2011-2022 走看看