zoukankan      html  css  js  c++  java
  • 爬虫大作业

    import jieba.analyse
    from PIL import Image,ImageSequence
    import numpy as np
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud,ImageColorGenerator
    import requests
    from urllib import parse
    from bs4 import BeautifulSoup
    
    def getWord():
        """Build a word cloud from the text stored in ``youku.txt``.

        Reads the whole file, extracts the top 50 keywords with jieba's
        TextRank implementation, prints the keyword/weight mapping, renders
        a word cloud shaped and coloured by ``789.jpg``, shows it with
        matplotlib and saves it to ``dream.png``.

        Returns:
            None. The results are the printed mapping, the displayed figure
            and the ``dream.png`` file.
        """
        # Read the entire file in one call. Iterating over the handle and
        # calling read() inside the loop discards the first line, so the
        # first title would never reach the keyword analysis.
        with open('youku.txt', 'r', encoding='utf-8') as f:
            lyric = f.read()

        # With withWeight=True textrank yields (word, weight) pairs, which
        # dict() turns directly into the frequency mapping WordCloud expects.
        result = jieba.analyse.textrank(lyric, topK=50, withWeight=True)
        keywords = dict(result)
        print(keywords)

        # Load the template picture used both as mask and as colour source.
        image = Image.open('789.jpg')
        graph = np.array(image)

        # Configure and generate the word cloud from the keyword weights.
        wc = WordCloud(font_path='./fonts/simhei.ttf', background_color='White', max_words=50, mask=graph)
        wc.generate_from_frequencies(keywords)

        # Recolour the words using the template picture's colours. Only the
        # recoloured cloud is drawn; drawing the uncoloured one first was
        # redundant because it was immediately covered.
        image_color = ImageColorGenerator(graph)
        plt.imshow(wc.recolor(color_func=image_color))
        plt.axis("off")
        plt.show()
        wc.to_file('dream.png')
    
    # URL-quoted form of the site name; it is only printed, the request
    # below uses the hard-coded category URL.
    name = 'youku'
    unique = parse.quote(name)
    print(unique)
    url = 'http://list.youku.com/category/show/c_96_g_%E7%A7%91%E5%B9%BB_s_1_d_1.html?spm=a2hmv.20009921.m_86982.5~5~5!3~1~3!5~A'
    print(url)

    # Fetch the listing page. The timeout keeps the script from hanging
    # forever if the server never answers.
    res = requests.get(url, timeout=10)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    titles = soup.select(".info-list .title a")

    # Append one title per line to youku.txt. The file is opened once for
    # the whole batch and closed by the context manager even on error.
    # NOTE: mode 'a' means repeated runs accumulate titles in the file.
    with open('youku.txt', 'a', encoding='utf-8') as f:
        for link in titles:
            f.write(link.text)
            f.write("\n")

    # Generate the word cloud from everything collected in youku.txt.
    getWord()

  • 相关阅读:
    drf 三大认证详解
    管理表页面的创建
    电脑自动关机设置
    jwt 认证规则
    视图家族练习
    JQuery 数组获取和删除元素
    JQurey 添加和删除元素
    Java 占位符
    Redis
    线程
  • 原文地址:https://www.cnblogs.com/darkhate/p/8922674.html
Copyright © 2011-2022 走看看