  • Web Crawler Assignment

    import requests
    from bs4 import BeautifulSoup
    import jieba
    import matplotlib.pyplot as plt
    # Note: scipy.misc.imread was removed in SciPy 1.2; plt.imread (matplotlib, imported above) is used instead
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
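    # Pipeline: crawl cnblogs news headlines, segment the text with jieba,
    # count word frequencies, then render a word cloud shaped by a mask image.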
    
    
    def get_url(urls):
        # cnblogs news list pages are numbered starting from 1
        for n in range(1, 101):
            url = 'https://news.cnblogs.com/n/page/' + str(n) + '/'
            urls.append(url)
        return urls
    
    
    def get_info(url, content):
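        # Fetch one news-list page and collect each headline from <div class="content"><h2>.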
        res = requests.get(url)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        news = soup.select('div[class="content"] h2')
        for nn in news:
            content.append(nn.get_text().strip('\n').split('\n'))
        return content
    
    
    urls = []
    single_content = []
    all_content = []
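    # The driver code below is left commented out; it was presumably run once to
    # cache the crawled headlines in blog.txt, which the later stages read.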
    # urls = get_url(urls)
    # for u in urls:
    #     all_content.append(get_info(u, single_content))
    # name = open('blog.txt', 'w', encoding='utf-8')
    # for cc in all_content[0]:
    #     name.write(str(cc[0]) + '\n')
    # name.close()
    
    
    def jieba_split():
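        # Segment the cached headlines with jieba and save the space-separated result.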
        with open('blog.txt', encoding='utf-8') as f:
            comment_text = f.read()
        cut_text = " ".join(jieba.cut(comment_text))
        with open('blog_split.txt', 'w', encoding='utf-8') as f:
            f.write(cut_text)
    
    # jieba_split()
    
    def wordcouter():
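        # Count how often each segmented word appears and write "word:count" lines to counter.txt.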
        word_lists = []
        with open('blog_split.txt', 'r', encoding='utf-8') as f:
            words = f.readlines()
            for ww in words:
                s_word = list(jieba.cut(ww))
                for word in s_word:
                    word_lists.append(word)
    
        word_lists_set = set(word_lists)
        couter = []
        for w in word_lists_set:
            # one "word:count" entry per line
            couter.append(w + u':' + str(word_lists.count(w)) + u'\n')
        with open('counter.txt', 'w', encoding='utf-8') as f:
            f.writelines(couter)
    # wordcouter()
    
    def word_cloud():
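        # Build the word cloud from counter.txt, shaping it with the mask image and
        # using a Chinese font so CJK characters render.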
        with open('counter.txt', 'r', encoding='utf-8') as f:
            s_words = f.read()
        words = jieba.cut(s_words, cut_all=True)
        words_split = " ".join(words)
        print(words_split)
        background_pic = plt.imread('hellokity.JPG')  # mask image that shapes the cloud
        word_c = WordCloud(
            width=1000,
            height=1000,
            margin=2,
            background_color='white',
            mask=background_pic,
            font_path=r'C:\Windows\Fonts\STZHONGS.TTF',  # a Chinese font so CJK glyphs render
            stopwords=STOPWORDS,
            max_font_size=100,
            random_state=100
        )
        word_c.generate_from_text(words_split)
        word_c.to_file('kity.JPG')
    
    word_cloud()
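
A note on the counting step: `wordcouter` recomputes each word's count with `list.count` inside a loop, which is quadratic in the number of words, and the cloud is then regenerated from the text of `counter.txt` rather than from the counts themselves. A minimal alternative sketch, assuming `collections.Counter` and WordCloud's `generate_from_frequencies` (not part of the original post):

    # Hypothetical alternative, not from the original post: count words with
    # collections.Counter and feed the frequencies to WordCloud directly.
    from collections import Counter

    def word_cloud_from_counts():
        with open('blog_split.txt', encoding='utf-8') as f:
            words = f.read().split()
        counts = Counter(w for w in words if w.strip())
        wc = WordCloud(
            width=1000,
            height=1000,
            background_color='white',
            font_path=r'C:\Windows\Fonts\STZHONGS.TTF',  # assumed font path, same as above
        )
        wc.generate_from_frequencies(counts)
        wc.to_file('kity_from_counts.JPG')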
