zoukankan      html  css  js  c++  java
  • 爬虫大作业

    import requests
    from bs4 import BeautifulSoup
    import jieba
    import matplotlib.pyplot as plt
    from scipy.misc import imread
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    
    
    def get_url(urls, pages=100):
        """Append cnblogs news list-page URLs to *urls* and return it.

        Args:
            urls: list to extend in place.
            pages: number of list pages to generate (default 100; the
                original hard-coded 100, kept as the default for
                backward compatibility). Pages 0 .. pages-1 are emitted.

        Returns:
            The same *urls* list, extended.
        """
        for n in range(pages):
            urls.append('https://news.cnblogs.com/n/page/' + str(n) + '/')
        return urls
    
    
    def get_info(url, content):
        """Fetch one news list page and append its headline texts to *content*.

        Args:
            url: URL of a cnblogs news list page.
            content: list to extend in place; one entry is appended per
                headline, each entry being the headline text split on
                internal newlines.

        Returns:
            The same *content* list, extended.
        """
        res = requests.get(url)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        news = soup.select('div[class="content"] h2')
        for nn in news:
            # The scrape mangled the '\n' escape sequences into literal line
            # breaks, which broke these string literals; restored here:
            # trim surrounding newlines, then split on internal ones.
            content.append(nn.get_text().strip('\n').split('\n'))
        return content
    
    
    # Accumulators shared by the (commented-out) one-off crawl below.
    urls = []
    single_content =[]
    all_content = []
    # One-off crawl driver: fetch the page URLs, scrape each, and dump the
    # first headline of every entry to blog.txt. Left commented out so that
    # re-running the script reuses the already-saved blog.txt.
    # urls = get_url(urls)
    # for u in urls:
    #     all_content.append(get_info(u, single_content))
    # name = open('blog.txt', 'w', encoding='utf-8')
    # for cc in all_content[0]:
    #     name.write(str(cc[0]) + '\n')
    # name.close()
    
    
    def jieba_split():
        """Segment blog.txt with jieba and save the space-joined tokens.

        Reads the raw headline dump from ``blog.txt`` and writes the
        segmented, space-separated text to ``blog_split.txt``.
        """
        with open('blog.txt', encoding='utf-8') as src:
            raw_text = src.read()
        segmented = ' '.join(jieba.cut(raw_text))
        with open('blog_split.txt', 'w', encoding='utf-8') as dst:
            dst.write(segmented)
    
    # jieba_split()
    
    def wordcouter():
        """Count word frequencies in blog_split.txt and write them to counter.txt.

        Each output line has the form ``word:count``. The segmented text is
        re-cut with jieba line by line before counting.
        """
        from collections import Counter  # stdlib; local import avoids touching the header

        word_lists = []
        with open('blog_split.txt', 'r', encoding='utf-8') as f:
            for line in f:
                word_lists.extend(jieba.cut(line))

        # Counter counts everything in one O(n) pass; the original called
        # list.count() once per distinct word, which is O(n * distinct).
        counts = Counter(word_lists)
        # NOTE(review): the original appended u"" (no separator) — almost
        # certainly a scrape-mangled u"\n"; without it writelines() would
        # emit one giant line. Newline restored.
        couter = [w + u':' + str(c) + u'\n' for w, c in counts.items()]
        with open('counter.txt', 'w', encoding='utf-8') as f:
            f.writelines(couter)
    # wordcouter()
    
    def word_cloud():
        """Render a word cloud of counter.txt onto the hellokity.JPG mask.

        Reads the word:count dump, re-segments it with jieba (full mode),
        and writes the rendered cloud to ``kity.JPG``.
        """
        # 'with' closes the handle; the original leaked an open file object.
        with open('counter.txt', 'r', encoding='utf-8') as f:
            s_words = f.read()
        words = jieba.cut(s_words, cut_all=True)
        words_split = " ".join(words)
        print(words_split)
        # NOTE(review): scipy.misc.imread was removed in SciPy 1.2 — switch
        # to plt.imread (already imported) when upgrading SciPy.
        background_pic = imread('hellokity.JPG')
        word_c = WordCloud(
            width=1000,
            height=1000,
            margin=2,
            background_color='white',
            mask=background_pic,
            # Raw string restores the backslashes the scraped path lost
            # ('C:WindowsFontsSTZHONGS.TTF' is not a valid Windows path).
            font_path=r'C:\Windows\Fonts\STZHONGS.TTF',
            stopwords=STOPWORDS,
            max_font_size=100,
            random_state=100
        )
        word_c.generate_from_text(words_split)
        word_c.to_file('kity.JPG')

    word_cloud()

  • 相关阅读:
    Struts2拦截器
    struts2介绍
    java读写文件大全
    Intent的详细解析以及用法
    sigmoid和softmax的应用意义区别
    C 实现 创建多个txt文件,并以自然数列命名,然后将产生的十进制数据写入txt文档
    k-means原理和python代码实现
    非极大值抑制 NMS
    JetSonNano darknet yolov3工程通过CMakeLists.txt配置编译环境
    C文件 CMakeList.txt编译器配置错误的问题 error:invalid conversion from 'int' to 'LAYER_TYPE' [-fpermissive]....
  • 原文地址:https://www.cnblogs.com/severusandsusa/p/8934009.html
Copyright © 2011-2022 走看看