zoukankan      html  css  js  c++  java
  • 爬虫大作业

    import requests
    from bs4 import BeautifulSoup
    import jieba
    import matplotlib.pyplot as plt
    from scipy.misc import imread
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    
    
    def get_url(urls, pages=100):
        """Append cnblogs news list-page URLs to *urls* and return it.

        Args:
            urls: list to extend in place.
            pages: number of list pages to generate (default 100; the
                original hard-coded 100, kept as the default for
                backward compatibility). Pages 0 .. pages-1 are emitted.

        Returns:
            The same *urls* list, extended.
        """
        for n in range(pages):
            urls.append('https://news.cnblogs.com/n/page/' + str(n) + '/')
        return urls
    
    
    def get_info(url, content):
        """Fetch one news list page and append its headline texts to *content*.

        Args:
            url: URL of a cnblogs news list page.
            content: list to extend in place; one entry is appended per
                headline, each entry being the headline text split on
                internal newlines.

        Returns:
            The same *content* list, extended.
        """
        res = requests.get(url)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        news = soup.select('div[class="content"] h2')
        for nn in news:
            # The scrape mangled the '\n' escape sequences into literal line
            # breaks, which broke these string literals; restored here:
            # trim surrounding newlines, then split on internal ones.
            content.append(nn.get_text().strip('\n').split('\n'))
        return content
    
    
    # Accumulators shared by the (commented-out) one-off crawl below.
    urls = []
    single_content =[]
    all_content = []
    # One-off crawl driver: fetch the page URLs, scrape each, and dump the
    # first headline of every entry to blog.txt. Left commented out so that
    # re-running the script reuses the already-saved blog.txt.
    # urls = get_url(urls)
    # for u in urls:
    #     all_content.append(get_info(u, single_content))
    # name = open('blog.txt', 'w', encoding='utf-8')
    # for cc in all_content[0]:
    #     name.write(str(cc[0]) + '\n')
    # name.close()
    
    
    def jieba_split():
        """Segment blog.txt with jieba and save the space-joined tokens.

        Reads the raw headline dump from ``blog.txt`` and writes the
        segmented, space-separated text to ``blog_split.txt``.
        """
        with open('blog.txt', encoding='utf-8') as src:
            raw_text = src.read()
        segmented = ' '.join(jieba.cut(raw_text))
        with open('blog_split.txt', 'w', encoding='utf-8') as dst:
            dst.write(segmented)
    
    # jieba_split()
    
    def wordcouter():
        """Count word frequencies in blog_split.txt and write them to counter.txt.

        Each output line has the form ``word:count``. The segmented text is
        re-cut with jieba line by line before counting.
        """
        from collections import Counter  # stdlib; local import avoids touching the header

        word_lists = []
        with open('blog_split.txt', 'r', encoding='utf-8') as f:
            for line in f:
                word_lists.extend(jieba.cut(line))

        # Counter counts everything in one O(n) pass; the original called
        # list.count() once per distinct word, which is O(n * distinct).
        counts = Counter(word_lists)
        # NOTE(review): the original appended u"" (no separator) — almost
        # certainly a scrape-mangled u"\n"; without it writelines() would
        # emit one giant line. Newline restored.
        couter = [w + u':' + str(c) + u'\n' for w, c in counts.items()]
        with open('counter.txt', 'w', encoding='utf-8') as f:
            f.writelines(couter)
    # wordcouter()
    
    def word_cloud():
        """Render a word cloud of counter.txt onto the hellokity.JPG mask.

        Reads the word:count dump, re-segments it with jieba (full mode),
        and writes the rendered cloud to ``kity.JPG``.
        """
        # 'with' closes the handle; the original leaked an open file object.
        with open('counter.txt', 'r', encoding='utf-8') as f:
            s_words = f.read()
        words = jieba.cut(s_words, cut_all=True)
        words_split = " ".join(words)
        print(words_split)
        # NOTE(review): scipy.misc.imread was removed in SciPy 1.2 — switch
        # to plt.imread (already imported) when upgrading SciPy.
        background_pic = imread('hellokity.JPG')
        word_c = WordCloud(
            width=1000,
            height=1000,
            margin=2,
            background_color='white',
            mask=background_pic,
            # Raw string restores the backslashes the scraped path lost
            # ('C:WindowsFontsSTZHONGS.TTF' is not a valid Windows path).
            font_path=r'C:\Windows\Fonts\STZHONGS.TTF',
            stopwords=STOPWORDS,
            max_font_size=100,
            random_state=100
        )
        word_c.generate_from_text(words_split)
        word_c.to_file('kity.JPG')

    word_cloud()

  • 相关阅读:
    Struts2拦截器
    struts2介绍
    java读写文件大全
    Intent的详细解析以及用法
    sigmoid和softmax的应用意义区别
    C 实现 创建多个txt文件,并以自然数列命名,然后将产生的十进制数据写入txt文档
    k-means原理和python代码实现
    非极大值抑制 NMS
    JetSonNano darknet yolov3工程通过CMakeLists.txt配置编译环境
    C文件 CMakeList.txt编译器配置错误的问题 error:invalid conversion from 'int' to 'LAYER_TYPE' [-fpermissive]....
  • 原文地址:https://www.cnblogs.com/severusandsusa/p/8934009.html
Copyright © 2011-2022 走看看