zoukankan      html  css  js  c++  java
  • 词频统计(网易新闻)

    import os
    import re
    from collections import Counter

    import jieba
    import requests
    
    # Create the folder used for scraped article text. Attempting the mkdir
    # and tolerating "already exists" avoids the check-then-create race of
    # testing os.path.exists() first.
    try:
        os.mkdir('网易新闻')
    except FileExistsError:
        pass
    
    # Crawl the listed sports.163.com channels and gather the body text of
    # every article page that matches the expected markup.
    #   count - number of articles successfully extracted
    #   str_  - all extracted article bodies concatenated (built after the loop)
    count = 0
    articles = []  # one cleaned article body per entry; joined into str_ below

    for channel in ['nba']:
        # Fetch the channel index page and collect every distinct link found
        # on it. A timeout keeps a stalled connection from hanging the script.
        response = requests.get(f'https://sports.163.com/{channel}/', timeout=10)
        data = response.text
        url_res = set(re.findall('href="(https://sports.163.com/.*?)"', data))

        # Process each linked page individually.
        for url in url_res:
            try:
                url_response = requests.get(url, timeout=10)
            except requests.RequestException:
                # One unreachable page should not abort the whole crawl.
                continue
            url_data = url_response.text

            try:
                title = re.findall('<h1>(.*?)</h1>', url_data, re.S)[0]
                news_res = re.findall(
                    '<div class="post_text" id="endText" style="border-top:1px solid #ddd;">(.*?责任编辑:.*?)</span>',
                    url_data, re.S)[0]
            except IndexError:
                # No headline or no article body matched: skip this page.
                continue

            # Drop the HTML tags that remain inside the article body.
            news_res = re.sub('<.*?>', '', news_res)

            # Strip punctuation and whitespace from the headline so it can be
            # used as a file name.
            title = re.sub(r'[!"#$%&()*+,\-./:;<=>?@\[\]^_‘{|}~,…]|\s', '', title)
            title_path = os.path.join('网易新闻', f'{title}.txt')  # per-article file path

            # Saving each article to its own file is currently disabled:
            # with open(title_path, 'w', encoding='utf8') as f:
            #     f.write(news_res)

            count += 1
            articles.append(news_res)

            print(f'完成{count}篇, {title} done...')

    # Join once at the end instead of growing a string inside the loop.
    str_ = ''.join(articles)
    
    # Segment the combined article text into words with jieba, then count how
    # often each word occurs. Single-character tokens are skipped.
    res = jieba.lcut(str_)
    dic = Counter(word for word in res if len(word) != 1)

    # (word, count) pairs in first-seen order, ready to be ranked by count.
    dic_list = list(dic.items())
    
    def func(i):
        """Return the element at index 1 of ``i`` (the sort key for ranking)."""
        second = i[1]
        return second
    
    # Rank the pairs from highest to lowest key value: ascending sort by
    # func, then a full reversal, written back into the same list.
    dic_list[:] = reversed(sorted(dic_list, key=func))

    # Print the first 20 entries and join their first elements into a single
    # string in which every word is followed by a comma.
    top_entries = dic_list[:20]
    for entry in top_entries:
        print(entry)
    new_str = ''.join(f'{entry[0]},' for entry in top_entries)
    
    import wordcloud
    
    # Render the top words as a word-cloud image and save it as a PNG.
    # font_path selects the font used for drawing; presumably a CJK-capable
    # font is chosen so the Chinese words render.
    # NOTE(review): the previous value ('C:WindowsFonts等线Deng') had no path
    # separators and could not resolve to a file. DengXian Regular is normally
    # installed as C:\Windows\Fonts\Deng.ttf - confirm on the target machine.
    w = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\Deng.ttf')
    w.generate(new_str)
    w.to_file('网易新闻.png')
    
  • 相关阅读:
    【python-opencv】opencv基础操作之一
    【胎教】做AI的基础,开始学习。
    【实习】博士生找实习的囧事之其一
    【经验】CS
    【keras】用tensorboard监视CNN每一层的输出
    【算法】背包九讲
    【计算机网络】大数据 云计算 人工智能
    【算法】shortest distance
    【git】git hello world
    【算法】深度优先 马走日 Hamilton routes
  • 原文地址:https://www.cnblogs.com/yushan1/p/11232397.html
Copyright © 2011-2022 走看看