zoukankan      html  css  js  c++  java
  • 爬取微博热搜榜 李白之死 Python

    最近有关中国传统文化的内容频频登上热搜,比如最近的“李白之死”。今天换一种方式爬取:以前爬取微博评论时,参数是直接写在网址里的一大串;这次把参数提取出来做成一个字典,请求的时候再由 requests 构造 url。

     1 """
     2     就爬取李白之死的评论
     3 """
     4 import requests
     5 import re
     6 import openpyxl as op
     7 
     8 wb = op.Workbook()
     9 ws = wb.create_sheet(index=0)
    10 # 表头
    11 ws.cell(row=1, column=1, value='评论者id') # 第一行第一列userId
    12 ws.cell(row=1, column=2, value='评论者昵称') # 第一行第一列userId
    13 ws.cell(row=1, column=3, value='获赞数') # 第一行第一列userId
    14 ws.cell(row=1, column=4, value='创建时间') # 第一行第一列userId
    15 ws.cell(row=1, column=5, value='评论内容') # 第一行第一列userId
    16 
    17 headers = {
    18     "cookie": "cookie",
    19     "referer": "https://m.weibo.cn/status/L690FmKXW?jumpfrom=weibocom",
    20     "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
    21 }
    22 page = 1
    23 while page < 100 + 1:
    24     url = 'https://m.weibo.cn/comments/hotflow'
    25     if page == 1:
    26         params = {
    27                     'id': '4714646055423756',
    28                     'mid': '4714646055423756',
    29                     'max_id_type':0,
    30         }
    31     else:
    32         params = {
    33                     'id': '4714646055423756',
    34                     'mid': '4714646055423756',
    35                     'max_id': max_id,
    36                     'max_id_type':max_id_type,
    37         }
    38     response = requests.get(url=url, headers=headers, params=params)
    39     max_id = response.json()['data']['max_id']
    40     max_id_type = response.json()['data']['max_id_type']
    41     results = response.json()['data']['data'] # 获取到评论列表
    42     for item in results:
    43         userId = item['user']['id']
    44         userName = item['user']['screen_name']
    45         likeCount = item['like_count']
    46         timeCreated = item['created_at'] # 创建时间
    47         commentContent = item['text'] # 评论内容
    48         print(userId, userName, likeCount, timeCreated, commentContent, response.url, sep=' | ')
    49         ws.append([userId, userName, likeCount, timeCreated, commentContent])
    50     page += 1
    51 
    52 wb.save('李白之死.xlsx')
    53 wb.close()

    保存方式有很多,前面也写过一篇关于 python 爬虫数据保存方式的文章。不过今天只需要爬取评论内容,保存成文本文件,用来做词频统计。

     1 """
     2     就爬取李白之死的评论
     3 """
     4 import requests
     5 import re
     6 import openpyxl as op
     7 
     8 # wb = op.Workbook()
     9 # ws = wb.create_sheet(index=0)
    10 # 表头
    11 # ws.cell(row=1, column=1, value='评论者id') # 第一行第一列userId
    12 # ws.cell(row=1, column=2, value='评论者昵称') # 第一行第一列userId
    13 # ws.cell(row=1, column=3, value='获赞数') # 第一行第一列userId
    14 # ws.cell(row=1, column=4, value='创建时间') # 第一行第一列userId
    15 # ws.cell(row=1, column=5, value='评论内容') # 第一行第一列userId
    16 
    17 headers = {
    18     "cookie": "cookie",
    19     "referer": "https://m.weibo.cn/status/L690FmKXW?jumpfrom=weibocom",
    20     "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
    21 }
    22 page = 1
    23 while page < 100 + 1:
    24     url = 'https://m.weibo.cn/comments/hotflow'
    25     if page == 1:
    26         params = {
    27                     'id': '4714646055423756',
    28                     'mid': '4714646055423756',
    29                     'max_id_type':0,
    30         }
    31     else:
    32         params = {
    33                     'id': '4714646055423756',
    34                     'mid': '4714646055423756',
    35                     'max_id': max_id,
    36                     'max_id_type':max_id_type,
    37         }
    38     response = requests.get(url=url, headers=headers, params=params)
    39     max_id = response.json()['data']['max_id']
    40     max_id_type = response.json()['data']['max_id_type']
    41     results = response.json()['data']['data'] # 获取到评论列表
    42     for item in results:
    43         """
    44         这一次只爬评论内容
    45         """
    46         commentContent = re.sub(r'<[^>]*>', '', item['text']) # 将评论内容里的特殊字符用正则替换掉
    47         print(commentContent)
    48         with open('libazhisi.txt', mode='a', encoding='utf-8') as f:
    49             f.write(f'{commentContent}\n') # 换行写入
    50     page += 1
    51 
    52 # wb.save('李白之死.xlsx')
    53 # wb.close()

    词频展示:

     1 """
     2     做个词频
     3 """
     4 # 打开文档
     5 import re
     6 from collections import Counter
     7 import jieba
     8 from pyecharts.charts import Bar
     9 import pyecharts.options as opts
    10 from pyecharts.globals import ThemeType
    11 
    12 def replaceSth(sth):
    13     pattern = re.compile(r'[a-zA-Z0-9…,\@”![\\]_]。')
    14     new = re.sub(pattern, '', sth)
    15     return new
    16 
    17 with open('李白之死.txt', mode='r', encoding='utf-8') as f:
    18     reader = f.read()
    19     new_reader = re.sub('[”0-9a-zA-Z!?,\@\-\_/\[\]\\...\\#【】\*\…。\“]', '', reader)
    20     # print(new_reader)
    21 # 打开停用此表
    22 with open('stopwords.txt', mode='r', encoding='utf-8') as f:
    23     stopReader = f.read()
    24 word = jieba.cut(new_reader) # 分词
    25 words = [] # 空list
    26 for item in list(word): # 遍历
    27     if item not in stopReader: # 条件判断
    28         words.append(item) # 追加列表
    29 
    30 # print(dict(Counter(words))) # 打印分词
    31 # 开始画图
    32 label = [] # x轴标签
    33 data = [] # y轴数据
    34 for k, v in dict(Counter(words).most_common(10)).items(): # 取字典里常出现的十个
    35     label.append(k)
    36     data.append(v)
    37 
    38 bar = (
    39     Bar(init_opts=opts.InitOpts(width='1080px', height='960px', theme=ThemeType.LIGHT))
    40     .add_xaxis(label)
    41     .add_yaxis(series_name='', y_axis=data)
    42     .set_global_opts(
    43         title_opts=opts.TitleOpts(
    44             title='李白之死词频',
    45             title_textstyle_opts=opts.TextStyleOpts(font_size=30),),
    46         toolbox_opts=opts.ToolboxOpts(),
    47         tooltip_opts=opts.TooltipOpts(
    48             is_show=True,
    49             trigger='axis',
    50             axis_pointer_type='cross',
    51         ),
    52     )
    53 ).render('词频.html')

     

  • 相关阅读:
    深度学习中的激活函数
    23.从上往下打印二叉树
    22.栈的压入、弹出序列
    使用TensorFlow实现DNN
    shell按行读取文件
    linux集群批量执行命令
    CDH升级
    自动微分方法(auto diff)
    快速了解负载均衡
    拼写纠错的利器,BK树算法
  • 原文地址:https://www.cnblogs.com/mafu/p/15711438.html
Copyright © 2011-2022 走看看