zoukankan      html  css  js  c++  java
  • 爬取微博

    1. 

     1 import requests
     2 from urllib.parse import urlencode
     3 from pyquery import PyQuery as pq
     4 from pymongo import MongoClient
     5 
     6 base_url = 'https://m.weibo.cn/api/container/getIndex?'
     7 headers = {
     8     'Host': 'm.weibo.cn',
     9     'Referer': 'https://m.weibo.cn/u/2830678474',
    10     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    11     'X-Requested-With': 'XMLHttpRequest',
    12 }
    13 client = MongoClient()
    14 db = client['weibo']
    15 collection = db['weibo']
    16 max_page = 10
    17 
    18 
    def get_page(page):
        """Fetch one page of the timeline from the m.weibo.cn container API.

        Args:
            page: Page number, sent as the ``page`` query parameter.

        Returns:
            A ``(payload, page)`` tuple, where ``payload`` is the decoded JSON
            body, when the server answers with HTTP 200. ``None`` when the
            request fails, the status is not 200, or the body is not valid
            JSON; callers must check for ``None`` before unpacking.
        """
        params = {
            'type': 'uid',
            'value': '2830678474',
            # The literal is '107603' followed by the uid used in 'value'.
            'containerid': '1076032830678474',
            'page': page,
        }
        url = base_url + urlencode(params)
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            # RequestException is the base class of ConnectionError, Timeout
            # and the other errors requests raises for a failed request.
            print('Error', e.args)
            return None
        if response.status_code != 200:
            print('Error', 'unexpected status code', response.status_code)
            return None
        try:
            return response.json(), page
        except ValueError as e:
            # response.json() raises a ValueError subclass on a non-JSON body.
            print('Error', e.args)
            return None
    33 
    # Walk the cards of one API response and collect, for every post, its text
    # and its like, comment and repost counts.
    def parse_page(json, page: int):
        """Yield one flat dict per post found in an API response.

        Args:
            json: Decoded response body (a dict) or a falsy value. Posts are
                read from ``json['data']['cards'][i]['mblog']``.
            page: Page number the response belongs to. Kept for interface
                compatibility; cards are now filtered by content, not by
                their position on a given page.

        Yields:
            Dicts with the keys ``id``, ``text``, ``attitudes``, ``comments``
            and ``reposts``. Cards without an ``mblog`` entry are skipped.
        """
        if not json:
            return
        data = json.get('data') or {}
        for card in data.get('cards') or []:
            mblog = card.get('mblog')
            if not mblog:
                # Not every card carries a post; skip the ones that do not.
                continue
            raw_text = mblog.get('text')
            yield {
                'id': mblog.get('id'),
                # The text field is HTML; pyquery reduces it to plain text.
                'text': pq(raw_text).text() if raw_text else '',
                'attitudes': mblog.get('attitudes_count'),
                'comments': mblog.get('comments_count'),
                'reposts': mblog.get('reposts_count'),
            }


    def save_to_mongo(result):
        """Insert one parsed post into the module-level Mongo collection.

        Args:
            result: Dict to store, as yielded by ``parse_page``.
        """
        # Collection.insert() was deprecated in PyMongo 3 and removed in 4;
        # insert_one() is the single-document replacement.
        if collection.insert_one(result).acknowledged:
            print('Saved to Mongo')


    # Crawl pages 1..max_page, printing and storing every post found.
    if __name__ == '__main__':
        for page in range(1, max_page + 1):
            fetched = get_page(page)
            if fetched is None:
                # get_page returns None on failure; move on to the next page.
                continue
            for result in parse_page(*fetched):
                print(result)
                save_to_mongo(result)
  • 相关阅读:
    1.初识Redis
    2.API的理解和使用
    8.rabbitmq RPC模拟微服务架构中的服务调用
    9.[完]其他常用的rabbitmq的参数和设置
    6.Header交换机之模拟验证用户身份
    C#中复制文件夹及文件的两种方法
    python通过递归将多维字典转化为二维
    python venv flask gunicorn 部署与 pycharm 连接
    Nagios(centos 6.5)调用NSClient++/NRPE+Powershell脚本(windows server 2008 r2)监控网络情况
    Nagios 调用华为云短信平台进行报警
  • 原文地址:https://www.cnblogs.com/chengchengaqin/p/9788624.html
Copyright © 2011-2022 走看看