zoukankan      html  css  js  c++  java
  • 练习3-微博爬取

    微博分页采用 since_id 游标：下一页的 since_id 包含在上一页的 response（data.cardlistInfo.since_id）中。

    from urllib.parse import urlencode
    import requests
    from pyquery import PyQuery as PQ
    # Base endpoint of the Weibo mobile-site JSON API; query string is appended per request.
    base_url='https://m.weibo.cn/api/container/getIndex?'
    
    # Request headers that make the call look like the mobile site's own AJAX request;
    # X-Requested-With marks it as XHR so the server returns JSON instead of HTML.
    headers = {
        'Host':'m.weibo.cn',
        'Referer':'https://m.weibo.cn/u/2830678474',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',}
    
    def get_page(since_id):
        """Fetch one page of the user's Weibo timeline as parsed JSON.

        since_id: pagination cursor; pass '' for the first page. The cursor for
        the next page is found inside the returned payload
        (data.cardlistInfo.since_id).

        Returns the decoded JSON dict on HTTP 200, or None on any request
        failure or non-200 status.
        """
        params = {
            'type': 'uid',
            'value': '2830678474',
            'containerid': '1076032830678474',
            'since_id': since_id,
        }
        url = base_url + urlencode(params)
        try:
            # timeout so a stalled connection cannot hang the crawler forever
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.json()
        except (requests.RequestException, ValueError) as e:
            # RequestException: network/HTTP-layer errors; ValueError: bad JSON body
            print('Error:', e)
        return None  # explicit: failed request or unexpected status
    
    def parse_page(json):
        """Yield one dict per Weibo post found in a page payload from get_page.

        Each yielded dict has keys: id, text (HTML stripped via pyquery),
        attitudes (likes), comments, reposts.

        Tolerates a None/empty payload, a missing 'data'/'cards' section, and
        cards that carry no 'mblog' entry (the card list can contain non-post
        cards, which would otherwise crash on .get of None).
        """
        if not json:
            return
        cards = (json.get('data') or {}).get('cards') or []
        for card in cards:
            mblog = card.get('mblog')
            if not mblog:
                # not a post card — skip instead of raising AttributeError
                continue
            yield {
                'id': mblog.get('id'),
                'text': PQ(mblog.get('text')).text(),
                'attitudes': mblog.get('attitudes_count'),
                'comments': mblog.get('comments_count'),
                'reposts': mblog.get('reposts_count'),
            }
    
    if __name__ == '__main__':
        # Crawl up to 10 pages, threading the since_id cursor from each
        # response into the next request ('' selects the first page).
        since_id = ''
        for _ in range(10):
            results = get_page(since_id)
            if not results:
                # request failed — stop instead of crashing on results.get(...)
                break
            for weibo in parse_page(results):
                print(weibo)
            since_id = (results.get('data') or {}).get('cardlistInfo', {}).get('since_id')
            if not since_id:
                # no cursor in the payload: end of the timeline reached
                break
    
  • 相关阅读:
    31
    30
    29
    28
    27
    26
    25
    23
    cesium 基础
    操作SDO_GEOMETRY字段
  • 原文地址:https://www.cnblogs.com/tingshu/p/14770945.html
Copyright © 2011-2022 走看看