  • Weibo Crawler

    Crawl through Weibo's mobile web interface (m.weibo.cn): its JSON API pages with a since_id parameter, which sidesteps the login wall the desktop site raises as soon as you page past the first screen.
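    The uid (1211441627) and containerid (1076031211441627) used below come from this same API. A minimal sketch of looking the containerid up, assuming the data.tabsInfo.tabs response layout the mobile endpoint returned around the time of the original post (the field names are an assumption and may change):

    import requests

    def get_containerid(uid):
        # Query the profile index for this uid; one of the returned tabs
        # (tab_type == 'weibo') carries the containerid of the post timeline.
        url = 'https://m.weibo.cn/api/container/getIndex'
        resp = requests.get(url, params={'type': 'uid', 'value': uid})
        for tab in resp.json()['data']['tabsInfo']['tabs']:
            if tab.get('tab_type') == 'weibo':
                return tab['containerid']

    print(get_containerid('1211441627'))  # expected: '1076031211441627'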

    1. Wrap the database insert logic

    """
    CREATE TABLE weibo_data(
        id int primary key auto_increment,
        create_at varchar(30),
        reposts_count int,
        comments_count int,
        attitudes_count int,
        `text` text) default charset=utf8mb4;
    """
    import pymysql
    
    class WeiboMysql(object):
        # Connect to the database as soon as the object is created
        def __init__(self):
            # Hard-coded local credentials from the original post; adjust to your own setup
            self.conn = pymysql.connect(host='127.0.0.1', user='root', passwd='510520', db='pachong', charset='utf8mb4')
            self.cursor = self.conn.cursor()
    
        def execute_insert_sql(self, sql, data):
            # Parameterized execute plus an immediate commit per row
            self.cursor.execute(sql, data)
            self.conn.commit()
    
        # Release the cursor and connection when the object is garbage-collected
        def __del__(self):
            self.cursor.close()
            self.conn.close()
    
    if __name__ == '__main__':
        # Smoke test: insert one sample row
        weibo = WeiboMysql()
        insert_sql = "INSERT INTO weibo_data(create_at, reposts_count, comments_count, attitudes_count, `text`) VALUES(%s, %s, %s, %s, %s)"
        data = ('12-18', 123, 123, 123, '画画baby')
        weibo.execute_insert_sql(insert_sql, data)
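    If rows are collected in batches, pymysql's executemany can write them in one call instead of one commit per row. A small sketch reusing the class and the insert_sql statement above (the sample rows are made up):

    weibo = WeiboMysql()
    rows = [
        ('12-18', 10, 5, 20, 'first sample post'),
        ('12-19', 3, 1, 7, 'second sample post'),
    ]
    # Runs the same parameterized INSERT once per tuple, then a single commit
    weibo.cursor.executemany(insert_sql, rows)
    weibo.conn.commit()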

    2. Fetch the data and store it in the database

    import requests
    import json
    # remove_tags strips HTML markup out of the post text
    from w3lib.html import remove_tags
    # the WeiboMysql helper class from step 1
    from weibo.weibo_mysql import WeiboMysql
    
    weibo = WeiboMysql()
    
    # Mobile-browser headers; the cookie is tied to a session, so copy a fresh one from your own browser's developer tools
    headers = {
        'cookie': '_T_WM=99370732608; XSRF-TOKEN=6b3c2d; WEIBOCN_FROM=1110005030; MLOGIN=0; M_WEIBOCN_PARAMS=oid%3D4561830166403683%26lfid%3D102803%26luicode%3D20000174%26fid%3D1005051211441627%26uicode%3D10000011',
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Mobile Safari/537.36',
        'referer': 'https://m.weibo.cn/u/1211441627',
    }
    
    def get_page_data(url):
        response = requests.get(url, headers=headers)
        # Parse the JSON body into a dict
        res_dict = json.loads(response.text)
    
        cards_list = res_dict['data']['cards']
        insert_sql = "INSERT INTO weibo_data(create_at, reposts_count, comments_count, attitudes_count, `text`) VALUES(%s, %s, %s, %s, %s)"
        for card in cards_list:
            if 'mblog' in card:
                text = remove_tags(card['mblog']['text'])
                created_at = card['mblog']['created_at']
                reposts_count = card['mblog']['reposts_count']
                comments_count = card['mblog']['comments_count']
                attitudes_count = card['mblog']['attitudes_count']
                data = (created_at, reposts_count, comments_count, attitudes_count, text)
                print(data)
                # Write the row to the database
                weibo.execute_insert_sql(insert_sql, data)
    
        # Return this page's since_id; the next request passes it back as its own since_id parameter
        since_id = res_dict['data']['cardlistInfo']['since_id']
        return since_id
    
    def main():
        url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=1211441627&containerid=1076031211441627'
        since_id = get_page_data(url)
        # Keep paging until the response stops providing a since_id
        while True:
            try:
                next_url = url + '&since_id=' + str(since_id)
                print(next_url)
                since_id = get_page_data(next_url)
            except Exception:
                # Last page reached (no since_id) or the request failed
                break
    
    if __name__ == '__main__':
        main()
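    The loop above fires requests as fast as it can, and m.weibo.cn throttles aggressive clients. A sketch of a politer main with a delay and a page cap (MAX_PAGES and the 1-second sleep are arbitrary choices, not values from the original post):

    import time

    MAX_PAGES = 50  # arbitrary safety cap

    def main():
        url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=1211441627&containerid=1076031211441627'
        since_id = get_page_data(url)
        for _ in range(MAX_PAGES):
            try:
                next_url = url + '&since_id=' + str(since_id)
                print(next_url)
                since_id = get_page_data(next_url)
            except Exception:
                break  # last page reached or request failed
            time.sleep(1)  # pause between requests to avoid throttling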