zoukankan      html  css  js  c++  java
  • 微博爬虫

    使用网页手机模式进行爬取,这样就可以避开翻页需要登录的问题

    1.将存入数据库进行封装

    """
    CREATE TABLE weibo_data(
        id int primary key auto_increment,
        create_at varchar(30),
        reposts_count int,
        comments_count int,
        attitudes_count int,
        `text` text) default charset=utf8mb4;
    """
    import pymysql
    
    class WeiboMysql(object):
        """Thin wrapper around a pymysql connection for inserting weibo rows."""

        def __init__(self):
            # Connect as part of construction so callers only need one object.
            # NOTE(review): credentials are hard-coded; move to config/env vars
            # for anything beyond a throwaway script.
            self.conn = pymysql.connect(host='127.0.0.1', user='root', passwd='510520', db='pachong', charset='utf8mb4')
            self.cursor = self.conn.cursor()

        def execute_insert_sql(self, sql, data):
            """Execute a parameterized statement and commit.

            On failure the transaction is rolled back and the exception is
            re-raised, so one bad row cannot leave the connection in a
            half-committed state.
            """
            try:
                self.cursor.execute(sql, data)
                self.conn.commit()
            except Exception:
                self.conn.rollback()
                raise

        def __del__(self):
            # Guard with getattr: if __init__ raised before setting the
            # attributes (e.g. connection refused), __del__ must not raise
            # a secondary AttributeError.
            cursor = getattr(self, 'cursor', None)
            if cursor is not None:
                cursor.close()
            conn = getattr(self, 'conn', None)
            if conn is not None:
                conn.close()
    
    if __name__ == '__main__':
        # Smoke test: insert one sample row.
        weibo = WeiboMysql()
        insert_sql = "INSERT INTO weibo_data(create_at, reposts_count, comments_count, attitudes_count, `text`) VALUES(%s, %s, %s, %s, %s)"
        # Use int literals for the three count columns instead of '123' strings,
        # so the insert does not rely on MySQL's implicit string-to-int coercion.
        data = ('12-18', 123, 123, 123, '画画baby')
        weibo.execute_insert_sql(insert_sql, data)

    2.获取数据并存入数据库

    import requests
    import json
    # 导入remove_tags除去标签
    from w3lib.html import remove_tags
    # 导入自定义的WeiboMysql类
    from weibo.weibo_mysql import WeiboMysql
    
    # Shared DB handle used by get_page_data() for every insert.
    weibo = WeiboMysql()
    
    # Impersonate a mobile browser: the mobile site (m.weibo.cn) allows paging
    # without login. Cookie/referer were copied from a browser session.
    headers = {
        'cookie': '_T_WM=99370732608; XSRF-TOKEN=6b3c2d; WEIBOCN_FROM=1110005030; MLOGIN=0; M_WEIBOCN_PARAMS=oid%3D4561830166403683%26lfid%3D102803%26luicode%3D20000174%26fid%3D1005051211441627%26uicode%3D10000011',
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Mobile Safari/537.36',
        'referer': 'https://m.weibo.cn/u/1211441627',
    }
    
    def get_page_data(url):
        """Fetch one page of the feed, store every post, and return the
        since_id that the next page's URL must carry."""
        resp = requests.get(url, headers=headers)
        # The mobile API answers with a JSON document.
        payload = json.loads(resp.text)

        insert_sql = "INSERT INTO weibo_data(create_at, reposts_count, comments_count, attitudes_count, `text`) VALUES(%s, %s, %s, %s, %s)"
        for card in payload['data']['cards']:
            # Cards without an 'mblog' entry are not posts — skip them.
            if 'mblog' not in card:
                continue
            mblog = card['mblog']
            # Strip HTML markup from the post body.
            plain_text = remove_tags(mblog['text'])
            record = (
                mblog['created_at'],
                mblog['reposts_count'],
                mblog['comments_count'],
                mblog['attitudes_count'],
                plain_text,
            )
            print(record)
            # Persist the post.
            weibo.execute_insert_sql(insert_sql, record)

        # This page's since_id is the pagination cursor for the next request.
        return payload['data']['cardlistInfo']['since_id']
    
    def main():
        """Crawl the user's feed page by page until pagination runs out."""
        url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=1211441627&containerid=1076031211441627'
        since_id = get_page_data(url)
        # Follow the since_id cursor until the API stops providing one.
        while True:
            try:
                next_url = url + '&since_id=' + str(since_id)
                print(next_url)
                since_id = get_page_data(next_url)
            # A KeyError on the missing since_id marks the last page; network
            # or parsing failures also end the crawl. `except Exception` (not a
            # bare except) keeps Ctrl-C / SystemExit working, and reporting the
            # reason avoids silently masking real bugs.
            except Exception as exc:
                print('crawl stopped:', exc)
                break
    
    if __name__ == '__main__':
        main()
  • 相关阅读:
    python3.4+pyspider爬58同城(二)
    pyspider安装后,点击run,报pyhton has stop working或python已停止运行的错误
    PermissionError: [WinError 32] 另一个程序正在使用此文件,进程无法访问。: 'C:\Users\video\AppData\Local\Temp\tmpfipzk8ma'--问题解决
    使用firefoxprofile,selenium设置firefox,初始化firefox
    排序算法讲解
    Java寫聊天小程序
    csproj项目工程文件的脚本/动态链接库设置
    常见的内存加密防破解及安全方案
    Animator直接引用FBX下的AnimClip与直接引用单独的AnimClip的对比
    Jupyter多内核的手动配置(Python多版本)
  • 原文地址:https://www.cnblogs.com/glz666/p/13859303.html
Copyright © 2011-2022 走看看