zoukankan      html  css  js  c++  java
  • requests爬取知乎话题和子话题

    zhihu.py

    # -*- coding: utf-8 -*-
    import pymysql
    import requests
    from lxml import etree
    
    from requests_test.child_topic import GetChildTopic
    from requests_test.parent_topic import GetParentTopic
    
    if __name__ == "__main__":
        # Store the top-level topics first; the returned mapping is keyed by
        # topic id, so iterating it yields the ids to crawl next.
        parent_topics = GetParentTopic().get_parent_data()

        child_scraper = GetChildTopic()
        for parent_id in parent_topics:
            print("parent_id:", parent_id)
            # Request up to 50 pages of child topics per parent topic.
            child_scraper.get_child_data(parent_id, 50)
    

      

    parent_topic.py
    # -*- coding: utf-8 -*-
    import pymysql
    from lxml import etree
    
    import requests
    
    class GetParentTopic(object):
        """Fetch Zhihu's top-level topic categories and store them in MySQL.

        The connection is opened in ``__init__`` and closed at the end of
        ``get_parent_data``, so an instance is good for a single scrape.
        """

        TOPICS_URL = 'https://www.zhihu.com/topics'
        HEADERS = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
        # Seconds to wait for the server before giving up instead of hanging.
        REQUEST_TIMEOUT = 10
        # Placeholders let the driver escape scraped text; formatting values
        # into the statement breaks on quotes and allows SQL injection.
        INSERT_SQL = (
            "insert ignore into topic(`title`,`topic_id`,`create_time`) "
            "values(%s,%s,%s)"
        )

        def __init__(self, host='192.168.33.10', user='root', passwd='root',
                     db='spider', charset='utf8mb4'):
            """Open the MySQL connection and cursor used by the scrape.

            All arguments are passed to ``pymysql.connect``. ``charset``
            defaults to ``utf8mb4`` to match the ``topic`` table's character
            set, so titles containing 4-byte characters are stored intact.
            """
            self.conn = pymysql.connect(host=host, user=user, passwd=passwd,
                                        db=db, charset=charset)
            self.cur = self.conn.cursor()

        def get_parent_data(self):
            """Scrape the topic categories, insert them, and return them.

            Returns:
                dict mapping topic id (str) to topic title (str). List items
                without a link text or a ``data-id`` attribute are skipped.

            Raises:
                requests.RequestException: on network failure, timeout, or a
                    non-success HTTP status.
                pymysql.MySQLError: if the insert or commit fails.

            The cursor and connection are closed before returning, including
            when an exception is raised.
            """
            # Local import: this module has no file-level ``time`` import.
            import time

            parent_topic = {}
            try:
                response = requests.get(self.TOPICS_URL, headers=self.HEADERS,
                                        timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()

                html = etree.HTML(response.text)
                items = []
                if html is not None:
                    items = html.xpath("//ul[@class='zm-topic-cat-main clearfix']/li")

                # Timestamp formatted like 2016-03-20 11:45:39.
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

                rows = []
                for li in items:
                    titles = li.xpath('./a/text()')
                    topic_ids = li.xpath('./@data-id')
                    if not titles or not topic_ids:
                        continue
                    # Convert lxml's string subclass to plain str for the driver.
                    title = str(titles[0])
                    topic_id = str(topic_ids[0])
                    parent_topic[topic_id] = title
                    rows.append((title, topic_id, now))

                if rows:
                    self.cur.executemany(self.INSERT_SQL, rows)
                    self.conn.commit()
            finally:
                self.cur.close()
                self.conn.close()

            return parent_topic
    

      

    child_topic.py
    # -*- coding: utf-8 -*-
    import json
    import urllib
    from time import sleep
    
    import pymysql
    from lxml import etree
    import requests
    
    class GetChildTopic(object):
        """Scrape the child topics of a Zhihu parent topic and store them in MySQL.

        Child topics are fetched page by page from the topic-plaza endpoint,
        parsed out of the returned HTML fragments, and inserted into the
        ``topic`` table with ``level`` 1.
        """

        LIST_URL = "https://www.zhihu.com/node/TopicsPlazzaListV2"
        HEADERS = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
        }
        # The paging offset advances by this many topics per page.
        PAGE_SIZE = 20
        # Seconds to wait for the server before giving up instead of hanging.
        REQUEST_TIMEOUT = 10
        # Cap applied to each stored text field.
        MAX_FIELD_LENGTH = 200
        # Characters removed by sql_filter: " \ / * ' = - # ; < > + % $ ( ) @ !
        _DIRTY_CHARS = '"' + "\\/*'=-#;<>+%$()@!"
        _DIRTY_TABLE = str.maketrans('', '', _DIRTY_CHARS)
        # Placeholders let the driver escape scraped text; formatting values
        # into the statement breaks on quotes and allows SQL injection.
        INSERT_SQL = (
            "insert ignore into topic(`title`,`topic_id`,`img_url`,`parent_id`,"
            "`desc`,`topic_url`,`level`,`create_time`) "
            "values(%s,%s,%s,%s,%s,%s,%s,%s)"
        )

        def __init__(self, host='192.168.33.10', user='root', passwd='root',
                     db='spider', charset='utf8mb4'):
            """Open the MySQL connection and cursor used for inserts.

            All arguments are passed to ``pymysql.connect``. ``charset``
            defaults to ``utf8mb4`` to match the ``topic`` table's character
            set, so text containing 4-byte characters is stored intact.
            """
            self.conn = pymysql.connect(host=host, user=user, passwd=passwd,
                                        db=db, charset=charset)
            self.cur = self.conn.cursor()

        def sql_filter(self, sql, max_length=20):
            """Strip SQL-significant characters from ``sql`` and truncate it.

            Kept for callers that still sanitise strings by hand. The insert
            path in this class uses a parameterized query instead, because
            stripping ``/``, ``-``, ``=`` and ``%`` corrupts URLs.

            Args:
                sql: the string to clean.
                max_length: maximum length of the returned string.

            Returns:
                ``sql`` without the filtered characters, cut to ``max_length``.
            """
            return sql.translate(self._DIRTY_TABLE)[:max_length]

        def get_child_data(self, parent_id, total_pages):
            """Fetch and store up to ``total_pages`` pages of child topics.

            Args:
                parent_id: id of the parent topic, as an int or numeric string.
                total_pages: maximum number of pages to request.

            Paging stops early when the endpoint returns an empty page.

            Raises:
                ValueError: if ``parent_id`` is not a valid integer.
                requests.RequestException: on network failure, timeout, or a
                    non-success HTTP status.
                pymysql.MySQLError: if an insert or commit fails.
            """
            # Normalise once so the request payload and the stored parent_id
            # are always integers, whatever type the caller passed.
            parent_id = int(parent_id)

            for page in range(1, total_pages + 1):
                print("now_parent_id", parent_id, "now_page:", page)
                offset = (page - 1) * self.PAGE_SIZE
                data = {
                    'method': 'next',
                    "params": json.dumps({"topic_id": parent_id, "offset": offset, "hash_id": ""}),
                }
                response = requests.post(self.LIST_URL, data=data, headers=self.HEADERS,
                                         timeout=self.REQUEST_TIMEOUT)
                print(self.LIST_URL, response)
                print(data)
                response.raise_for_status()

                payload = response.json()
                res = payload.get('msg') if isinstance(payload, dict) else None
                if not res:
                    # An empty page means there are no further child topics.
                    break

                output = []
                for item in res:
                    row = self._parse_item(item, parent_id)
                    if row is not None:
                        output.append(row)
                print(output)
                self.save_child_topic(output)

        def _parse_item(self, item, parent_id):
            """Turn one HTML fragment into a row dict, or None if it is unusable."""
            # Local import: the file only does ``import urllib``, which does
            # not guarantee that the ``urllib.parse`` submodule is loaded.
            from urllib.parse import urljoin

            html = etree.HTML(item)
            if html is None:
                return None

            titles = html.xpath('//img/@alt')
            img_urls = html.xpath('//img/@src')
            hrefs = html.xpath('//a[1]/@href')
            if not (titles and img_urls and hrefs):
                return None

            # A description is used only when exactly one <p> text node exists.
            descs = html.xpath('//p/text()')
            desc = str(descs[0]) if len(descs) == 1 else ''

            href = str(hrefs[0])
            limit = self.MAX_FIELD_LENGTH
            return {
                'title': str(titles[0])[:limit],
                'img_url': str(img_urls[0])[:limit],
                'topic_url': urljoin(self.LIST_URL, href)[:limit],
                'desc': desc[:limit],
                # The topic id is the last path segment of the topic link.
                'topic_id': href.rstrip('/').split('/')[-1],
                'parent_id': parent_id,
            }

        def save_child_topic(self, data):
            """Insert the given child-topic rows and commit once.

            Args:
                data: iterable of dicts with the keys ``title``, ``topic_id``,
                    ``img_url``, ``parent_id``, ``desc`` and ``topic_url``.

            Rows are written with ``insert ignore`` and ``level`` 1. Nothing
            is executed when ``data`` is empty.
            """
            # Local import: this module has no file-level ``time`` import.
            import time

            # Timestamp formatted like 2016-03-20 11:45:39.
            now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            rows = [
                (item['title'], item['topic_id'], item['img_url'], item['parent_id'],
                 item['desc'], item['topic_url'], 1, now)
                for item in data
            ]
            if not rows:
                return
            self.cur.executemany(self.INSERT_SQL, rows)
            self.conn.commit()

        def close(self):
            """Close the cursor and connection; safe to call more than once."""
            for attr in ('cur', 'conn'):
                resource = getattr(self, attr, None)
                if resource is None:
                    continue
                try:
                    resource.close()
                except Exception:
                    # Cleanup is best effort: the handle may already be closed,
                    # or the interpreter may be shutting down.
                    pass
                setattr(self, attr, None)

        def __del__(self):
            # Goes through close() so a failed __init__, which leaves no
            # cur/conn attributes, does not raise during garbage collection.
            self.close()
    

      sql

    -- Table written by both scrapers: parent rows set only title, topic_id
    -- and create_time; child rows set every column and use level 1.
    CREATE TABLE `topic` (
      `id` int(11) NOT NULL AUTO_INCREMENT,
      `title` varchar(255) NOT NULL DEFAULT '' COMMENT '标题',
      `topic_id` int(11) NOT NULL,
      `img_url` varchar(255) NOT NULL DEFAULT '' COMMENT '子标题图片',
      -- Parent-topic inserts omit parent_id, so those rows keep the default 0.
      `parent_id` int(11) NOT NULL DEFAULT '0',
      `desc` text,
      -- Holds the formatted "%Y-%m-%d %H:%M:%S" string produced by the scrapers.
      `create_time` varchar(255) NOT NULL DEFAULT '',
      `topic_url` varchar(255) DEFAULT '' COMMENT '子标题超链接',
      -- Parent-topic inserts omit level, so those rows keep the default 0.
      `level` tinyint(4) NOT NULL DEFAULT '0' COMMENT '0父级 ',
      PRIMARY KEY (`id`),
      -- This unique pair is what makes the scrapers' `insert ignore` skip
      -- a topic already stored under the same parent.
      UNIQUE KEY `uni_top_par` (`topic_id`,`parent_id`),
      KEY `index_parent_id` (`parent_id`),
      -- NOTE(review): looks redundant with the leftmost column of
      -- `uni_top_par`; confirm before dropping.
      KEY `index_topic_id` (`topic_id`)
    -- NOTE(review): AUTO_INCREMENT=8379 presumably comes from a dump of a
    -- populated table; confirm whether a fresh install should start there.
    ) ENGINE=InnoDB AUTO_INCREMENT=8379 DEFAULT CHARSET=utf8mb4;
    

      

  • 相关阅读:
    SQL 数据库 复制 与订阅 实现数据同步
    SQL 2008配置管理工具服务显示 远程过程调用失败0x800706be
    SQL2005中使用identity_insert向自动增量字段中写入内
    【树莓派】【转载】基于树莓派,制作家庭媒体中心+下载机
    Linux 按时间批量删除文件(删除N天前文件)
    【树莓派】为树莓派配置或扩展swap分区
    开源硬件相关平台
    【树莓派】树莓派上刷android系统
    【树莓派】树莓派上面安装配置teamviewer
    【树莓派】使用xdrp远程登录树莓派的图形界面
  • 原文地址:https://www.cnblogs.com/brady-wang/p/9714973.html
Copyright © 2011-2022 走看看