  • Crawling Zhihu user information with Scrapy

    Create the project, then generate the spider skeleton from inside the project directory:

    scrapy startproject zhihuuser
    cd zhihuuser
    scrapy genspider zhihu zhihu.com
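
    These commands produce the usual Scrapy skeleton; the files edited below live here (abridged — the exact layout may vary slightly by Scrapy version):

    zhihuuser/
        scrapy.cfg
        zhihuuser/
            items.py
            pipelines.py
            settings.py
            spiders/
                zhihu.py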

    items.py — declare a Field for every attribute of a user profile we want to store:

    from scrapy import Item, Field
    
    class UserItem(Item):
        # Basic profile information returned by the Zhihu members API
        id = Field()
        name = Field()
        avatar_url = Field()
        headline = Field()
        description = Field()
        url = Field()
        url_token = Field()
        gender = Field()
        cover_url = Field()
        type = Field()
        badge = Field()
    
        # Activity and relationship counters
        answer_count = Field()
        articles_count = Field()
        commercial_question_count = Field()
        favorite_count = Field()
        favorited_count = Field()
        follower_count = Field()
        following_columns_count = Field()
        following_count = Field()
        pins_count = Field()
        question_count = Field()
        thank_from_count = Field()
        thank_to_count = Field()
        thanked_count = Field()
        vote_from_count = Field()
        vote_to_count = Field()
        voteup_count = Field()
        following_favlists_count = Field()
        following_question_count = Field()
        following_topic_count = Field()
        marked_answers_count = Field()
        mutual_followees_count = Field()
        hosted_live_count = Field()
        participated_live_count = Field()
    
        # Structured profile sections (lists of dicts in the API response)
        locations = Field()
        educations = Field()
        employments = Field()
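
    Scrapy Items behave like dicts but reject keys that were not declared, which catches field typos early. A minimal illustration (hypothetical values):

    from zhihuuser.items import UserItem

    item = UserItem()
    item['name'] = 'excited-vczh'   # fine: 'name' is declared above
    item['nickname'] = 'vczh'       # KeyError: UserItem does not support field: nickname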

    zhihu.py (under spiders/) — the spider: start from a seed user, scrape the full profile, then recursively walk the followee and follower lists:

    import json
    
    from scrapy import Spider, Request
    from zhihuuser.items import UserItem
    
    
    class ZhihuSpider(Spider):
        name = "zhihu"
        allowed_domains = ["www.zhihu.com"]
        user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
        follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
        followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
        # Seed account to start the crawl from (a well-known user with many followers)
        start_user = 'excited-vczh'
        user_query = 'locations,employments,gender,educations,business,voteup_count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
        follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
        followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    
        def start_requests(self):
            # Bootstrap the crawl with three requests for the seed user:
            # the profile itself, the followee list, and the follower list.
            yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)
            yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, limit=20, offset=0),
                          self.parse_follows)
            yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, limit=20, offset=0),
                          self.parse_followers)
    
        def parse_user(self, response):
            result = json.loads(response.text)
            item = UserItem()

            # Copy every declared field that appears in the API response.
            for field in item.fields:
                if field in result:
                    item[field] = result.get(field)
            yield item

            # Queue this user's own followees and followers, so the crawl
            # spreads outward from every profile it visits.
            yield Request(
                self.follows_url.format(user=result.get('url_token'), include=self.follows_query, limit=20, offset=0),
                self.parse_follows)

            yield Request(
                self.followers_url.format(user=result.get('url_token'), include=self.followers_query, limit=20, offset=0),
                self.parse_followers)
    
        def parse_follows(self, response):
            results = json.loads(response.text)

            # Each entry in 'data' is a followee; request their full profile.
            if 'data' in results:
                for result in results.get('data'):
                    yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                                  self.parse_user)

            # Keep paginating until the API flags the last page.
            if 'paging' in results and results.get('paging').get('is_end') is False:
                next_page = results.get('paging').get('next')
                yield Request(next_page, self.parse_follows)

        def parse_followers(self, response):
            results = json.loads(response.text)

            # Same pattern as parse_follows, applied to the follower list.
            if 'data' in results:
                for result in results.get('data'):
                    yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                                  self.parse_user)

            if 'paging' in results and results.get('paging').get('is_end') is False:
                next_page = results.get('paging').get('next')
                yield Request(next_page, self.parse_followers)
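
    In principle the crawl could loop forever (mutual follows form cycles), but Scrapy's default duplicate filter drops requests whose fingerprint has already been seen, so each profile URL is fetched once. For reference, the followee/follower endpoints return JSON of roughly this shape — abridged, and inferred from the parsing code above rather than from current API documentation:

    {
        "data": [
            {"url_token": "some-user", "name": "...", "answer_count": 0}
        ],
        "paging": {
            "is_end": false,
            "next": "https://www.zhihu.com/api/v4/members/excited-vczh/followees?offset=20&limit=20"
        }
    }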

    pipelines.py — persist each user to MongoDB, upserting on url_token so repeated crawls update existing documents instead of duplicating them:

    import pymongo
    
    
    class ZhihuPipeline(object):
        # Stock no-op pipeline generated by Scrapy; not enabled in settings.py.
        def process_item(self, item, spider):
            return item
    
    
    class MongoPipeline(object):
        collection_name = 'users'
    
        def __init__(self, mongo_uri, mongo_db):
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db
    
        @classmethod
        def from_crawler(cls, crawler):
            return cls(
                mongo_uri=crawler.settings.get('MONGO_URI'),
                mongo_db=crawler.settings.get('MONGO_DATABASE')
            )
    
        def open_spider(self, spider):
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]
    
        def close_spider(self, spider):
            self.client.close()
    
        def process_item(self, item, spider):
            # Upsert keyed on url_token. Collection.update() was removed in
            # PyMongo 4; replace_one(..., upsert=True) is the modern equivalent
            # of the original update(spec, doc, True) call.
            self.db[self.collection_name].replace_one(
                {'url_token': item['url_token']}, dict(item), upsert=True)
            return item
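
    To spot-check that documents are landing, here is a minimal sketch using the same connection values as settings.py below (assumes a local MongoDB and PyMongo 3.7+ for count_documents):

    import pymongo

    client = pymongo.MongoClient('localhost')
    users = client['zhihu']['users']
    print(users.count_documents({}))                             # profiles stored so far
    print(users.find_one({}, {'name': 1, 'follower_count': 1}))  # sample document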

    settings.py — turn off robots.txt compliance, send browser-like headers with the API authorization token, and register the Mongo pipeline:
    BOT_NAME = 'zhihuuser'
    
    SPIDER_MODULES = ['zhihuuser.spiders']
    NEWSPIDER_MODULE = 'zhihuuser.spiders'
    
    # Do not filter requests through robots.txt, or the API endpoints may be skipped.
    ROBOTSTXT_OBEY = False
    
    DEFAULT_REQUEST_HEADERS = {
        # Browser-like User-Agent plus the OAuth token Zhihu's web client sent at
        # the time of writing; the token has likely expired, and the API may now
        # require login cookies instead.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
    }
    
    ITEM_PIPELINES = {
        'zhihuuser.pipelines.MongoPipeline': 300,
        # 'scrapy_redis.pipelines.RedisPipeline': 301
    }
    
    MONGO_URI = 'localhost'
    MONGO_DATABASE = 'zhihu'
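
    With all four files in place, run the spider from the project root; the optional -o flag additionally exports items to a JSON-lines file alongside the Mongo writes:

    scrapy crawl zhihu -o users.jl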
  • Original article: https://www.cnblogs.com/wanglinjie/p/9217598.html