  • Scraping Zhaopin (智联招聘) job listings with Scrapy

    Setup

      1. scrapy startproject Jobs                         # create the project
      2. cd Jobs
      3. scrapy genspider ZhaopinSpider www.zhaopin.com   # generate the spider skeleton
      4. pip install tinydb                               # city-code cache used by the spider
      5. pip install furl                                 # URL building used by the spider
      6. pip install diskcache                            # installed here, but not used by the spider below
      7. scrapy crawl ZhaopinSpider                       # run the spider
      8. scrapy crawl ZhaopinSpider -o chongqing.json     # run it again, exporting items to JSON
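
    The spider builds each request URL by copying a furl template and merging in
    query parameters. A minimal sketch of that pattern (the cityId value 530 is a
    placeholder, not a verified Zhaopin city code):

    from furl import furl

    # template URL carrying the fixed query parameters
    F = furl('https://fe-api.zhaopin.com/c/i/sou?pageSize=90&kt=3')

    # copy() leaves the template untouched; add() merges extra query args
    url = F.copy().add({'cityId': 530, 'kw': 'python', 'start': 0}).url
    # url is now something like:
    # https://fe-api.zhaopin.com/c/i/sou?pageSize=90&kt=3&cityId=530&kw=python&start=0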

    ZhaopinSpider
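
    The full spider is listed below. It caches Zhaopin's city table in TinyDB, a
    small document database stored as a JSON file, and later looks cities up by
    name with a regex query. A minimal sketch of that caching pattern (the code
    value is a placeholder):

    from tinydb import TinyDB, Query

    db = TinyDB('ZhaopinSpider-cache.json')    # same file the spider uses
    db.insert({'name': '重庆', 'code': '530'})  # cache one city record

    Q = Query()
    # Query.search() regex-matches the field's value;
    # db.get() returns the first matching document, or None
    city = db.get(Q.name.search('重庆'))
    print(city['code'] if city else 'not cached')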

     

    # -*- coding: utf-8 -*-
    import json
    
    from tinydb import TinyDB, Query
    from furl import furl
    import scrapy
    
    
    class ZhaopinspiderSpider(scrapy.Spider):
        name = 'ZhaopinSpider'
        allowed_domains = ['www.zhaopin.com', 'sou.zhaopin.com', 'fe-api.zhaopin.com']
        start_urls = ['https://www.zhaopin.com/citymap']
        cache_db = TinyDB('ZhaopinSpider-cache.json')  # cache database for city codes
        allowed_cities = ['重庆']  # cities to crawl; add more, e.g. '成都', '上海', '深圳', '昆明', '杭州', '贵阳', '宁波'
        F = furl('https://fe-api.zhaopin.com/c/i/sou?pageSize=90&kt=3')  # URL template for the search API
        PAGE_SIZE = 90  # page size (matches pageSize in the URL)
    
        def get_city_code(self, city_name):
            '''Look up a city's code by name.'''
            Q = Query()
            # Query.search() regex-matches the cached city's name field
            city = self.cache_db.get(Q.name.search(city_name))
            if isinstance(city, dict):
                return city['code']
            # cache miss: log a warning instead of printing debug noise
            self.logger.warning('city %r not found in cache (got %s)', city_name, type(city))
    
        def init_city_info(self, response):
            '''Initialize the city cache.'''
            # grab the inline <script> that defines __INITIAL_STATE__
            script_text = response.xpath('//script[text()[contains(., "__INITIAL_STATE__")]]/text()').extract_first()
            # strip leading/trailing whitespace
            script_text = script_text.strip()
            # keep only the JSON payload after the '=' sign
            script_json = script_text[script_text.index('=') + 1:]
            # parse the JSON string into a dict
            script_dict = json.loads(script_json)
            '''
            # dump the parsed JSON to a file for debugging
            with open('text.json', 'wt', encoding='utf-8') as f:
                json.dump(script_dict, f, indent=4, ensure_ascii=False)
            '''
            '''
            # alternative: look up a single city in memory
            city_list = []  # holds the city list
            # flatten the per-letter city map into one list for searching
            for ch in script_dict['cityList']['cityMapList']:
                city_list.extend(script_dict['cityList']['cityMapList'][ch])
            # filter out 重庆 and read its city code
            city_code = (list(filter(lambda city: city['name'] == '重庆', city_list)) or [{'code': None}])[0]['code']
            '''
            # cache every city so later lookups need no refetch
            for ch in script_dict['cityList']['cityMapList']:
                for city in script_dict['cityList']['cityMapList'][ch]:
                    self.cache_db.insert(city)
    
        def parse(self, response):
            # build the city cache on the first run (when TinyDB is empty)
            if not self.cache_db.all():
                self.init_city_info(response)
            # iterate over every city we want to crawl
            for city_name in self.allowed_cities:
                # kick off the first request for this city
                yield self.request_city(city_name)
    
        def request_city(self, city_name, page_start=0):
            '''Build the request for one page of one city's search results.'''
            city_code = self.get_city_code(city_name)
            url_data = {
                'cityId': city_code,
                'kw': 'python',
                'start': page_start
            }
            # URL of the page to crawl
            url = self.F.copy().add(url_data).url
            req = scrapy.Request(url, callback=self.parse_city, dont_filter=False)
            # meta carries extra data; the callback reads it via response.meta
            req.meta['city_name'] = city_name
            req.meta['page_start'] = page_start
            return req
    
        def parse_city(self, response):
            '''Parse one page of search results.'''
            # the API responds with JSON
            resp_dict = json.loads(response.text)
            # total number of postings available
            num_found = resp_dict['data']['numFound']
            # page_start of the current request
            page_start = response.meta['page_start']
            # start parameter for the next request
            next_start = page_start + self.PAGE_SIZE
            # is there another page?
            if next_start < num_found:
                # city name of the current request
                city_name = response.meta['city_name']
                # request the next page
                yield self.request_city(city_name, page_start=next_start)
            # emit the postings on this page
            for item in resp_dict['data']['results']:
                # TODO: parse each posting and keep only the fields we need
                item['spiderName'] = self.name
                # yield each record
                yield item
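
    The TODO above leaves field trimming unimplemented. One way to do it is an
    item pipeline in Jobs/pipelines.py; this is a minimal sketch, and the field
    names in KEEP are assumptions about the API response, not verified keys:

    # Jobs/pipelines.py
    class TrimFieldsPipeline:
        # keys to keep; adjust to the actual API response (these are guesses)
        KEEP = ('jobName', 'company', 'city', 'salary', 'spiderName')

        def process_item(self, item, spider):
            # items arrive as plain dicts; keep only the whitelisted keys
            return {k: item.get(k) for k in self.KEEP}

    Enable it in Jobs/settings.py:

    ITEM_PIPELINES = {
        'Jobs.pipelines.TrimFieldsPipeline': 300,
    }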
