  • Crawling Baidu job postings with Scrapy

    Baidu's job listings are all returned through Ajax, so pointing Scrapy at the rendered page is awkward; we request the Ajax endpoint directly instead.
    First, model the data in the items file:

    # -*- coding: utf-8 -*-

    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html

    import scrapy


    class BaiduItem(scrapy.Item):
        # Job title
        job_name = scrapy.Field()
        # Job category
        job_type = scrapy.Field()
        # Work location
        address = scrapy.Field()
        # Number of openings
        number = scrapy.Field()
        # Publish/update date
        pub_time = scrapy.Field()
        # Detail page, e.g. ref="#/jobDetail/2/1345536716"
        detail_link = scrapy.Field()
        # Job responsibilities
        duty = scrapy.Field()
        # Job requirements
        require = scrapy.Field()
    items.py
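
    As a quick sanity check (not part of the project code), a BaiduItem can be populated and read like a dict, which is exactly what the pipelines further down rely on when they call dict(item). A minimal sketch with made-up values:

    from BaiDu.items import BaiduItem

    item = BaiduItem()
    item['job_name'] = 'Test Engineer'   # hypothetical values, just for illustration
    item['address'] = 'Beijing'
    # Prints something like {'job_name': 'Test Engineer', 'address': 'Beijing'}
    print(dict(item))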

    The settings.py file is shown below; it adds the MongoDB connection settings (MONGO_HOST and friends) and enables the two item pipelines.

    # -*- coding: utf-8 -*-

    # Scrapy settings for BaiDu project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     http://doc.scrapy.org/en/latest/topics/settings.html
    #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

    BOT_NAME = 'BaiDu'

    SPIDER_MODULES = ['BaiDu.spiders']
    NEWSPIDER_MODULE = 'BaiDu.spiders'

    MONGO_HOST = "127.0.0.1"
    MONGO_PORT = 27017
    MONGO_DBNAME = "baidu"
    MONGO_COLNAME = "zhaopin"

    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'BaiDu (+http://www.yourdomain.com)'

    # Obey robots.txt rules
    ROBOTSTXT_OBEY = True

    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32

    # Configure a delay for requests for the same website (default: 0)
    # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16

    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False

    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False

    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}

    # Enable or disable spider middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'BaiDu.middlewares.BaiduSpiderMiddleware': 543,
    #}

    # Enable or disable downloader middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'BaiDu.middlewares.MyCustomDownloaderMiddleware': 543,
    #}

    # Enable or disable extensions
    # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}

    # Configure item pipelines
    # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'BaiDu.pipelines.BaiduPipeline': 300,
       'BaiDu.pipelines.MongPipeline': 301,
    }

    # Enable and configure the AutoThrottle extension (disabled by default)
    # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False

    # Enable and configure HTTP caching (disabled by default)
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    settings.py
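
    The MONGO_* keys are plain custom settings; Scrapy carries them alongside its built-in ones. If you want to confirm they are picked up, a minimal sketch (assuming it is run from inside the project directory) is to load the project settings directly:

    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    # Should print 127.0.0.1 27017 baidu zhaopin, matching the values defined above
    print(settings['MONGO_HOST'], settings['MONGO_PORT'],
          settings['MONGO_DBNAME'], settings['MONGO_COLNAME'])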

    Now for the main code. The spider (baidu.py) below requests the Ajax endpoint that the Baidu page itself calls, and pages through the results ten postings at a time.

    # -*- coding: utf-8 -*-
    import json

    import scrapy

    from BaiDu.items import BaiduItem


    class BaiduSpider(scrapy.Spider):
        name = 'baidu'
        allowed_domains = ['baidu.com']
        # The listing page http://talent.baidu.com/external/baidu/index.html#/social/2
        # is filled in by Ajax, so we request the Ajax endpoint directly.
        base_url = ('http://talent.baidu.com/baidu/web/httpservice/getPostList'
                    '?workPlace=0%2F4%2F7%2F9&recruitType=2&pageSize=10&curPage={}')

        def start_requests(self):
            # Start at page 1; parse() keeps requesting the next page until the
            # endpoint stops returning postings.
            yield scrapy.Request(self.base_url.format(1), meta={'page': 1})

        def parse(self, response):
            # with open('baidu.html', 'wb') as f:  # debug: dump the raw response
            #     f.write(response.body)
            host = 'http://talent.baidu.com/external/baidu/index.html#/jobDetail/2/'
            post_list = json.loads(response.body).get('postList', [])  # ten postings per page
            for json_data in post_list:
                item = BaiduItem()
                item['job_name'] = json_data['name']
                item['job_type'] = json_data['postType']
                item['address'] = json_data['workPlace']
                item['number'] = json_data['recruitNum']
                item['pub_time'] = json_data['publishDate']
                item['detail_link'] = host + str(json_data['postId'])
                item['duty'] = json_data['workContent']
                item['require'] = json_data['serviceCondition']
                # TODO: duty/require still contain raw markup and need cleaning
                yield item

            # An empty postList means we are past the last page, so stop paging;
            # otherwise request the next page.
            if post_list:
                next_page = response.meta['page'] + 1
                yield scrapy.Request(self.base_url.format(next_page),
                                     meta={'page': next_page})
    baidu.py
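
    Before wiring this into Scrapy it is worth probing the endpoint by hand. A minimal sketch with requests (assuming the endpoint still responds as it did when this was written, and using the same field names the spider maps above):

    import json

    import requests

    url = ('http://talent.baidu.com/baidu/web/httpservice/getPostList'
           '?workPlace=0%2F4%2F7%2F9&recruitType=2&pageSize=10&curPage=1')
    resp = requests.get(url)
    data = json.loads(resp.text)
    # Each entry in postList should carry the fields the spider maps into BaiduItem
    for post in data.get('postList', []):
        print(post['name'], post['workPlace'], post['publishDate'])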

    The pipelines.py file writes each item out as a line of JSON and also inserts it into a local MongoDB database.

    # -*- coding: utf-8 -*-

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import json

    from pymongo import MongoClient
    from scrapy.conf import settings  # deprecated in newer Scrapy; use crawler.settings instead


    class BaiduPipeline(object):
        """Write each item out as one JSON object per line."""

        def __init__(self):
            self.file = open('baidu.json', 'wb')

        def process_item(self, item, spider):
            str_data = json.dumps(dict(item), ensure_ascii=False) + ',\n'
            self.file.write(str_data.encode())
            return item

        def close_spider(self, spider):
            self.file.close()


    class MongPipeline(object):
        """Insert items into a local MongoDB database; the crawl itself is fast,
        the inserts are a little slower."""

        def __init__(self):
            host = settings['MONGO_HOST']
            port = settings['MONGO_PORT']
            dbname = settings['MONGO_DBNAME']
            colname = settings['MONGO_COLNAME']

            # Connect to MongoDB
            self.client = MongoClient(host, port)
            # Select the database
            self.db = self.client[dbname]
            # Select the collection
            self.col = self.db[colname]

        def process_item(self, item, spider):
            dict_data = dict(item)
            self.col.insert_one(dict_data)  # insert_one replaces the deprecated insert()
            return item

        def close_spider(self, spider):
            self.client.close()
    pipelines.py
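
    After a run, the stored data can be checked either in baidu.json or straight from MongoDB. A minimal sketch (assuming MongoDB is running locally with the settings above, pymongo 3.7+ for count_documents, and the field names defined in items.py):

    from pymongo import MongoClient

    client = MongoClient('127.0.0.1', 27017)
    col = client['baidu']['zhaopin']
    print(col.count_documents({}))        # how many postings were stored
    for doc in col.find().limit(3):       # peek at a few of them
        print(doc['job_name'], doc['address'], doc['pub_time'])
    client.close()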