  • Python: crawling a company's job postings and posting details with Scrapy's CrawlSpider

    For a paginated job-listing site like this one, CrawlSpider is a good fit: you declare the URL patterns once and let the framework follow both the pagination links and the detail-page links.
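    For readers who have not used it: a CrawlSpider crawls by declaring Rule objects, each pairing a LinkExtractor (essentially a URL regex) with an optional callback. Below is a minimal sketch of the pattern with made-up names and URLs; the real spider for this site appears in step 3.

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule

    class DemoSpider(CrawlSpider):
        # Illustrative skeleton only; names and URLs are invented.
        name = "demo"
        start_urls = ["https://example.com/jobs?page=1"]

        rules = (
            # Keep following pagination links; no callback needed.
            Rule(LinkExtractor(allow=r"/jobs\?page=\d+"), follow=True),
            # Hand every matched detail page to parse_item.
            Rule(LinkExtractor(allow=r"/jobs/\d+"), callback="parse_item"),
        )

        def parse_item(self, response):
            # Called once per detail page matched by the second rule.
            yield {"title": response.xpath("//h1/text()").extract_first()}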

    1. settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for gosuncn project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'gosuncn'
    
    SPIDER_MODULES = ['gosuncn.spiders']
    NEWSPIDER_MODULE = 'gosuncn.spiders'
    
    LOG_LEVEL = "WARNING"
    LOG_FILE = "./gxx.log"
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'gosuncn (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = True
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'gosuncn.middlewares.GosuncnSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'gosuncn.middlewares.GosuncnDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'gosuncn.pipelines.GosuncnPipeline': 300,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
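    With the settings in place, the crawl is started from the project root with "scrapy crawl gxx" (adding "-o jobs.json" exports the items as JSON via Scrapy's feed exports). An equivalent launcher script is sketched below; run.py is hypothetical and not part of the original post, but it is handy when running from an IDE.

    # run.py, placed next to scrapy.cfg; equivalent to `scrapy crawl gxx`.
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl("gxx")  # spider name defined in gxx.py
    process.start()       # blocks until the crawl finishes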

    2. pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    import logging
    import re
    logger = logging.getLogger(__name__)
    class GosuncnPipeline(object):
        def process_item(self, item, spider):
            """
            Data cleaning happens here in the pipeline.
            :param item: the item yielded by the spider
            :param spider: the spider that produced it
            :return: the cleaned item
            """
            # The job description arrives as raw HTML padded with the page's
            # indentation; strip the <p> wrapper, <br> breaks, and stray tabs.
            item["job_responsible"] = re.sub(r"<p>\s*", "", item["job_responsible"])
            item["job_responsible"] = re.sub(r"\s*</p>", "", item["job_responsible"])
            item["job_responsible"] = re.sub(r"(<br>){1,2}", "", item["job_responsible"])
            item["job_responsible"] = re.sub(r"\t", "", item["job_responsible"])
            logger.warning(item)
            return item
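    The four re.sub calls above all fight the same problem: the description field is HTML padded with the page's indentation. A more compact alternative is sketched below; clean_html is not part of the original project, but w3lib ships with Scrapy.

    import re
    from w3lib.html import remove_tags

    def clean_html(fragment):
        # Drop every tag, then collapse leftover newlines and
        # indentation into single spaces.
        return re.sub(r"\s+", " ", remove_tags(fragment or "")).strip()

    # Usage in process_item:
    #   item["job_responsible"] = clean_html(item["job_responsible"])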

    3. gxx.py (the spider)

    # -*- coding: utf-8 -*-
    import logging
    import re

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule

    logger = logging.getLogger(__name__)


    class GxxSpider(CrawlSpider):

        name = 'gxx'
        allowed_domains = ['gosuncn.zhiye.com']
        start_urls = ['https://gosuncn.zhiye.com/social/?PageIndex=1']

        rules = (
            # Job detail pages, e.g. /zpdetail/123456?PageIndex=1
            Rule(LinkExtractor(allow=r'/zpdetail/\d+\?PageIndex=\d'), callback='parse_item'),
            # Listing pages: follow the pagination links
            Rule(LinkExtractor(allow=r'/social/\?PageIndex=\d+'), follow=True),
        )

        @staticmethod
        def _clean(value):
            # The site pads cell text with newlines and indentation;
            # reduce each field to its bare value.
            return value.strip() if value else value

        def parse_item(self, response):
            item = {}
            item["job_name"] = response.xpath(
                "//div[@class='boxSupertitle']/span/text()").extract_first()  # job title
            ul = response.xpath("//div[@class='xiangqingcontain']/ul[1]")
            item["recruit_type"] = self._clean(ul.xpath("./li[2]/text()").extract_first())  # recruitment type
            item["job_type"] = self._clean(ul.xpath("./li[4]/text()").extract_first())  # job category
            item["pay_money"] = self._clean(ul.xpath("./li[6]/text()").extract_first())  # salary
            item["recruit_num"] = self._clean(ul.xpath("./li[8]/text()").extract_first())  # number of openings
            # First date-like string in the page body; fragile, but works here.
            item["publish_time"] = re.findall(r"20\d+-\d+-\d+", response.body.decode())[0]
            item["job_place"] = self._clean(response.xpath(
                "//div[@class='xiangqingcontain']/ul[3]/li[2]/text()").extract_first())  # location
            # Raw HTML of the responsibilities paragraph; cleaned up in the pipeline.
            item["job_responsible"] = response.xpath(
                "//div[@class='xiangqingtext']/p[2]").extract_first()
            yield item
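    The link-extractor regexes are easy to get subtly wrong (note the escaped "?" and the "\d" classes above), so it is worth verifying them interactively before a full crawl. A quick sanity check in scrapy shell, assuming the listing page is still reachable:

    # Start an interactive session on the listing page first:
    #   scrapy shell "https://gosuncn.zhiye.com/social/?PageIndex=1"
    from scrapy.linkextractors import LinkExtractor

    # Should print every detail-page link found on the listing page.
    le = LinkExtractor(allow=r"/zpdetail/\d+\?PageIndex=\d")
    for link in le.extract_links(response):
        print(link.url)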
  • Original post: https://www.cnblogs.com/ywjfx/p/11097970.html