zoukankan      html  css  js  c++  java
  • 2019.10.24

     1 # -*- coding: utf-8 -*-
     2 import scrapy
     3 from ..items import JobscrawlerQianchengwuyouItem
     4 import datetime
     5 
     6 class QianchengSpiderSpider(scrapy.Spider):
     7     name = 'qiancheng_spider'
     8     # allowed_domains = ['qq.com']
     9     start_urls = [
    10         # 数据分析师
    11         'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    12         # 人工智能
    13         'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    14         # 算法工程师
    15         'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E7%25AE%2597%25E6%25B3%2595%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    16         # 深度学习
    17         'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%25B7%25B1%25E5%25BA%25A6%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    18         # 数据挖掘
    19         'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    20         # 机器学习
    21         'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%259C%25BA%25E5%2599%25A8%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    22     ]
    23     start_url_tags=[
    24         "数据分析师",
    25         "人工智能",
    26         "算法工程师",
    27         "深度学习",
    28         "数据挖掘",
    29         "机器学习",
    30     ]
    31 
    32     def __init__(self):
    33         self.record_date = datetime.datetime.now().strftime('%Y-%m-%d')
    34 
    35     def start_requests(self):
    36         for index in range(len(self.start_urls)):
    37             url = self.start_urls[index]
    38             tag = self.start_url_tags[index]
    39             yield scrapy.Request(url,callback=self.parse,meta={'tag':tag},dont_filter=True)
    40 
    41     def parse(self, response):
    42         tag = response.meta['tag']
    43         xpath = '//div[@class="el"]'
    44         items = response.xpath(xpath)
    45         for item in items:
    46             if not len(item.xpath('./p[@class="t1 "]')):
    47                 continue
    48             url = item.xpath('./p[@class="t1 "]//a/@href').extract_first()
    49             title = item.xpath('./p[@class="t1 "]//a/text()').extract_first()
    50             if tag == '算法' and not ('算法' in title):
    51                 continue
    52             yield scrapy.Request(url,callback=self.detail_parse,meta={'tag':tag},dont_filter=True)
    53         next_page_url = response.xpath('//a[@id="rtNext"]/@href').extract_first()
    54         if next_page_url is None:
    55             yield scrapy.Request(next_page_url, callback=self.parse,meta={'tag':tag},dont_filter=True)
    56 
    57     def detail_parse(self,response):
    58         item = JobscrawlerQianchengwuyouItem()
    59         item['job_tag'] = response.meta['tag']
    60         item['job_url'] = response.url
    61         item['record_date'] = self.record_date
    62         # 招聘名称、职位信息、薪资、职位福利、经验要求、学历要求
    63         item['job_name'] = response.xpath('//div[@class = "cn"]/h1/text()').extract_first().strip()
    64         item['job_info'] = "".join(response.xpath('//div[@class = "bmsg job_msg inbox"]//text()').extract()).strip()
    65         item['job_salary'] = "".join(response.xpath('//div[@class = "cn"]/strong/text()').extract()).strip()
    66         item['job_welfare'] = ",".join(response.xpath('//span[@class="sp4"]/text()').extract()).strip()
    67         item['job_exp_require'] = response.xpath('//p[@class="msg ltype"]/text()').extract()[1].strip()
    68         item['job_edu_require'] = response.xpath('//p[@class="msg ltype"]/text()').extract()[2].strip()
    69         # 公司名称、公司行业、公司性质、公司人数、公司地址、公司概况、公司融资阶段
    70         item['company_name'] = response.xpath('//div[@class = "com_msg"]//p/text()').extract_first().strip()
    71         item['company_industry'] = "".join(response.xpath('//span[@class = "i_trade"]/..//text()').extract()).strip()
    72         item['company_nature'] = "".join(response.xpath('//span[@class = "i_flag"]/../text()').extract()).strip()
    73         item['company_people'] = "".join(response.xpath('//span[@class = "i_people"]/../text()').extract()).strip()
    74         item['company_location'] = ""
    75         item['company_overview'] = "".join(response.xpath('//div[@class = "tmsg inbox"]//text()').extract()).strip()
    76         item['company_financing_stage'] = ""
    77         yield item
  • 相关阅读:
    还记得那种 喜欢到不行的感觉么?
    从点到面,再从面到点
    草珊瑚的常见移动网站布局
    草珊瑚的CSS基础
    表驱动编程
    如果一切需要重学,2014年应该学哪些技术?
    揭开Makefile的神秘面纱
    VIM资源
    VIM跳转技巧
    前女友究竟是一种怎样的存在?
  • 原文地址:https://www.cnblogs.com/luochen918/p/11733404.html
Copyright © 2011-2022 走看看