  • Tencent recruitment website crawler

    1. scrapy startproject Qqspider
    2. cd Qqspider
    3. cd Qqspider
    4. ls
    5. vi items.py
       # -*- coding: utf-8 -*-

       # Define here the models for your scraped items
       #
       # See documentation in:
       # http://doc.scrapy.org/en/latest/topics/items.html

       import scrapy


       class QqspiderItem(scrapy.Item):
           # define the fields for your item here:
           positionName = scrapy.Field()    # job title
           positionLink = scrapy.Field()    # link to the job detail page
           positionType = scrapy.Field()    # job category
           peopleNum = scrapy.Field()       # number of openings
           workLocation = scrapy.Field()    # work location
           publishTime = scrapy.Field()     # publish date
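       An Item behaves like a dict with a fixed set of declared keys, which is how the
       spider further below fills it. A tiny illustration (the value is made up, and
       assigning to an undeclared key raises KeyError):

       from Qqspider.items import QqspiderItem

       item = QqspiderItem()
       item['positionName'] = 'example title'   # made-up value for illustration
       print(dict(item))                         # {'positionName': 'example title'}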

    6. vi pipelines.py

       # -*- coding: utf-8 -*-

       # Define your item pipelines here
       #
       # Don't forget to add your pipeline to the ITEM_PIPELINES setting
       # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

       import json


       class QqspiderPipeline(object):

           def __init__(self):
               # open the output file once, when the spider starts
               self.filename = open("tencent.json", "w", encoding="utf-8")

           def process_item(self, item, spider):
               # write each item as one JSON object per line
               text = json.dumps(dict(item), ensure_ascii=False)
               self.filename.write(text + "\n")
               return item

           def close_spider(self, spider):
               self.filename.close()
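       A quick way to sanity-check the pipeline outside of a crawl is to feed it a
       hand-made item from the project root; this is only a sketch with made-up field
       values, not part of the original post:

       from Qqspider.items import QqspiderItem
       from Qqspider.pipelines import QqspiderPipeline

       pipeline = QqspiderPipeline()
       item = QqspiderItem(positionName='test', positionLink='position_detail.php?id=0')  # made-up values
       pipeline.process_item(item, spider=None)   # writes one JSON line to tencent.json
       pipeline.close_spider(spider=None)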

    7. vi settings.py (edit the default request headers and enable the pipeline)

       DEFAULT_REQUEST_HEADERS = {
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       #   'Accept-Language': 'en',
       }

       ITEM_PIPELINES = {
           'Qqspider.pipelines.QqspiderPipeline': 300,
       }
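       The original post only touches the two blocks above. If the target site enforces
       robots.txt or rate limits, the following settings are commonly adjusted in the same
       file (an assumed typical setup, not something the post configures):

       # assumed additions, not part of the original settings.py excerpt
       ROBOTSTXT_OBEY = False    # the listing pages are fetched directly
       DOWNLOAD_DELAY = 1        # one-second delay between requests, to be polite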



    8. cd spiders
    9. scrapy genspider -t crawl qqcent tencent.com
    10. vi qqcent.py
       # -*- coding: utf-8 -*-
       import scrapy
       from scrapy.linkextractors import LinkExtractor
       from scrapy.spiders import CrawlSpider, Rule
       from Qqspider.items import QqspiderItem


       class QqcentSpider(CrawlSpider):
           name = 'qqcent'
           allowed_domains = ['tencent.com']
           start_urls = ['http://hr.tencent.com/position.php?&start=0#a']
           # follow every "start=N" pagination link on the listing pages
           pagelink = LinkExtractor(allow=(r"start=\d+",))
           rules = [
               Rule(pagelink, callback='parseQqcent', follow=True)
           ]

           def parseQqcent(self, response):
               # each job posting is one table row with class "even" or "odd"
               for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
                   item = QqspiderItem()

                   item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
                   item['positionLink'] = each.xpath("./td[1]/a/@href").extract()[0]
                   item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
                   item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
                   item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
                   item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

                   yield item
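       Before launching the full crawl, the row XPath can be checked interactively in
       scrapy shell. The session below is only illustrative: hr.tencent.com has since
       changed, so the old listing layout may no longer be reachable.

       # illustrative scrapy shell session
       # $ scrapy shell 'http://hr.tencent.com/position.php?&start=0#a'
       rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
       len(rows)                                          # number of job rows on the page
       rows[0].xpath("./td[1]/a/text()").extract_first()  # first position name, if any rows matched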

    11. scrapy crawl qqcent

    12. vi  tencent.json
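       Since the pipeline writes one JSON object per line, the output file can be loaded
       back for a quick check with a few lines of Python (a sketch, assuming the crawl
       produced tencent.json in the directory where it was run):

       import json

       # read the one-object-per-line output written by QqspiderPipeline
       with open("tencent.json", encoding="utf-8") as f:
           jobs = [json.loads(line) for line in f if line.strip()]

       print(len(jobs))                  # total number of positions scraped
       print(jobs[0]['positionName'])    # first job title, whatever was actually scraped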
  • Original post: https://www.cnblogs.com/hizf/p/7834525.html