- scrapy startproject Qqspider
- cd Qqspider
- cd Qqspider
- ls
- vi items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class QqspiderItem(scrapy.Item):
    # define the fields for your item here like:
    positionName = scrapy.Field()
    positionLink = scrapy.Field()
    positionType = scrapy.Field()
    peopleNum = scrapy.Field()
    workLocation = scrapy.Field()
    publishTime = scrapy.Field()
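A QqspiderItem instance behaves like a dict keyed by the fields declared above; the short sketch below is only an illustration (the assigned values are placeholders, not scraped data):

from Qqspider.items import QqspiderItem

item = QqspiderItem()
item['positionName'] = u'example position'   # placeholder value, just to show field assignment
item['workLocation'] = u'Shenzhen'           # placeholder value
print(dict(item))   # items convert cleanly to a plain dict, which the pipeline below relies on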
7. vi pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json


class QqspiderPipeline(object):

    def __init__(self):
        self.filename = open("tencent.json", "w")

    def process_item(self, item, spider):
        # serialize each scraped item to JSON and append it to tencent.json
        text = json.dumps(dict(item), ensure_ascii=False)
        self.filename.write(text.encode("utf-8"))
        return item

    def close_spider(self, spider):
        # called when the spider finishes; flush and close the output file
        self.filename.close()
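Note that self.filename.write(text.encode("utf-8")) relies on Python 2, where a text-mode file accepts byte strings; under Python 3 the same call raises a TypeError. A minimal Python 3 variant of the same pipeline, assuming you still want one JSON object per line in tencent.json, might look like this sketch:

# pipelines.py -- Python 3 sketch, not the original tutorial code
import json


class QqspiderPipeline(object):

    def open_spider(self, spider):
        # open_spider/close_spider are the standard Scrapy pipeline hooks
        self.file = open("tencent.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # one JSON object per line, keeping Chinese text readable (ensure_ascii=False)
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        self.file.close()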
8. vi settings.py
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
ITEM_PIPELINES = {
    'Qqspider.pipelines.QqspiderPipeline': 300,
}

9. cd spiders
10. scrapy genspider -t crawl qqcent tencent.com
11. vi qqcent.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from Qqspider.items import QqspiderItem


class QqcentSpider(CrawlSpider):
    name = 'qqcent'
    allowed_domains = ['tencent.com']
    start_urls = ['http://hr.tencent.com/position.php?&start=0#a']
    # follow every pagination link of the form position.php?start=<number>
    pagelink = LinkExtractor(allow=(r"start=\d+"))
    rules = [
        Rule(pagelink, callback='parseQqcent', follow=True)
    ]

    def parseQqcent(self, response):
        # job postings sit in table rows that alternate between the 'even' and 'odd' classes
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = QqspiderItem()

            item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
            item['positionLink'] = each.xpath("./td[1]/a/@href").extract()[0]
            item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
            item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
            item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

            yield item
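Before running the crawl it is worth confirming the XPath expressions against a live listing page in the Scrapy shell; a quick sanity-check session might look like the sketch below (output omitted, and the page layout is assumed to still match the selectors above):

scrapy shell "http://hr.tencent.com/position.php?&start=0#a"
>>> rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
>>> len(rows)                                    # number of job rows found on the page
>>> rows[0].xpath("./td[1]/a/text()").extract()  # position name of the first row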
12. scrapy crawl qqcent
13. vi tencent.json
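As an aside, Scrapy's built-in feed exporter can also dump items without a custom pipeline; the commands below are a sketch (the tencent_feed.* filenames are placeholders, and using -o would bypass the JSON writing done in QqspiderPipeline):

scrapy crawl qqcent -o tencent_feed.json    # exports a single JSON array
scrapy crawl qqcent -o tencent_feed.jl      # exports one JSON object per line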