- scrapy startproject QqSpider
- cd QqSpider
- cd QqSpider
- ls
- vi items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class QqspiderItem(scrapy.Item):
    """Container for one row scraped by the ``qqcent`` spider.

    Every attribute is a plain ``scrapy.Field()``; the spider fills each one
    from a cell of the listing table.
    """

    # Text of the link in the first column.
    positionName = scrapy.Field()
    # ``href`` of the link in the first column.
    positionLink = scrapy.Field()
    # Text of the second column.
    positionType = scrapy.Field()
    # Text of the third column.
    peopleNum = scrapy.Field()
    # Text of the fourth column.
    workLocation = scrapy.Field()
    # Text of the fifth column.
    publishTime = scrapy.Field()
7. vi pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json


class QqspiderPipeline(object):
    """Append every scraped item to ``tencent.json``, one JSON object per line."""

    def __init__(self):
        # NOTE: despite its name, ``self.filename`` holds the open file object.
        # The file is opened in text mode with an explicit UTF-8 encoding so
        # non-ASCII text is written correctly regardless of the platform's
        # default locale.
        self.filename = open("tencent.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize *item* as a single JSON line and hand it back unchanged.

        Args:
            item: Any object accepted by ``dict()`` (a Scrapy item or mapping).
            spider: The spider that produced the item; unused here.

        Returns:
            The same *item*, so that later pipelines can keep processing it.
        """
        # ensure_ascii=False keeps non-ASCII characters readable instead of
        # emitting \uXXXX escapes.
        text = json.dumps(dict(item), ensure_ascii=False)
        # Write str (not bytes) to the text-mode file, and terminate each
        # record with a newline so the output can be parsed line by line.
        self.filename.write(text + "\n")
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.filename.close()
8. vi settings.py
# Default request headers for the project: a desktop Chrome User-Agent and an
# Accept header preferring HTML/XML. The Accept-Language entry is left disabled.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
68 ITEM_PIPELINES = {
69     'QqSpider.pipelines.QqspiderPipeline': 300,
70 }
9. cd spiders
- scrapy genspider -t crawl qqcent tencent.com
- vi qqcent.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from QqSpider.items import QqspiderItem


class QqcentSpider(CrawlSpider):
    """Crawl listing pages starting at hr.tencent.com and yield one item per table row."""

    name = 'qqcent'
    allowed_domains = ['tencent.com']
    start_urls = ['http://hr.tencent.com/position.php?&start=0#a']

    # Follow every link whose URL contains "start=<digits>".
    # The backslash matters: without it ("start=d+") the pattern only matches
    # a literal letter "d" and no numeric offset is ever followed.
    pagelink = LinkExtractor(allow=r"start=\d+")
    rules = [
        Rule(pagelink, callback='parseQqcent', follow=True)
    ]

    def parseQqcent(self, response):
        """Yield a ``QqspiderItem`` for each ``even``/``odd`` table row.

        Args:
            response: The downloaded page handed over by the crawl rule.

        Yields:
            QqspiderItem: One item per matching ``<tr>``. A field is ``None``
            when the corresponding cell has no matching node, rather than
            raising ``IndexError`` and aborting the rest of the page.
        """
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = QqspiderItem()

            # First column holds a link: its text and its href.
            item['positionName'] = each.xpath("./td[1]/a/text()").extract_first()
            item['positionLink'] = each.xpath("./td[1]/a/@href").extract_first()

            # Remaining columns are plain text cells, read by position.
            item['positionType'] = each.xpath("./td[2]/text()").extract_first()
            item['peopleNum'] = each.xpath("./td[3]/text()").extract_first()
            item['workLocation'] = each.xpath("./td[4]/text()").extract_first()
            item['publishTime'] = each.xpath("./td[5]/text()").extract_first()

            yield item
12. scrapy crawl qqcent
- vi tencent.json