Requirements:
Use CrawlSpider to crawl the whole site:
- Index pages: job title, job category
- Detail pages: job responsibilities
- Persist the results to storage
Code:
Spider file:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import CrawlproItem, TenproItem_detail


class CrawSpider(CrawlSpider):
    name = 'craw'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://hr.tencent.com/position.php?&start=0#a']

    # Link extractor for every page-number link on the index pages
    link1 = LinkExtractor(allow=r'&start=\d+#a')
    # Link extractor for the detail-page links (? and . are escaped because allow takes a regex)
    link2 = LinkExtractor(allow=r'position_detail\.php\?id=\d+&keywords=&tid=0&lid=0$')

    rules = (
        Rule(link1, callback='parse_item', follow=True),
        Rule(link2, callback='parse_detail', follow=True),
    )

    def parse_item(self, response):
        # Job title and category from the index page
        tr_list = response.xpath('//table[@class="tablelist"]/tr[@class="odd"] '
                                 '| //table[@class="tablelist"]/tr[@class="even"]')
        for tr in tr_list:
            job_name = tr.xpath('./td[1]/a/text()').extract_first()
            job_class = tr.xpath('./td[2]/text()').extract_first()
            # Instantiate the item class that holds the index-page fields
            item = CrawlproItem()
            item['job_name'] = job_name
            item['job_class'] = job_class
            yield item

    def parse_detail(self, response):
        # Job responsibilities from the detail page
        desc = response.xpath('//ul[@class="squareli"]/li/text()').extract()
        desc = ''.join(desc)
        item = TenproItem_detail()
        item['desc'] = desc
        yield item
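Both allow arguments are regular expressions, which is why \d+ needs its backslash and why the literal ? and . in the detail URL are escaped. As a quick standalone sanity check, both patterns can be tested against the link shapes the site uses; the id below is a hypothetical sample value, and since LinkExtractor applies a regex search to the full URL, a matching substring is all that matters:

import re

# Hypothetical sample URLs in the shape the site uses
page_link = 'https://hr.tencent.com/position.php?&start=10#a'
detail_link = 'https://hr.tencent.com/position_detail.php?id=12345&keywords=&tid=0&lid=0'

assert re.search(r'&start=\d+#a', page_link)
assert re.search(r'position_detail\.php\?id=\d+&keywords=&tid=0&lid=0$', detail_link)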
items.py file:
import scrapy


class CrawlproItem(scrapy.Item):
    job_name = scrapy.Field()
    job_class = scrapy.Field()


class TenproItem_detail(scrapy.Item):
    desc = scrapy.Field()
Pipeline file pipelines.py:
class CrawlproPipeline(object):
    fp = None

    def open_spider(self, spider):
        # Open the output file only once, when the spider starts
        self.fp = open('./tenxun.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Both item types flow through this pipeline, so dispatch on the class name
        if item.__class__.__name__ == 'CrawlproItem':
            job_name = item['job_name']
            job_class = item['job_class']
            self.fp.write(f'{job_name} {job_class}\n')  # one record per line
        else:
            desc = item['desc']
            self.fp.write(desc + '\n')
        return item  # pass the item on to the next pipeline class, if any

    def close_spider(self, spider):
        self.fp.close()
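Comparing item.__class__.__name__ to a string works, but it silently breaks if the item class is ever renamed. A sketch of the same dispatch using isinstance instead, which refactoring tools can check; only process_item changes, the rest of the class stays as above:

from .items import CrawlproItem  # relative import, as in the spider file


class CrawlproPipeline(object):
    # open_spider / close_spider unchanged from the version above

    def process_item(self, item, spider):
        # isinstance dispatch: CrawlproItem records vs. detail-page records
        if isinstance(item, CrawlproItem):
            self.fp.write(f'{item["job_name"]} {item["job_class"]}\n')
        else:
            self.fp.write(item['desc'] + '\n')
        return item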
Remember to enable the pipeline in the settings file.
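A minimal settings.py sketch, assuming the project module is named crawlPro (adjust the dotted path to the actual project name):

# settings.py -- only the lines relevant to this example
ROBOTSTXT_OBEY = False  # the crawl is typically run with robots.txt checking off
ITEM_PIPELINES = {
    'crawlPro.pipelines.CrawlproPipeline': 300,  # lower number = runs earlier
}

With the pipeline enabled, scrapy crawl craw runs the spider and writes the results to ./tenxun.txt.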