项目主代码:
import scrapy

from boss.items import BossItem


class BossproSpider(scrapy.Spider):
    """Spider that scrapes python-crawler job listings from zhipin.com.

    Page 1 is fetched via start_urls; parse() then schedules pages 2-4
    by incrementing the shared page counter and re-requesting itself.
    """
    name = 'bossPro'
    # Base listing URL; the page number is appended when paginating.
    url = 'https://www.zhipin.com/c101010100/?query=python%E7%88%AC%E8%99%AB&page='
    start_urls = ['https://www.zhipin.com/c101010100/?query=python%E7%88%AC%E8%99%AB&page=1']
    # Page counter shared across parse() calls; starts at the already-fetched page 1.
    pageNum = 1

    def parse(self, response):
        """Extract job name and salary from every listing on the page,
        then request the next page (up to page 4)."""
        for job_li in response.xpath('//div[@class="job-list"]/ul/li'):
            item = BossItem()
            item['job_name'] = job_li.xpath('./div/div[1]/h3/a/div[1]/text()').extract_first()
            item['salary'] = job_li.xpath('./div/div[1]/h3/a/span/text()').extract_first()
            # yield here hands the item to the configured item pipeline
            yield item

        # Paginate: page 1 was already requested via start_urls, so the
        # counter is advanced first and pages 2..4 get requested in turn.
        if self.pageNum <= 3:
            self.pageNum += 1
            # Manual request; callback re-uses this parse method on the new page.
            yield scrapy.Request(url=self.url + str(self.pageNum), callback=self.parse)
items代码:
import scrapy


class BossItem(scrapy.Item):
    """Container for one scraped job posting."""

    # Job title text extracted from the listing page.
    job_name = scrapy.Field()
    # Salary string as displayed on the listing page.
    salary = scrapy.Field()
pipelines代码:
class BossPipeline(object):
    """Item pipeline that appends each scraped job to ./job.txt.

    Each item is written as one 'job_name:salary ' record. The file is
    opened once when the spider starts and closed when it finishes.
    """

    # File handle; set in open_spider, released in close_spider.
    fp = None

    def open_spider(self, spider):
        """Open the output file once at spider startup."""
        self.fp = open('./job.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write one record for *item* and pass it on.

        Fix: extract_first() in the spider returns None when an XPath
        matches nothing; the original concatenation then raised
        TypeError and dropped the item. Coerce None to '' instead.
        """
        job_name = item['job_name'] or ''
        salary = item['salary'] or ''
        self.fp.write(job_name + ":" + salary + ' ')
        # Return the item so lower-priority pipelines still receive it.
        return item

    def close_spider(self, spider):
        """Close the output file when the spider shuts down."""
        self.fp.close()
settings配置代码:
# Browser-like User-Agent sent with every request.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'

# Do not honor the target site's robots.txt rules for this crawl.
ROBOTSTXT_OBEY = False

# Enable the project's item pipeline; 300 is its priority (lower runs first).
ITEM_PIPELINES = {'boss.pipelines.BossPipeline': 300}