Scrapy 多页(page)数据的爬取
import scrapy
from bossPro.items import BossproItem
class BossSpider(scrapy.Spider):
    """Crawl job listings from zhipin.com over several listing pages.

    ``parse`` yields one ``BossproItem`` (job name, salary, company) per
    listing row, then schedules the next listing page with itself as the
    callback until page ``max_page`` has been requested.
    """

    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = [
        'https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&scity=101010100&industry=&position=']
    # Template for the follow-up listing pages; ``%d`` receives the page number.
    # NOTE(review): ``ka=page-2`` stays fixed for every page - confirm whether
    # the site expects it to follow the page number.
    url = 'https://www.zhipin.com/c101010100/?query=python爬虫&page=%d&ka=page-2'
    # Number of the most recently scheduled page (the start URL counts as 1).
    page = 1
    # Highest page number that will be requested.
    max_page = 4

    def parse(self, response):
        """Parse one listing page and hand the results to the item pipeline.

        Args:
            response: The downloaded listing page.

        Yields:
            A ``BossproItem`` for every ``<li>`` row under the ``job-list``
            div, followed by at most one ``scrapy.Request`` for the next page.
        """
        for li in response.xpath('//div[@class="job-list"]/ul/li'):
            # Wrap the parsed values in an item; extract_first() returns None
            # when a node is missing, so a field may be None.
            item = BossproItem()
            item['job_name'] = li.xpath(
                './/div[@class="info-primary"]/h3/a/div/text()').extract_first()
            item['salary'] = li.xpath(
                './/div[@class="info-primary"]/h3/a/span/text()').extract_first()
            item['company'] = li.xpath(
                './/div[@class="company-text"]/h3/a/text()').extract_first()
            # Submit the item to the pipeline.
            yield item

        if self.page < self.max_page:
            self.page += 1
            new_url = self.url % self.page
            self.logger.debug('Requesting page %d: %s', self.page, new_url)
            # Manually send the request for the next page.
            yield scrapy.Request(url=new_url, callback=self.parse)
Scrapy 发送 POST 请求
import scrapy
from scrapy1.items import Scrapy1Item
class MyspiderSpider(scrapy.Spider):
    """Send a form POST to every start URL and store the raw response text."""

    name = 'qiubai'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://fanyi.baidu.com/sug']
    # Form fields submitted as the POST body.
    data = {'kw': 'cat'}

    def start_requests(self):
        """Issue one ``FormRequest`` per start URL.

        Overrides the default start requests so that ``data`` is posted as
        form data; responses are handled by ``parse``.
        """
        for target in self.start_urls:
            post_request = scrapy.FormRequest(
                url=target,
                formdata=self.data,
                callback=self.parse,
            )
            yield post_request

    def parse(self, response):
        """Yield a single item holding a fixed title and the response text."""
        yield Scrapy1Item(title='cat', content=response.text)
Scrapy 通过爬取到的 URL 继续发送请求,爬取详情页面
import scrapy
from scrapy1.items import Scrapy1Item
class MyspiderSpider(scrapy.Spider):
    """Scrape titles from an index page, then follow each link for details.

    ``parse`` reads the title and link of every entry on the index page and
    requests the linked page; ``get_detail`` adds text from that page to the
    same item before yielding it.
    """

    name = 'qiubai'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']

    def get_detail(self, response):
        """Fill in ``content`` on the item passed through ``response.meta``.

        Args:
            response: The detail page; ``response.meta['item']`` must hold the
                partially filled item created in ``parse``.

        Yields:
            The completed item. ``content`` is None when the XPath matches
            nothing.
        """
        item = response.meta['item']
        # NOTE(review): absolute XPath tied to the page layout - confirm it
        # still selects the intended text.
        item['content'] = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()'
        ).extract_first()
        yield item

    def parse(self, response):
        """Create one item per index entry and request its detail page.

        Args:
            response: The downloaded index page.

        Yields:
            A ``scrapy.Request`` per entry that has a link, carrying the
            partially filled item in ``meta['item']``.
        """
        for li in response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]'):
            href = li.xpath('./div/a/@href').extract_first()
            if href is None:
                # Without a link there is no detail page to request; building
                # a URL from None would raise TypeError and abort the callback.
                self.logger.debug('Skipping entry without a detail link')
                continue

            item = Scrapy1Item()
            item['title'] = li.xpath('./div/a/@title').extract_first()
            # urljoin resolves relative and absolute hrefs against the page URL.
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.get_detail,
                meta={'item': item},
            )