# -*- coding: utf-8 -*-
import scrapy


class TestSpider(scrapy.Spider):
    """Crawl the yeves.cn home page and scrape every article linked from it.

    ``parse`` reads article titles and links from the start page and schedules
    one request per article. ``parse_detail`` converts each article page into
    a plain ``dict`` item.
    """

    name = 'test'
    allowed_domains = ['yeves.cn']
    start_urls = ['https://yeves.cn/']
    # Base-domain URL template. ``parse`` resolves links against the response
    # URL instead, but the attribute is kept for any external code reading it.
    base_domain = 'https://yeves.cn{}'

    def parse(self, response):
        """Yield one detail-page request per article found on the home page.

        Args:
            response: The downloaded home-page response.

        Yields:
            scrapy.Request: A request for the article URL, handled by
            ``parse_detail``, with the article title stored in
            ``meta["title"]``.
        """
        # Collect the title and link of every article listed on the home page.
        articles = response.xpath('//*[@id="article"]//div')
        for article in articles:
            link = article.xpath('./div/article/div/header/h2/a')
            title = link.xpath('./text()').extract_first()
            href = link.xpath('./@href').extract_first()
            if title is None or href is None:
                # Not an article container (the XPath above matches every
                # nested <div>), so there is nothing to follow here.
                continue
            # urljoin copes with absolute hrefs and with relative hrefs that
            # lack a leading slash; plain string formatting onto the domain
            # would produce a malformed URL in both cases.
            url = response.urljoin(href)
            # Carry the title along so the detail callback can reuse it.
            yield scrapy.Request(
                url,
                callback=self.parse_detail,
                meta={"title": title},
            )

    def parse_detail(self, respone):
        """Build an item dict from a single article page.

        Args:
            respone: The downloaded article-page response. (The parameter
                name's spelling is kept unchanged for compatibility.)

        Yields:
            dict: An item with the keys ``title`` (taken from the request
            meta), ``created_at``, ``category`` and ``content``. The first
            three are ``None`` when the value is missing; ``content`` is the
            concatenation of all text nodes inside the article element,
            stripped of surrounding whitespace (``''`` when there are none).
        """
        self.logger.debug('Parsing article page %s', respone.url)

        header = '/html/body/section/div/div/header/div'
        created_at = respone.xpath(
            header + '/span[1]/time/text()'
        ).extract_first()
        category = respone.xpath(
            header + '/span[2]/a/text()'
        ).extract_first()
        # Join every text node: extract_first() would return only the first
        # one, which is not the article body.
        text_nodes = respone.xpath(
            '/html/body/section/div/div/article//text()'
        ).extract()
        content = ''.join(text_nodes).strip()

        detail = {
            'title': respone.meta.get('title'),
            'created_at': created_at,
            'category': category,
            'content': content,
        }
        self.logger.debug('Scraped item: %r', detail)
        yield detail