zoukankan      html  css  js  c++  java
  • scrapy Request方法

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class TestSpider(scrapy.Spider):
        """Crawl the yeves.cn home page and follow every article to its detail page.

        ``parse`` pulls each article's title and link from the start page and
        schedules one request per article, forwarding the title through
        ``Request.meta``. ``parse_detail`` then yields one dict per article.
        """

        name = 'test'
        allowed_domains = ['yeves.cn']
        start_urls = ['https://yeves.cn/']
        # URL template for site-relative paths. Kept as a class attribute for
        # backward compatibility; parse() resolves links with response.urljoin().
        base_domain = 'https://yeves.cn{}'

        def parse(self, response):
            """Yield a detail-page request for every article listed on the page.

            Args:
                response: The downloaded listing page.

            Yields:
                scrapy.Request: One request per article, with the article title
                stored under ``meta['title']`` and ``parse_detail`` as callback.
            """
            # Candidate containers holding the title and link of each article.
            articles = response.xpath('//*[@id="article"]//div')

            for article in articles:
                title = article.xpath('./div/article/div/header/h2/a/text()').extract_first()
                href = article.xpath('./div/article/div/header/h2/a/@href').extract_first()
                # Containers that do not match the article layout give None here.
                if title is None or href is None:
                    continue
                # urljoin resolves relative paths against the page URL and leaves
                # absolute URLs untouched, unlike plain string formatting.
                url = response.urljoin(href)
                # Carry the title along so the detail callback can reuse it.
                yield scrapy.Request(url, callback=self.parse_detail, meta={"title": title})

        def parse_detail(self, respone):
            """Build the item for a single article detail page.

            Args:
                respone: The downloaded detail page; ``meta['title']`` holds the
                    title captured on the listing page.

            Yields:
                dict: ``title``, ``created_at``, ``category`` and ``content`` of
                the article. A field is ``None`` when its XPath matches nothing.
            """
            title = respone.meta.get('title')
            self.logger.debug('Parsing detail page %s (title=%r)', respone.url, title)

            created_at = respone.xpath('/html/body/section/div/div/header/div/span[1]/time/text()').extract_first()
            category = respone.xpath('/html/body/section/div/div/header/div/span[2]/a/text()').extract_first()
            # The article body is spread over many text nodes; collect them all
            # instead of keeping only the first one.
            text_nodes = respone.xpath('/html/body/section/div/div/article//text()').extract()
            content = ''.join(text_nodes).strip() if text_nodes else None

            detail = {
                'title': title,
                'created_at': created_at,
                'category': category,
                'content': content,
            }
            self.logger.debug('Scraped detail: %r', detail)
            yield detail
  • 相关阅读:
    [HNOI2002]营业额统计
    HDU 1374
    HDU 3345
    HDU 2089
    Graham扫描法
    Codeforces 1144D Deduction Queries 并查集
    Codeforces 916E Jamie and Tree 线段树
    Codeforces 1167F Scalar Queries 树状数组
    Codeforces 1167E Range Deleting
    Codeforces 749E Inversions After Shuffle 树状数组 + 数学期望
  • 原文地址:https://www.cnblogs.com/php-linux/p/12522364.html
Copyright © 2011-2022 走看看