zoukankan      html  css  js  c++  java
  • Scrapy的基本使用

    爬取:http://quotes.toscrape.com

    单页面

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class QuoteSpider(scrapy.Spider):
        """Print every quote found on the quotes.toscrape.com start page.

        Techniques shown:
            1. ``text()`` in an XPath selects a tag's text.
            2. ``@attr`` in an XPath selects an attribute's value.
            3. ``extract()`` returns every match;
               ``extract_first()`` returns a single match.
        """

        name = 'quote'
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/']

        def parse(self, response):
            """Print the text, author and tag links of each quote in ``response``."""
            quotes = response.xpath('//div[@class="col-md-8"]/div[@class="quote"]')
            for quote in quotes:
                print('=' * 20)
                # extract_first(): a single value is expected here.
                text = quote.xpath('.//span[@class="text"]/text()').extract_first()
                print(text)
                author = quote.xpath('.//span/small[@class="author"]/text()').extract_first()
                print(author)
                # extract(): a quote can carry several tags, so collect them all.
                tags = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/@href').extract()
                print(tags)

    所有页面

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class QuoteSpider(scrapy.Spider):
        """Print every quote on quotes.toscrape.com, following the pager links.

        Techniques shown:
            1. ``text()`` in an XPath selects a tag's text.
            2. ``@attr`` in an XPath selects an attribute's value.
            3. ``extract()`` returns every match;
               ``extract_first()`` returns a single match.
            4. ``response.urljoin()`` joins a link onto the page's URL.
            5. ``scrapy.Request(url=..., callback=self.parse)`` schedules the
               next page and has it handled by the same callback.
        """

        name = 'quote'
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/']

        def parse(self, response):
            """Print each quote in ``response``, then request the next page.

            Yields:
                A ``scrapy.Request`` for the next page when the pager has a
                "next" link; nothing otherwise.
            """
            quotes = response.xpath('//div[@class="col-md-8"]/div[@class="quote"]')
            for quote in quotes:
                print('=' * 20)
                # extract_first(): a single value is expected here.
                text = quote.xpath('.//span[@class="text"]/text()').extract_first()
                print(text)
                author = quote.xpath('.//span/small[@class="author"]/text()').extract_first()
                print(author)
                # extract(): a quote can carry several tags, so collect them all.
                tags = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/@href').extract()
                print(tags)

            print('>' * 40)
            next_url = response.xpath(
                '//div[@class="col-md-8"]/nav/ul[@class="pager"]/li[@class="next"]/a/@href'
            ).extract_first()
            print(next_url)
            if next_url is None:
                # No "next" link was found, so there is nothing to join or
                # request; stop instead of building a Request from None.
                return

            # Join the extracted href onto the current page's URL.
            _next = response.urljoin(next_url)
            print(_next)
            # Handle the next page with this same method.
            yield scrapy.Request(url=_next, callback=self.parse)

    补充

    from scrapy import Spider, FormRequest
    # Signature sketch only; the empty strings are placeholders:
    #   url      - address the form is submitted to
    #   callback - function that receives the response
    #   formdata - the form fields to send
    FormRequest(url='', callback='', formdata='')
  • 相关阅读:
    【Rust】结构体 struct
    【Rust】所有权、引用、借用
    Centos7升级glibc2.24
    ES用户权限控制
    PHP压缩html网页代码 : 清除空格,制表符,注释标记
    PHP通过HTTP_USER_AGENT判断是否为手机移动终端的函数
    php正则表达式替换URL链接地址为指定url的形式
    PHP下使用CURL方式POST数据至API接口的方法
    设计模式代理模式
    设计模式生成器模式
  • 原文地址:https://www.cnblogs.com/wt7018/p/11729534.html
Copyright © 2011-2022 走看看