  • Basic usage of Scrapy

    Target site: http://quotes.toscrape.com

    Single page

    # -*- coding: utf-8 -*-
    import scrapy


    class QuoteSpider(scrapy.Spider):
        name = 'quote'
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/']
        """
        Key points
            1. text() gets the text content of a tag
            2. @attribute gets the value of an attribute
            3. extract() returns all matches; extract_first() returns only the first
        """
        def parse(self, response):
            # print(response.text)
            quotes = response.xpath('//div[@class="col-md-8"]/div[@class="quote"]')
            # print(quotes)
            for quote in quotes:
                print('=' * 20)
                # print(quote)
                # extract_first(): take the first match
                text = quote.xpath('.//span[@class="text"]/text()').extract_first()
                print(text)
                author = quote.xpath('.//span/small[@class="author"]/text()').extract_first()
                print(author)
                # extract(): take all matches (here, the href of every tag link)
                tags = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/@href').extract()
                print(tags)
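
    For reference, instead of printing, a spider usually yields items so Scrapy can send them to a feed export or an item pipeline. Below is a minimal sketch of the same extraction written that way; the spider name quote_items is only for this example, and .get()/.getall() are the newer aliases of extract_first()/extract().

    # -*- coding: utf-8 -*-
    # Sketch: same extraction as above, but yielding dict items instead of printing.
    # .get()/.getall() are the modern equivalents of extract_first()/extract().
    import scrapy


    class QuoteItemSpider(scrapy.Spider):
        name = 'quote_items'   # illustrative name, not from the original post
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/']

        def parse(self, response):
            for quote in response.xpath('//div[@class="col-md-8"]/div[@class="quote"]'):
                yield {
                    'text': quote.xpath('.//span[@class="text"]/text()').get(),
                    'author': quote.xpath('.//span/small[@class="author"]/text()').get(),
                    'tags': quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').getall(),
                }

    With this version you could run, for example, scrapy crawl quote_items -o quotes.json to write the scraped items to a JSON file.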

    All pages

    # -*- coding: utf-8 -*-
    import scrapy


    class QuoteSpider(scrapy.Spider):
        name = 'quote'
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/']
        """
        Key points
            1. text() gets the text content of a tag
            2. @attribute gets the value of an attribute
            3. extract() returns all matches; extract_first() returns only the first
            4. response.urljoin()     joins a relative URL onto the current page URL
            5. scrapy.Request(url=_next, callback=self.parse)   callback for the next page
        """
        def parse(self, response):
            # print(response.text)
            quotes = response.xpath('//div[@class="col-md-8"]/div[@class="quote"]')
            # print(quotes)
            for quote in quotes:
                print('=' * 20)
                # print(quote)
                # extract_first(): take the first match
                text = quote.xpath('.//span[@class="text"]/text()').extract_first()
                print(text)
                author = quote.xpath('.//span/small[@class="author"]/text()').extract_first()
                print(author)
                # extract(): take all matches (here, the href of every tag link)
                tags = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/@href').extract()
                print(tags)
            print('>' * 40)
            next_url = response.xpath('//div[@class="col-md-8"]/nav/ul[@class="pager"]/li[@class="next"]/a/@href').extract_first()
            print(next_url)
            # stop on the last page, where there is no "next" link
            if next_url:
                # join the relative href onto the current page URL
                _next = response.urljoin(next_url)
                print(_next)
                # callback: handle the next page with this same parse method
                yield scrapy.Request(url=_next, callback=self.parse)
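
    As an alternative to joining the URL by hand, response.follow() accepts the relative href directly and builds the absolute-URL Request itself. A minimal sketch of the same paginated crawl using it (the spider name quote_follow is only for illustration):

    # -*- coding: utf-8 -*-
    # Sketch: pagination with response.follow(), which joins the relative href
    # onto response.url internally, so no explicit urljoin() is needed.
    import scrapy


    class QuoteFollowSpider(scrapy.Spider):
        name = 'quote_follow'   # illustrative name, not from the original post
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/']

        def parse(self, response):
            for quote in response.xpath('//div[@class="col-md-8"]/div[@class="quote"]'):
                yield {
                    'text': quote.xpath('.//span[@class="text"]/text()').extract_first(),
                    'author': quote.xpath('.//span/small[@class="author"]/text()').extract_first(),
                }
            next_url = response.xpath('//li[@class="next"]/a/@href').extract_first()
            if next_url:
                # follow() builds the absolute-URL Request for the next page
                yield response.follow(next_url, callback=self.parse)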

    Supplement

    from scrapy import Spider, FormRequest
    # FormRequest submits form data: url is the target, formdata is a dict of
    # form fields, and callback handles the response
    FormRequest(url='', formdata={}, callback=None)
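
    For a concrete sketch of how FormRequest is commonly used, the example below submits the login form at http://quotes.toscrape.com/login via FormRequest.from_response(), which also copies hidden fields (such as a CSRF token) from the page's form. The /login URL and the username/password field names are assumptions for illustration, not something stated in the original post.

    # -*- coding: utf-8 -*-
    # Sketch: submitting a form with FormRequest.from_response().
    # The /login URL and the username/password field names are assumptions.
    from scrapy import Spider, FormRequest


    class LoginSpider(Spider):
        name = 'login_demo'   # illustrative name, not from the original post
        start_urls = ['http://quotes.toscrape.com/login']

        def parse(self, response):
            # from_response() fills in hidden form fields from the page,
            # then overrides the ones listed in formdata
            yield FormRequest.from_response(
                response,
                formdata={'username': 'anything', 'password': 'anything'},
                callback=self.after_login,
            )

        def after_login(self, response):
            # On this site a logged-in page shows a "Logout" link
            self.logger.info('Logged in: %s', 'Logout' in response.text)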