  • Basic usage of Scrapy

    Target site to crawl: http://quotes.toscrape.com

    Single page

    # -*- coding: utf-8 -*-
    import scrapy


    class QuoteSpider(scrapy.Spider):
        name = 'quote'
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/']
        """
        Key points
            1. text() extracts the text content of a tag
            2. @attribute extracts the value of an attribute
            3. extract() returns all matches; extract_first() returns only the first
        """
        def parse(self, response):
            quotes = response.xpath('//div[@class="col-md-8"]/div[@class="quote"]')
            for quote in quotes:
                print('=' * 20)
                # extract_first() returns the first match
                text = quote.xpath('.//span[@class="text"]/text()').extract_first()
                print(text)
                author = quote.xpath('.//span/small[@class="author"]/text()').extract_first()
                print(author)
                # extract() returns all matches (here the tag link hrefs)
                tags = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/@href').extract()
                print(tags)
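
    The spider above only prints what it extracts. In practice the parsed data is usually yielded as items so Scrapy can collect and export it (for example with scrapy crawl quote -o quotes.json). A minimal sketch of the same loop yielding plain dicts (the field names text/author/tags are illustrative, not required by Scrapy):

    def parse(self, response):
        for quote in response.xpath('//div[@class="col-md-8"]/div[@class="quote"]'):
            # each yielded dict is treated by Scrapy as one scraped item
            yield {
                'text': quote.xpath('.//span[@class="text"]/text()').extract_first(),
                'author': quote.xpath('.//span/small[@class="author"]/text()').extract_first(),
                # text() here collects the tag names rather than the hrefs
                'tags': quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').extract(),
            }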

    All pages

    # -*- coding: utf-8 -*-
    import scrapy


    class QuoteSpider(scrapy.Spider):
        name = 'quote'
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/']
        """
        Key points
            1. text() extracts the text content of a tag
            2. @attribute extracts the value of an attribute
            3. extract() returns all matches; extract_first() returns only the first
            4. response.urljoin() joins a relative href with the current URL
            5. scrapy.Request(url=_next, callback=self.parse) schedules the next page with a callback
        """
        def parse(self, response):
            quotes = response.xpath('//div[@class="col-md-8"]/div[@class="quote"]')
            for quote in quotes:
                print('=' * 20)
                # extract_first() returns the first match
                text = quote.xpath('.//span[@class="text"]/text()').extract_first()
                print(text)
                author = quote.xpath('.//span/small[@class="author"]/text()').extract_first()
                print(author)
                # extract() returns all matches (here the tag link hrefs)
                tags = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/@href').extract()
                print(tags)
            print('>' * 40)
            next_url = response.xpath('//div[@class="col-md-8"]/nav/ul[@class="pager"]/li[@class="next"]/a/@href').extract_first()
            print(next_url)
            # the last page has no "next" link, so stop there
            if next_url is not None:
                # join the relative href with the current URL
                _next = response.urljoin(next_url)
                print(_next)
                # callback: the method that will parse the next page's response
                yield scrapy.Request(url=_next, callback=self.parse)
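
    For the pagination step, Scrapy (1.4 and later) also provides response.follow, which accepts the relative href directly, so the explicit response.urljoin() call is not needed. The tail of parse() could equivalently be written as this sketch:

    next_url = response.xpath('//li[@class="next"]/a/@href').extract_first()
    if next_url is not None:
        # response.follow resolves the relative href against response.url internally
        yield response.follow(next_url, callback=self.parse)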

    Supplement

    from scrapy import Spider, FormRequest
    # formdata takes a dict of form fields; callback takes a method reference
    FormRequest(url='', formdata={}, callback=self.parse)
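
    As a concrete illustration, quotes.toscrape.com exposes a login form at /login, and FormRequest.from_response() can fill and submit it. The sketch below leans on a few assumptions: the credentials are arbitrary (the sandbox site is commonly reported to accept any non-empty username/password), and the spider and method names are illustrative.

    # -*- coding: utf-8 -*-
    from scrapy import Spider, FormRequest


    class LoginSpider(Spider):
        name = 'quote_login'
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/login']

        def parse(self, response):
            # from_response() pre-fills the form fields found in the page
            # (including the hidden CSRF token) and merges in formdata
            yield FormRequest.from_response(
                response,
                formdata={'username': 'admin', 'password': 'admin'},  # arbitrary test values
                callback=self.after_login,
            )

        def after_login(self, response):
            # after logging in, the page shows a "Logout" link
            print('Logout' in response.text)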