zoukankan      html  css  js  c++  java
  • python scrapy 基本操作演示代码


    
    # -*- coding: utf-8 -*-
    import scrapy
    # from quotetutorial.items import QuoteItem
    from quotetutorial.items import QuotetutorialItem
    
    # 主要编辑项目信息基本上都在这里完成的
    
    class QuotesSpider(scrapy.Spider):
        """Spider that scrapes quotes from quotes.toscrape.com.

        Each quote on a listing page becomes one ``QuotetutorialItem`` with
        ``text``, ``author`` and ``tags`` fields. After the quotes of a page
        have been yielded, the spider follows the pager's "next" link (when
        there is one) and parses that page with the same callback.
        """

        name = 'quotes'
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/']

        def parse(self, response):
            """Yield one item per quote, then a request for the next page.

            Args:
                response: The downloaded listing page handed in by Scrapy.

            Yields:
                ``QuotetutorialItem`` objects for every quote found on the
                page, followed by a ``scrapy.Request`` for the next page if
                the pager contains a "next" link.
            """
            for quote in response.css('.col-md-8 .quote'):
                item = QuotetutorialItem()
                # extract_first() returns a single value (or None if absent).
                item['text'] = quote.css('.text::text').extract_first()
                item['author'] = quote.css('.author::text').extract_first()
                # extract() returns every match as a list.
                item['tags'] = quote.css('.tags .tag::text').extract()
                yield item

            # The pager marks its forward link with the "next" class.
            next_href = response.css('.pager .next a::attr(href)').extract_first()
            # No "next" link means this is the last page: stop instead of
            # building a request from a missing href.
            if next_href is not None:
                # The href is relative, so resolve it against the current URL.
                next_url = response.urljoin(next_href)
                # Pass the bound method itself; Scrapy invokes it with the
                # response once the page has been downloaded.
                yield scrapy.Request(url=next_url, callback=self.parse)
    # 保存文件
    # scrapy crawl quotes -o quotes.json
    # scrapy crawl quotes -o quotes.jl
    # scrapy crawl quotes -o quotes.csv
    # scrapy crawl quotes -o quotes.xml
    # scrapy crawl quotes -o ftp://user:pass@ftp.example.com/path/quotes.csv
    
    
    
  • 相关阅读:
    【转】JSP三种页面跳转方式
    我要从头做起
    转载:用 Tomcat 和 Eclipse 开发 Web 应用程序
    html的style属性
    Java连接oracle数据库
    tomcat遇到的问题(总结)
    ceshi
    今天要小结一下
    argument.callee.caller.arguments[0]与window.event
    JavaScript事件冒泡简介及应用
  • 原文地址:https://www.cnblogs.com/wordgao/p/9824658.html
Copyright © 2011-2022 走看看