zoukankan      html  css  js  c++  java
  • 21天打造分布式爬虫-Crawl类爬取小程序社区(八)

    8.1.Crawl的用法实战

    新建项目

    scrapy startproject wxapp
    
    scrapy genspider -t crawl wxapp_spider "wxapp-union.com"

    wxapp_spider.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from wxapp.items import WxappItem
    
    class WxappSpiderSpider(CrawlSpider):
        """Crawl the wxapp-union.com article lists and scrape every article page.

        The crawl starts at page 1 of the ``catid=2`` list. The first rule keeps
        following list/pagination links; the second rule hands article pages to
        :meth:`parse_detail` and does not follow links found on them.
        """

        name = 'wxapp_spider'
        allowed_domains = ['wxapp-union.com']
        start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

        rules = (
            # List pages look like "...mod=list&catid=<digit>...". The digit class
            # needs its backslash: a bare "d" only matches the letter d, which the
            # start URL's "catid=2" style links can never satisfy.
            Rule(LinkExtractor(allow=r'.+mod=list&catid=\d'), follow=True),
            # Article pages look like "...article-<id>.html". The dot before
            # "html" is escaped so it matches a literal dot, not any character.
            Rule(
                LinkExtractor(allow=r'.+article-.+\.html'),
                callback="parse_detail",
                follow=False,
            ),
        )

        def parse_detail(self, response):
            """Extract one article from a detail page.

            Args:
                response: the downloaded article page.

            Returns:
                A ``WxappItem`` with ``title``, ``author``, ``pub_time`` and
                ``content``. The first three are ``None`` when the matching
                node is absent, because ``.get()`` returns ``None`` on no match.
            """
            title = response.xpath("//h1[@class='ph']/text()").get()
            # Author name and publish time both live inside the "authors" paragraph.
            author_p = response.xpath("//p[@class='authors']")
            author = author_p.xpath(".//a/text()").get()
            pub_time = author_p.xpath(".//span/text()").get()
            # Collect every text node under the article body and glue them together.
            article_content = response.xpath("//td[@id='article_content']//text()").getall()
            content = "".join(article_content).strip()
            item = WxappItem(title=title, author=author, pub_time=pub_time, content=content)
            return item

    items.py

    # -*- coding: utf-8 -*-
    
    
    import scrapy
    
    class WxappItem(scrapy.Item):
        """One scraped article; the fields match what the spider's parse_detail fills in."""

        # Text of the page's <h1 class="ph"> heading.
        title = scrapy.Field()
        # Text of the link inside the "authors" paragraph.
        author = scrapy.Field()
        # Text of the <span> inside the "authors" paragraph.
        pub_time = scrapy.Field()
        # All text nodes of the article body, joined and stripped.
        content = scrapy.Field()

    pipelines.py

    # -*- coding: utf-8 -*-
    
    from scrapy.exporters import JsonLinesItemExporter
    
    class WxappPipeline(object):
        """Item pipeline that writes every item to ``wxapp.json`` as JSON lines."""

        def __init__(self):
            # Binary mode: the exporter encodes the text itself (utf-8 here).
            self.fp = open('wxapp.json', 'wb')
            self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
            # Exporters expect start_exporting() before the first export_item().
            self.exporter.start_exporting()

        def process_item(self, item, spider):
            """Export ``item`` and return it unchanged for any later pipeline."""
            self.exporter.export_item(item)
            return item

        def close_spider(self, spider):
            """Finish the export and close the output file when the spider stops."""
            try:
                self.exporter.finish_exporting()
            finally:
                # Close the file even if finishing the export raises.
                self.fp.close()

    settings.py

    # Do not apply robots.txt rules to this crawl.
    ROBOTSTXT_OBEY = False

    # Downloader delay between requests (Scrapy documents the unit as seconds).
    DOWNLOAD_DELAY = 1

    # Headers sent with requests by default; the User-Agent imitates desktop Chrome.
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
    }

    # Enable the JSON-lines pipeline; 300 is its ordering value.
    ITEM_PIPELINES = {
        'wxapp.pipelines.WxappPipeline': 300,
    }

    start.py

    from scrapy import cmdline
    
    # Run the spider in-process with the same argv as `scrapy crawl wxapp_spider`.
    cmdline.execute(["scrapy", "crawl", "wxapp_spider"])
  • 相关阅读:
    Jstorm执行task报错windows CONFIG SET protected-mode no
    windows搭建redis集群最佳实践
    windows下golang实现Kafka消息发送及kafka环境搭建
    go报错unimplemented: 64-bit mode not compiled in与mingw 64位安装报错ERROR res已解决
    GoLand配置数据库、远程host以及远程调试
    Go项目中beego的orm使用和gorm的使用
    windows下Go升级及GoLand的安装激活
    记一次解脱
    golang开源项目qor快速搭建网站qor-example运行实践
    使用img2html把图片转为网页
  • 原文地址:https://www.cnblogs.com/derek1184405959/p/9425234.html
Copyright © 2011-2022 走看看