zoukankan      html  css  js  c++  java
  • scrapy crawl 爬取微信小程序文章

    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    from wxapp.items import WxappItem
    
    
    class WxSpider(CrawlSpider):
        """Crawl WeChat mini-program tutorial articles from wxapp-union.com.

        Follows paginated list pages and parses each article detail page
        into a WxappItem.
        """

        name = 'wx'
        allowed_domains = ['wxapp-union.com']
        start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

        rules = (
            # Pagination rule.  BUGFIX: the original pattern used a bare 'd+'
            # (a literal letter 'd'), which never matches a page number, so
            # the crawl was stuck on page 1; '\d+' matches the digits.
            Rule(LinkExtractor(allow=r'.*mod=list&catid=2&page=\d+'), follow=True),
            # Article detail pages.  BUGFIX: '\.' escapes the dot so only a
            # literal '.html' suffix matches, not e.g. 'xhtml'.
            Rule(LinkExtractor(allow=r'.*article-.+\.html'), callback='parse_detail', follow=False),
        )

        def parse_detail(self, response):
            """Extract one article's fields from a detail page.

            :param response: the detail-page response matched by the second Rule
            :return: yields a single populated WxappItem
            """
            detail_href = response.request.url
            title = response.xpath('//h1[@class="ph"]/text()').get()
            # Gather every text fragment of the article body, strip
            # per-fragment whitespace, then join into one string.
            fragments = response.xpath('//td[@id="article_content"]//text()').getall()
            content = ''.join(c.strip() for c in fragments).strip()
            pub_time = response.xpath('//p[@class="authors"]/span/text()').get()
            author = response.xpath('//p[@class="authors"]/a/text()').get()
            yield WxappItem(
                title=title,
                content=content,
                detail_href=detail_href,
                pub_time=pub_time,
                author=author,
            )
    from scrapy.exporters import JsonLinesItemExporter, JsonItemExporter
    
    
    class WxappPipeline(object):
        """Write every scraped item into data.json as one JSON array."""

        def __init__(self):
            """Open the output file and build the exporter.

            The file is opened in binary mode because JsonItemExporter
            encodes to bytes itself.
            """
            self.fp = open("data.json", 'wb')
            self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

        def open_spider(self, spider):
            """Called once when the spider starts.

            BUGFIX: start_exporting() writes the opening '[' of the JSON
            array.  The original never called it (nor finish_exporting()),
            so data.json was not valid JSON.
            :param spider: the running spider (unused)
            """
            self.exporter.start_exporting()

        def process_item(self, item, spider):
            """Serialize one item and pass it to the next pipeline stage.

            :param item: the scraped item
            :param spider: the running spider (unused)
            :return: the item, unmodified, for downstream pipelines
            """
            self.exporter.export_item(item)
            return item

        def close_spider(self, spider):
            """Called once when the spider finishes.

            finish_exporting() writes the closing ']' before the file is
            closed, completing the JSON array.
            :param spider: the finished spider (unused)
            """
            self.exporter.finish_exporting()
            self.fp.close()
    import scrapy
    
    
    class WxappItem(scrapy.Item):
        """Container for one crawled wxapp-union.com article."""

        # Headline and author as shown on the detail page.
        title = scrapy.Field()
        author = scrapy.Field()
        # Publication timestamp text taken from the page.
        pub_time = scrapy.Field()
        # Full concatenated body text of the article.
        content = scrapy.Field()
        # Absolute URL of the article's detail page.
        detail_href = scrapy.Field()
  • 相关阅读:
    连续多步骤业务流程的暂停、中断和恢复
    什么是XML
    泛型擦除和反射配置文件
    类加载器和反射
    网络安全协议(二)
    网络通信协议(一)
    多线程之线程安全
    JAVA之线程池
    JAVA之多线程
    2020/8/1 JAVA之IO流(四)
  • 原文地址:https://www.cnblogs.com/yuqiangli0616/p/10338671.html
Copyright © 2011-2022 走看看