zoukankan      html  css  js  c++  java
  • scrapy crawl 爬取微信小程序文章

    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    from wxapp.items import WxappItem
    
    
    class WxSpider(CrawlSpider):
        """Crawl wxapp-union.com article list pages and yield one WxappItem
        per article detail page.
        """
        name = 'wx'
        allowed_domains = ['wxapp-union.com']
        start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

        rules = (
            # Follow pagination links.  BUG FIX: the original pattern was
            # r'.*mod=list&catid=2&page=d+' — the missing backslash made it
            # match a literal 'd', so no list page beyond page 1 was followed.
            Rule(LinkExtractor(allow=r'.*mod=list&catid=2&page=\d+'), follow=True),
            # Parse article detail pages.  The '.' before 'html' is escaped so
            # it matches only a literal dot instead of any character.
            Rule(LinkExtractor(allow=r'.*article-.+\.html'), callback='parse_detail', follow=False),
        )

        def parse_detail(self, response):
            """Extract title, body text, publish time and author from one
            article page and yield them as a WxappItem.

            :param response: the article page response (scrapy Response)
            """
            detail_href = response.request.url
            title = response.xpath('//h1[@class="ph"]/text()').get()
            # The article body is split into many text nodes; strip each
            # fragment and join them into a single string.
            content = response.xpath('//td[@id="article_content"]//text()').getall()
            content = ''.join(c.strip() for c in content).strip()
            pub_time = response.xpath('//p[@class="authors"]/span/text()').get()
            author = response.xpath('//p[@class="authors"]/a/text()').get()
            item = WxappItem(title=title, content=content, detail_href=detail_href,
                             pub_time=pub_time, author=author)
            yield item
    from scrapy.exporters import JsonLinesItemExporter, JsonItemExporter
    
    
    class WxappPipeline(object):
        """Item pipeline that serialises every scraped item into data.json
        as a single JSON array via JsonItemExporter.
        """

        def __init__(self):
            """Open the output file and start the exporter.

            The file is opened in binary mode because scrapy exporters write
            bytes.  BUG FIX: JsonItemExporter requires start_exporting() /
            finish_exporting() to emit the surrounding '[' and ']'; the
            original code skipped both, producing invalid JSON.
            """
            self.fp = open("data.json", 'wb')
            self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
            self.exporter.start_exporting()

        def open_spider(self, spider):
            """Called when the spider opens; setup already done in __init__.

            :param spider: the running spider
            :return: None
            """
            pass

        def process_item(self, item, spider):
            """Export one item and hand it on to the next pipeline stage.

            :param item: the scraped item
            :param spider: the running spider
            :return: the unchanged item
            """
            self.exporter.export_item(item)
            return item

        def close_spider(self, spider):
            """Called when the spider closes: finish the JSON array and
            close the file.

            :param spider: the running spider
            :return: None
            """
            self.exporter.finish_exporting()
            self.fp.close()
    import scrapy
    
    
    class WxappItem(scrapy.Item):
        """Container for one scraped WeChat mini-program article."""
        # Source URL of the article detail page.
        detail_href = scrapy.Field()
        # Article headline.
        title = scrapy.Field()
        # Author name.
        author = scrapy.Field()
        # Publication timestamp text as shown on the page.
        pub_time = scrapy.Field()
        # Full article body text.
        content = scrapy.Field()
  • 相关阅读:
    备份
    Ibatis_dataMapper
    查询成绩都大于80分的学生
    删除文件的工具
    从运行中启动收索引擎
    数据库Northwind
    搭建Android开发环境
    数据库知识结构
    数据库MedicineMis_STD
    数据库work
  • 原文地址:https://www.cnblogs.com/yuqiangli0616/p/10338671.html
Copyright © 2011-2022 走看看