zoukankan      html  css  js  c++  java
  • scrapy crawl 爬取微信小程序文章(将数据通过异步的方式保存的数据库中)

    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    from wxapp.items import WxappItem
    
    
    class WxSpider(CrawlSpider):
        """Crawl wxapp-union.com article list pages and scrape article details."""
        name = 'wx'
        allowed_domains = ['wxapp-union.com']
        start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

        rules = (
            # Pagination links. BUGFIX: the page number must be r'\d+' (digits);
            # the original r'page=d+' matched a literal run of 'd' characters,
            # so pagination was never followed.
            Rule(LinkExtractor(allow=r'.*mod=list&catid=2&page=\d+'), follow=True),
            # Article detail pages. BUGFIX: escape the dot so it matches a
            # literal '.' before "html" rather than any character.
            Rule(LinkExtractor(allow=r'.*article-.+\.html'), callback='parse_detail', follow=False),
        )

        def parse_detail(self, response):
            """Parse one article detail page and yield a populated WxappItem.

            NOTE(review): the fields used here (detail_href, author) do not match
            the WxAppItem schema shown elsewhere in this file (summary,
            article_url, read_count) — confirm which item definition is current.
            """
            detail_href = response.request.url
            title = response.xpath('//h1[@class="ph"]/text()').get()
            # The article body is split across many text nodes; strip each
            # fragment and join into one string.
            fragments = response.xpath('//td[@id="article_content"]//text()').getall()
            content = ''.join(piece.strip() for piece in fragments).strip()
            pub_time = response.xpath('//p[@class="authors"]/span/text()').get()
            author = response.xpath('//p[@class="authors"]/a/text()').get()
            yield WxappItem(title=title, content=content, detail_href=detail_href,
                            pub_time=pub_time, author=author)

    items:

    class WxAppItem(scrapy.Item):
        """Item holding one crawled article from wxapp-union.com.

        NOTE(review): the spider shown elsewhere in this file instantiates
        `WxappItem` with fields (detail_href, author) that are not declared
        here — confirm which schema the project actually uses.
        """
        title = scrapy.Field()
        pub_time = scrapy.Field()
        content = scrapy.Field()
        summary = scrapy.Field()
        article_url = scrapy.Field()
        read_count = scrapy.Field()

    pipeline:

    import pymysql
    from pymysql import cursors
    from twisted.enterprise import adbapi
    
    
    class WxAppPipeline(object):
        """Persist scraped article items to MySQL asynchronously.

        Uses Twisted's adbapi connection pool so database inserts run on pool
        threads and do not block the crawl loop.
        """

        def __init__(self):
            db_params = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '',
                'database': 'wxapp',
                'charset': 'utf8',
                'cursorclass': cursors.DictCursor,  # rows returned as dicts
            }
            # Connection pool; runInteraction runs callables on pool threads.
            self.db_pool = adbapi.ConnectionPool('pymysql', **db_params)
            self._sql = None  # lazily built INSERT statement (see `sql` property)

        def process_item(self, item, spider):
            """Schedule an async insert for `item` and pass it along unchanged."""
            defer = self.db_pool.runInteraction(self.insert_item, item)
            defer.addErrback(self.handle_error, item, spider)
            return item

        def insert_item(self, cursor, item):
            """Execute the INSERT on a pool thread (removed stray debug print).

            Parameterized query — values are never interpolated into the SQL
            string, avoiding injection.
            """
            cursor.execute(self.sql, (item['title'], item['content'], item['summary'],
                                      item['read_count'], item['pub_time'], item['article_url']))

        def handle_error(self, error, item, spider):
            """Report an insert failure; the crawl continues (best-effort store)."""
            print('=' * 10 + 'error' + '=' * 10)
            print(error)

        @property
        def sql(self):
            """Lazily build and cache the INSERT statement.

            The original had a redundant duplicated `return self._sql` inside the
            `if` branch; simplified to a single return.
            """
            if self._sql is None:
                self._sql = """
                   INSERT INTO article(id, title, content, summary, read_count, pub_time, article_url) VALUES (null, %s, %s, %s, %s, %s, %s);
                   """
            return self._sql
  • 相关阅读:
    最大子数组问题:股票
    dfs小练 【dfs】
    java小知识点简单回顾
    cdoj841-休生伤杜景死惊开 (逆序数变形)【线段树 树状数组】
    二路归并排序算法
    优秀Python学习资源收集汇总(强烈推荐)
    怎么学习逆向工程?
    __cdecl 、__fastcall、__stdcall
    getchar()、putchar()、gets()、puts()、cin.get()、cin.getline()、getline()
    <cctype>库
  • 原文地址:https://www.cnblogs.com/yuqiangli0616/p/10338708.html
Copyright © 2011-2022 走看看