zoukankan      html  css  js  c++  java
  • Scrapy Item用法示例(保存item到MySQL数据库,MongoDB数据库,使用官方组件下载图片)

    需要学习的地方:

    保存item到MySQL数据库,MongoDB数据库,下载图片

    1.爬虫文件images.py

    # -*- coding: utf-8 -*-
    from scrapy import Spider, Request
    from urllib.parse import urlencode
    import json
    
    from images360.items import ImageItem
    
    
    class ImagesSpider(Spider):
        """Spider that crawls photography images from the images.so.com JSON API."""
        name = 'images'
        allowed_domains = ['images.so.com']
        start_urls = ['http://images.so.com/']

        def start_requests(self):
            # Walk the paginated JSON endpoint; 'sn' is the result offset
            # (the API returns 30 records per page).
            base_url = 'https://image.so.com/zj?'
            query = {'ch': 'photography', 'listtype': 'new'}
            max_page = self.settings.get('MAX_PAGE')
            for page_no in range(1, max_page + 1):
                query['sn'] = page_no * 30
                yield Request(url=base_url + urlencode(query), callback=self.parse)

        def parse(self, response):
            # Each response body is a JSON document whose 'list' key holds
            # one record per image.
            payload = json.loads(response.text)
            for record in payload.get('list'):
                image_item = ImageItem()
                image_item['id'] = record.get('imageid')
                image_item['url'] = record.get('qhimg_url')
                image_item['title'] = record.get('group_title')
                image_item['thumb'] = record.get('qhimg_thumb_url')
                yield image_item

    2.items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html
    
    from scrapy import Item, Field
    
    
    class ImageItem(Item):
        """One crawled image record.

        The storage pipelines read the destination name off the item class:
        ``collection`` (MongoDB) and ``table`` (MySQL) share the same value.
        """
        collection = 'images'  # MongoDB collection name
        table = collection     # MySQL table name (kept equal to collection)

        id = Field()     # source image id ('imageid')
        url = Field()    # full-size image URL ('qhimg_url')
        title = Field()  # group title ('group_title')
        thumb = Field()  # thumbnail URL ('qhimg_thumb_url')

    3.pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    import pymongo
    import pymysql
    from scrapy import Request
    from scrapy.exceptions import DropItem
    from scrapy.pipelines.images import ImagesPipeline
    
    
    class MongoPipeline(object):
        """Persist each item into the MongoDB collection named by ``item.collection``.

        Connection parameters come from the project settings via
        :meth:`from_crawler`; the client lives for the spider's lifetime.
        """

        def __init__(self, mongo_uri, mongo_db):
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db

        @classmethod
        def from_crawler(cls, crawler):
            # Read MONGO_URI / MONGO_DB from settings.py.
            return cls(
                mongo_uri=crawler.settings.get('MONGO_URI'),
                mongo_db=crawler.settings.get('MONGO_DB')
            )

        def open_spider(self, spider):
            # Open one client per spider run; closed in close_spider().
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]

        def process_item(self, item, spider):
            name = item.collection
            # insert_one() replaces insert(), which was deprecated in
            # pymongo 3.x and removed entirely in pymongo 4.x.
            self.db[name].insert_one(dict(item))
            return item

        def close_spider(self, spider):
            self.client.close()
    
    
    class MysqlPipeline():
        """Persist each item into the MySQL table named by ``item.table``.

        Connection parameters come from the project settings via
        :meth:`from_crawler`; the connection lives for the spider's lifetime.
        """

        def __init__(self, host, database, user, password, port):
            self.host = host
            self.database = database
            self.user = user
            self.password = password
            self.port = port

        @classmethod
        def from_crawler(cls, crawler):
            # Read MYSQL_* values from settings.py.
            return cls(
                host=crawler.settings.get('MYSQL_HOST'),
                database=crawler.settings.get('MYSQL_DATABASE'),
                user=crawler.settings.get('MYSQL_USER'),
                password=crawler.settings.get('MYSQL_PASSWORD'),
                port=crawler.settings.get('MYSQL_PORT'),
            )

        def open_spider(self, spider):
            # Use keyword arguments: positional connect() arguments were
            # deprecated and then removed in PyMySQL 1.0.
            self.db = pymysql.connect(host=self.host, user=self.user,
                                      password=self.password,
                                      database=self.database,
                                      charset='utf8', port=self.port)
            self.cursor = self.db.cursor()

        def close_spider(self, spider):
            self.db.close()

        def process_item(self, item, spider):
            data = dict(item)
            keys = ', '.join(data.keys())
            placeholders = ', '.join(['%s'] * len(data))
            # Only the table/column names are interpolated (they come from the
            # item class, not user input); the values are parameterized.
            sql = 'insert into %s (%s) values (%s)' % (item.table, keys, placeholders)
            try:
                self.cursor.execute(sql, tuple(data.values()))
                self.db.commit()
            except pymysql.MySQLError:
                # Roll back the failed statement so the connection stays
                # usable for subsequent items, then surface the error.
                self.db.rollback()
                raise
            return item
    
    
    class ImagePipeline(ImagesPipeline):
        """Download each item's image and drop items whose download failed."""

        def get_media_requests(self, item, info):
            # One download request per item, for the full-size image URL.
            yield Request(item['url'])

        def file_path(self, request, response=None, info=None):
            # Save the file under the last path segment of its URL.
            return request.url.split('/')[-1]

        def item_completed(self, results, item, info):
            # results is a list of (success, detail) pairs produced by the
            # downloads scheduled in get_media_requests().
            downloaded = [detail['path'] for ok, detail in results if ok]
            if not downloaded:
                raise DropItem('Image Downloaded Failed')
            return item

    4.settings.py

    配置文件中增加如下内容

    ITEM_PIPELINES = {
        # Lower number = runs earlier: download images first, then persist.
        'images360.pipelines.ImagePipeline': 300,
        'images360.pipelines.MongoPipeline': 301,
        'images360.pipelines.MysqlPipeline': 302,
    }
    
    # Directory where ImagesPipeline stores downloaded files.
    IMAGES_STORE = './images'
    
    # Number of result pages the spider requests (30 items per page).
    MAX_PAGE = 50
    
    # MongoDB connection settings (read by MongoPipeline.from_crawler).
    MONGO_URI = 'localhost'
    MONGO_DB = 'images360'
    
    # MySQL connection settings (read by MysqlPipeline.from_crawler).
    MYSQL_HOST = 'localhost'
    MYSQL_DATABASE = 'images360'
    MYSQL_USER = 'root'
    MYSQL_PASSWORD = '123456'
    MYSQL_PORT = 3306

    代码下载地址:https://files.cnblogs.com/files/sanduzxcvbnm/Images360-master.7z

  • 相关阅读:
    MFC Windows 程序设计>WinMain 简单Windows程序 命令行编译
    AT3949 [AGC022D] Shopping 题解
    CF643D Bearish Fanpages 题解
    CF643C Levels and Regions 题解
    CF241E Flights 题解
    CF671C Ultimate Weirdness of an Array 题解
    CF1592F Alice and Recoloring 题解
    GYM 102452E 题解
    CF494C Helping People 题解
    P5556 圣剑护符
  • 原文地址:https://www.cnblogs.com/sanduzxcvbnm/p/10345653.html
Copyright © 2011-2022 走看看