  • Commonly used middleware and pipeline components in Scrapy

    Pipeline usage

    Storing to MongoDB

    Code in pipelines.py

    import pymongo
    
    
    class MongoPipeline(object):
        def __init__(self, mongo_uri, mongo_db):
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db
    
        @classmethod
        def from_crawler(cls, crawler):
            return cls(
                mongo_uri=crawler.settings.get('MONGO_URI'),
                mongo_db=crawler.settings.get('MONGO_DB')
            )
    
        def open_spider(self, spider):
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]
    
        def process_item(self, item, spider):
            name = item.collection
            self.db[name].insert_one(dict(item))  # insert() is deprecated in PyMongo 3.x; use insert_one()
            return item
    
        def close_spider(self, spider):
            self.client.close()
    

    settings configuration

    MONGO_URI = 'localhost'  # change this address for a remote MongoDB
    MONGO_DB = 'images360'
    
    ITEM_PIPELINES = {
            'images360.pipelines.MongoPipeline': 301,
    }
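
    Both pipelines in this post read storage names off the item itself (item.collection here, item.table in the MySQL pipeline below), so the Item class must define them. A minimal sketch, with illustrative field names:

    from scrapy import Item, Field


    class ImageItem(Item):
        collection = table = 'images'  # MongoDB collection name / MySQL table name
        id = Field()
        url = Field()
        title = Field()
        thumb = Field()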
    

    Storing to MySQL

    pipeline file configuration

    import pymysql
    
    
    class MysqlPipeline():
        def __init__(self, host, database, user, password, port):
            self.host = host
            self.database = database
            self.user = user
            self.password = password
            self.port = port
    
        @classmethod
        def from_crawler(cls, crawler):
            return cls(
                host=crawler.settings.get('MYSQL_HOST'),
                database=crawler.settings.get('MYSQL_DATABASE'),
                user=crawler.settings.get('MYSQL_USER'),
                password=crawler.settings.get('MYSQL_PASSWORD'),
                port=crawler.settings.get('MYSQL_PORT'),
            )
    
        def open_spider(self, spider):
            # keyword arguments are required since PyMySQL 1.0
            self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                      database=self.database, charset='utf8', port=self.port)
            self.cursor = self.db.cursor()
    
        def close_spider(self, spider):
            self.db.close()
    
        def process_item(self, item, spider):
            print(item['title'])
            data = dict(item)
            keys = ', '.join(data.keys())
            values = ', '.join(['%s'] * len(data))
            sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)  # build the statement dynamically; the table must already exist in MySQL
            self.cursor.execute(sql, tuple(data.values()))
            self.db.commit()
            return item
    

    settings configuration

    MYSQL_HOST = 'localhost'  # for a remote database, change this to that host's address
    MYSQL_DATABASE = 'images360'
    MYSQL_USER = 'root'
    MYSQL_PASSWORD = 'root'
    MYSQL_PORT = 3306
    
    ITEM_PIPELINES = {
            'images360.pipelines.MysqlPipeline': 302,
    }
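
    As the comment in process_item says, the target table has to exist before the spider runs. A one-off setup sketch (the columns are assumptions matching the illustrative item fields above):

    import pymysql

    db = pymysql.connect(host='localhost', user='root', password='root',
                         database='images360', port=3306)
    cursor = db.cursor()
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS images ('
        'id VARCHAR(255) NOT NULL PRIMARY KEY, url VARCHAR(255), '
        'title VARCHAR(255), thumb VARCHAR(255))'
    )
    db.close()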
    

    Image storage

    pipelines.py

    from scrapy import Request
    from scrapy.exceptions import DropItem
    from scrapy.pipelines.images import ImagesPipeline
    
    
    class ImagePipeline(ImagesPipeline):
        def file_path(self, request, response=None, info=None):
            url = request.url
            file_name = url.split('/')[-1]
            return file_name
    
        def item_completed(self, results, item, info):
            image_paths = [x['path'] for ok, x in results if ok]
            if not image_paths:
                raise DropItem('Image Downloaded Failed')
            return item
    
        def get_media_requests(self, item, info):
            yield Request(item['url'])
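
    In item_completed, results is a list of (success, info) tuples, one per download; on success, info is a dict like {'url': ..., 'path': ..., 'checksum': ...}, so items whose downloads all failed get dropped. Note that get_media_requests assumes a single item['url']; if the field held a list of URLs you would yield one Request per URL.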
    

    settings configuration

    IMAGES_STORE = './images'  # directory where ImagesPipeline saves downloaded files
    
    ITEM_PIPELINES = {
            'images360.pipelines.ImagePipeline': 300,
    }
    

    Middleware usage

    Random User-Agent headers

    import random
    
    
    class RandomUserAgentMiddleware():
        def __init__(self):
            self.user_agents = [
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
                'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
                'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2',
                'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1',
                'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
                'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                'Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/60.0',
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
                "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
                "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
                "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
                "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
                "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
                "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
                "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            ]
    
        def process_request(self, request, spider):
            request.headers['User-Agent'] = random.choice(self.user_agents)
    

    settings configuration

    DOWNLOADER_MIDDLEWARES = {
        'fangtianxia.middlewares.RandomUserAgentMiddleware': 545,
    }
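
    Because 545 is higher than the 500 of Scrapy's built-in UserAgentMiddleware, this middleware's process_request runs later and its random header wins; you can also disable the built-in one outright:

    DOWNLOADER_MIDDLEWARES = {
        'fangtianxia.middlewares.RandomUserAgentMiddleware': 545,
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    }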
    

    Setting up a random proxy pool

    import logging
    import requests
    
    
    class ProxyMiddleware():
        def __init__(self, proxy_url):
            self.logger = logging.getLogger(__name__)
            self.proxy_url = proxy_url
    
        def get_random_proxy(self):
            try:
                response = requests.get(self.proxy_url)
                if response.status_code == 200:
                    proxy = response.text
                    return proxy
            except requests.ConnectionError:
                return False
    
        def process_request(self, request, spider):
            if request.meta.get('retry_times'):
                proxy = self.get_random_proxy()
                if proxy:
                    uri = 'https://{proxy}'.format(proxy=proxy)
                    self.logger.debug('Using proxy ' + uri)
                    request.meta['proxy'] = uri
    
        @classmethod
        def from_crawler(cls, crawler):
            settings = crawler.settings
            return cls(
                proxy_url=settings.get('PROXY_URL')
            )
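
    Note that a proxy is only attached once retry_times appears in request.meta, which Scrapy's RetryMiddleware sets after the first failure, so first attempts go out directly; drop that check to proxy every request. The scheme is also hard-coded to https here; use 'http://{proxy}' for plain-HTTP targets.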
    

    Downloader middleware configuration

    DOWNLOADER_MIDDLEWARES = {
        'fangtianxia.middlewares.ProxyMiddleware': 543,
    }
    

    Finally, the proxy pool address

    PROXY_URL = 'http://localhost:5555/random'  # change the address if the pool runs on a remote server
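
    The middleware expects that endpoint to return a bare 'host:port' string. A quick sanity check, assuming such a pool service is running locally:

    import requests

    # prints something like '123.57.66.89:8888'
    print(requests.get('http://localhost:5555/random').text)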
    

    Integrating Selenium

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from scrapy.http import HtmlResponse
    from logging import getLogger
    
    
    class SeleniumMiddleware():
        # def __init__(self, timeout=None, service_args=[]):
        def __init__(self, timeout=None):
            self.logger = getLogger(__name__)
            self.timeout = timeout
            # self.browser = webdriver.PhantomJS(service_args=service_args)
            self.browser = webdriver.Chrome()
            self.browser.set_window_size(1400, 700)
            self.browser.set_page_load_timeout(self.timeout)
            self.wait = WebDriverWait(self.browser, self.timeout)
    
        def __del__(self):
            self.browser.quit()  # quit() also shuts down the driver process; close() only closes the window
    
        def process_request(self, request, spider):
            """
            Fetch the page in the browser (Chrome here; PhantomJS if you swap in the commented-out lines).
            :param request: Request object
            :param spider: Spider object
            :return: HtmlResponse
            """
            self.logger.debug('Browser is starting')
            page = request.meta.get('page', 1)
            try:
                self.browser.get(request.url)
                if page > 1:
                    input = self.wait.until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
                    submit = self.wait.until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
                    input.clear()
                    input.send_keys(page)
                    submit.click()
                self.wait.until(
                    EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
                self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
                return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8',
                                    status=200)
            except TimeoutException:
                return HtmlResponse(url=request.url, status=500, request=request)
    
        @classmethod
        def from_crawler(cls, crawler):
            return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),
                       )
            # return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),
            #            service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS'))
    

    Settings configuration

    To switch to PhantomJS, swap the commented-out lines back in and add these settings:

    SELENIUM_TIMEOUT = 20
    
    PHANTOMJS_SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
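
    As with the other middlewares, this one must also be registered (the project path 'scrapyseleniumtest' is an assumption; substitute your own project name):

    DOWNLOADER_MIDDLEWARES = {
        'scrapyseleniumtest.middlewares.SeleniumMiddleware': 543,
    }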
    


  • Original article: https://www.cnblogs.com/ruhai/p/11139244.html