zoukankan      html  css  js  c++  java
  • scrapy常用配置

    设置随机请求头

    # 安装 pip install fake-useragent
    # middleware.py
    from fake_useragent import UserAgent
    class RandomUserAgentMiddlware:
        """Downloader middleware that gives requests a random User-Agent.

        The value is read from ``fake_useragent.UserAgent().random`` and is only
        applied when the request does not already carry a User-Agent header.
        """

        def __init__(self, crawler):
            # ``crawler`` is accepted to mirror ``from_crawler`` but is not used.
            super().__init__()
            self.ua = UserAgent()

        @classmethod
        def from_crawler(cls, crawler):
            """Build the middleware for the given crawler."""
            middleware = cls(crawler)
            return middleware

        def process_request(self, request, spider):
            """Set a random User-Agent unless the request already has one."""
            user_agent = self.ua.random
            request.headers.setdefault("User-Agent", user_agent)
    # ...
    # settings.py: enable RandomUserAgentMiddlware (the number is its order value).
    DOWNLOADER_MIDDLEWARES = {
        'yourProjectName.middlewares.RandomUserAgentMiddlware': 312,
    }
    

    设置代理

    # 这里使用的是阿布云隧道代理
    # middleware.py
    import base64
    
    # Proxy tunnel endpoint (Abuyun tunnel proxy).
    proxyServer = "http://http-dyn.abuyun.com:9020"

    # Tunnel credentials; replace with the values issued for your account.
    proxyUser = "your proxyUser"
    proxyPass = "your proxyPass"


    def _basic_auth_header(user, password):
        """Return the HTTP Basic credentials header value for *user*/*password*.

        Basic authentication uses the standard Base64 alphabet ("+" and "/"),
        not the URL-safe one ("-" and "_"), so ``b64encode`` is used; the
        URL-safe variant yields a different, invalid header whenever the
        encoded credentials contain one of those characters.
        """
        credentials = f"{user}:{password}".encode("ascii")
        return "Basic " + base64.b64encode(credentials).decode("ascii")


    # Value sent in the Proxy-Authorization header of every proxied request.
    proxyAuth = _basic_auth_header(proxyUser, proxyPass)
    class ProxyMiddleware:
        """Downloader middleware that routes every request through the proxy tunnel."""

        def process_request(self, request, spider):
            """Attach the proxy credentials and the proxy endpoint to *request*."""
            # The two assignments are independent of each other.
            request.headers["Proxy-Authorization"] = proxyAuth
            request.meta["proxy"] = proxyServer
    
    # ...
    # settings.py: enable ProxyMiddleware (the number is its order value).
    DOWNLOADER_MIDDLEWARES = {
         'yourProjectName.middlewares.ProxyMiddleware': 100,
    }
    
    

    图片下载

    # pipelines.py
    import re

    from scrapy import Request
    from scrapy.pipelines.images import ImagesPipeline
    class ImagePipeline(ImagesPipeline):
        """Images pipeline that sends a Referer and names files ``<foldername>/<imgname>.<ext>``."""

        # Extra request headers; fill these in if the target site checks headers.
        # This must be a dict: a brace pair holding only a string is a set and
        # does not support item assignment.
        headers = {}

        # Characters that are not allowed in Windows file names. The full-width
        # quote from the original pattern is kept so nothing that used to be
        # stripped is now let through.
        _INVALID_NAME_CHARS = re.compile(r'[\\/:*?"“<>|]')

        def get_media_requests(self, item, info):
            """Yield one download request per image URL of *item*.

            ``item['imgurl']`` may be a collection of URLs or a single URL string.
            Every request carries the class-level headers plus a Referer taken
            from ``item['from_url']``; ``foldername`` and ``imgname`` are passed
            through ``meta`` so that ``file_path`` can build the file name.
            """
            image_urls = item['imgurl']
            if isinstance(image_urls, str):
                # A bare string would otherwise be iterated character by character.
                image_urls = [image_urls]

            # NOTE(review): all requests of one item share the same imgname, so
            # several images in one item would map to the same path - confirm
            # against the spider that fills the item.
            for image_url in image_urls:
                # Build a fresh dict per request so the shared class-level
                # ``headers`` is never mutated.
                headers = dict(self.headers)
                headers['Referer'] = item['from_url']
                meta = {'foldername': item['foldername'], 'imgname': item['imgname']}
                yield Request(image_url, headers=headers, meta=meta)

        def file_path(self, request, response=None, info=None, *, item=None):
            """Return the relative storage path ``<foldername>/<imgname>.<ext>``.

            Without this override the stored file name is a hash. The extension
            is the text after the last dot of the URL with any query string or
            fragment removed. ``item`` is accepted (and ignored) because newer
            Scrapy versions pass it as a keyword argument.
            """
            # Drop "?query" / "#fragment" so they do not end up in the extension.
            url_without_query = request.url.split('?', 1)[0].split('#', 1)[0]
            pic_format = url_without_query.split('.')[-1]
            imgname = request.meta['imgname']
            # Folder name received through meta, stripped of invalid characters.
            foldername = self._INVALID_NAME_CHARS.sub('', request.meta['foldername'])
            return f'{foldername}/{imgname}.{pic_format}'
       
    # ...
    # settings.py: enable ImagePipeline. Item pipelines are registered in
    # ITEM_PIPELINES, not DOWNLOADER_MIDDLEWARES. If settings.py already defines
    # ITEM_PIPELINES, add this entry to that dict instead of redefining it.
    # An ImagesPipeline subclass also needs IMAGES_STORE to be set to the
    # download directory, e.g. IMAGES_STORE = 'images'.
    ITEM_PIPELINES = {
        'yourProjectName.pipelines.ImagePipeline': 200,
    }
    

    异步写入MySQL

    # pipelines.py
    from yourProjectName.settings import MYSQL_DBNAME, MYSQL_HOST,MYSQL_PASSWORD,MYSQL_PORT,MYSQL_USER
    
    
    class MysqlTwistedPipeline(object):
        """Item pipeline that writes items to MySQL asynchronously.

        Inserts run through a Twisted ``adbapi`` connection pool so the crawl
        is not blocked while the database is written.
        """

        def __init__(self, dbpool):
            self.dbpool = dbpool

        @classmethod
        def from_settings(cls, setting):
            """Create the pipeline with a pymysql-backed connection pool.

            The connection parameters come from the MYSQL_* constants imported
            from the project settings module; ``setting`` itself is not read.
            """
            # Imported here because this snippet's module header does not
            # import them.
            import pymysql.cursors
            from twisted.enterprise import adbapi

            dbparms = dict(
                host=MYSQL_HOST,
                db=MYSQL_DBNAME,
                user=MYSQL_USER,
                passwd=MYSQL_PASSWORD,
                port=MYSQL_PORT,
                charset='utf8',
                cursorclass=pymysql.cursors.DictCursor,
                use_unicode=False,
            )
            dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
            return cls(dbpool)

        def process_item(self, item, spider):
            """Schedule the insert for *item* and pass the item on unchanged."""
            query = self.dbpool.runInteraction(self.do_insert, item)
            # Errback, not callback: the handler must fire on failure only.
            query.addErrback(self.handle_error)
            # Returning the item keeps it available to later pipelines.
            return item

        def handle_error(self, failure):
            """Report a failed insert."""
            print("MysqlTwistedPipeline error is :",failure)

        def do_insert(self, cursor, item):
            """Run the insert statement for *item* on the given cursor.

            ``insert_sql`` and the parameter tuple are placeholders to fill in.
            Prefer passing item fields as query parameters; the original note
            says some cases need ``pymysql.escape_string()`` on item fields.
            """
            insert_sql = """insert_sql"""
            cursor.execute(insert_sql, ())
    
    # ...
    # settings.py: enable MysqlTwistedPipeline. Item pipelines are registered in
    # ITEM_PIPELINES, not DOWNLOADER_MIDDLEWARES. If settings.py already defines
    # ITEM_PIPELINES, add this entry to that dict instead of redefining it.
    ITEM_PIPELINES = {
        'yourProjectName.pipelines.MysqlTwistedPipeline': 200,
    }
    
  • 相关阅读:
    css中关于div中文本垂直居中的问题。
    点击弹出列表内容
    html+css+js实现滑动导航条(转载)
    在事件触发的时候,有时我们需要一些模拟用户行为的操作。例如:当网页加载完毕后 自行点击一个按钮触发一个事件,而不是用户去点击。
    活动倒计时案例
    javascript操作css实现弹出对话框
    PHP在不同页面之间传值的三种常见方式
    实现form表单提交到服务器,并且在将表单内容返回到该页面
    简单的百度预测搜索功能(php+jQuery+js+ajax)
    Eclipse4.6的一些基本操作及环境搭配
  • 原文地址:https://www.cnblogs.com/zhangxuel1ang/p/13174463.html
Copyright © 2011-2022 走看看