zoukankan      html  css  js  c++  java
  • scrapy 一些设置和问题

    scrapy设置ua池

    定义好中间件后,需要在 settings.py 中启用:

    # Downloader middlewares enabled for the project, mapped to their order
    # values. Every dotted path must name a class that actually exists in
    # laogou/middlewares.py, otherwise Scrapy fails while loading the setting.
    DOWNLOADER_MIDDLEWARES = {
        'laogou.middlewares.LaogouDownloaderMiddleware': 543,
        'laogou.middlewares.randomUserAgentMiddleware': 400,
        # The proxy-pool class is declared as randomHttpProxyMiddleware, so the
        # path has to use that exact name.
        'laogou.middlewares.randomHttpProxyMiddleware': 400,
    }
    import random

    from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


    class randomUserAgentMiddleware(UserAgentMiddleware):
        """Downloader middleware that picks a random User-Agent per request.

        A value is drawn from ``user_agent_list`` for every outgoing request
        and installed with ``setdefault``, so a User-Agent header that is
        already present on the request is left untouched.
        """

        # Pool of candidate User-Agent strings. Each entry must be followed by
        # a comma: adjacent string literals are concatenated by Python, which
        # would silently merge two entries into one invalid header value.
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        ]

        def __init__(self, user_agent=''):
            """Store ``user_agent``; the random pool is used for requests."""
            self.user_agent = user_agent

        def process_request(self, request, spider):
            """Set a randomly chosen User-Agent header on ``request``.

            Returns None implicitly, so the request continues through the
            remaining middlewares.
            """
            ua = random.choice(self.user_agent_list)
            if ua:
                # setdefault keeps a User-Agent the request already carries.
                request.headers.setdefault('User-Agent', ua)

    scrapy设置ip池

    import random

    from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware


    class randomHttpProxyMiddleware(HttpProxyMiddleware):
        """Downloader middleware that routes each request through a random proxy.

        A proxy URL is drawn from ``ip_list`` for every outgoing request and
        written to ``request.meta['proxy']``.
        """

        # Pool of candidate proxy URLs; one is drawn per request.
        ip_list = [
            'https://182.122.176.49:9999',
            'https://125.123.141.20:9999',
        ]

        def __init__(self, ip=''):
            """Store ``ip``; it is not consulted when choosing a proxy."""
            self.ip = ip

        def process_request(self, request, spider):
            """Assign a randomly chosen proxy from ``ip_list`` to ``request``.

            Returns None implicitly, so the request continues through the
            remaining middlewares.
            """
            ip = random.choice(self.ip_list)
            if ip:
                request.meta['proxy'] = ip

    scrapy 设置自定义cookie:class LaogouwangSpider(scrapy.Spider):    

       name = 'laogouwang'    
       # allowed_domains = ['www.laogou.com']

       # start_urls = ['http://www.laogou.com/'] def start_requests(self): url = 'https://www.lagou.com/' yield scrapy.Request(url=url,callback=self.parse,meta={'cookiejar':1}) def parse(self, response): print(response.request.headers.getlist('Cookie')) print(response.headers.getlist('Set-Cookie')) url = 'https://www.lagou.com/jobs/list_'+ str(settings.keys) +'?city='+ str(settings.cidy) +'&cl=false&fromSearch=true&labelWords=&suginput=' print(response.meta['cookiejar'])
    yield scrapy.Request(url=url,callback=self.download,meta={'cookiejar':response.meta['cookiejar'],'id':1},dont_filter=True)
       def download(self, response):
        # print(response.text)
    print(response.request.headers.getlist('Cookie'))
    print(response.headers.getlist('Set-Cookie'))
    i = response.meta.get('id')
    file = 'false'
    if i == 1:
    file = 'true'
    data = {
    "first":file,
    "pn":str(i),
    "kd":str(settings.keys)
    }
    headers_post = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Content-Length': str(len(urllib.parse.urlencode(data))),
    'Connection': 'keep-alive',
    'Referer':str(response.url),
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
    }
    print(headers_post)
    print(str(response.url))
    print(data)
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

    yield scrapy.FormRequest(url=url,formdata=data,headers=headers_post,callback=self.files,dont_filter=True,meta={'cookiejar':True,'dont_redirect': True,'handle_httpstatus_list': [301,302]})
    meta={'cookiejar':1} 用于开启 cookie 记录(指定一个 cookie 会话),在后面的请求中使用 'cookiejar':response.meta['cookiejar'] 可以沿用并更新同一份 cookie。
    注意,需要在 settings.py 中设置 COOKIES_ENABLED = True

    获取请求cookies是response.request.headers.getlist('Cookie'),响应cookies是response.headers.getlist('Set-Cookie')。
    dont_filter=True 用于禁止对该请求去重(跳过重复请求过滤),它并不会禁止重定向。
    在meta里使用'dont_redirect': True,'handle_httpstatus_list': [301,302]可以在当前scrapy请求里禁用重定向。

    scrapy 使用日志
    import datetime
    import os

    # Timestamp used in the log file name. %d (day of month) is included so
    # that runs on different days of the same month get distinct file names.
    time = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    # NOTE(review): the "logs" directory is not created here; presumably it
    # must already exist when the crawl starts. Confirm.
    LOG_FILE = 'logs' + os.sep + str(time) + '_' + "laogou.log"
    LOG_LEVEL = "DEBUG"
    # Python's boolean literal is True; lowercase `true` raises NameError.
    LOG_STDOUT = True

    scrapy提供五种日志级别。

    1.CRITICAL -- 关键错误
    2.ERROR -- 一般级别的错误
    3.WARNING -- 警告信息
    4.INFO -- 信息消息的日志(建议生产模式使用)
    5.DEBUG -- 调试消息的日志(建议开发模式)
    LOG_FILE 用于日志输出记录的文件名 默认None
    LOG_LEVEL 要记录的最低级别 默认DEBUG
    LOG_STDOUT 如果为 True,则进程的所有标准输出和错误都重定向到日志,例如 print() 的输出;默认 False

    使用文件启动spider
    #laogoustrart.py

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from laogou.spiders.laogouwang import LaogouwangSpider

    # Run the spider from a plain Python script, using the project's settings.
    # The guard keeps the crawl from starting if this file is merely imported.
    if __name__ == '__main__':
        process = CrawlerProcess(get_project_settings())
        process.crawl(LaogouwangSpider)
        process.start()
     




     
  • 相关阅读:
    redis中save和bgsave区别
    scrapy生成json中文为ASCII码解决
    mysql数据库,创建只读用户
    memcached命令行、Memcached数据导出和导入
    Memcache 查看列出所有key方法
    Elasticsearch5.x 引擎健康情况
    docker容器创建MariaDB镜像
    大文本数据排序
    换行符 和回车符
    索引与文本文件
  • 原文地址:https://www.cnblogs.com/dayouzi/p/10390873.html
Copyright © 2011-2022 走看看