  • Scrapy: building an IP proxy pool for your crawler so scraping data goes smoothly

    More and more people are using crawlers in their work, and websites' anti-crawling mechanisms keep getting stricter, so below we build our own proxy IP pool.

    1. Manually updating the IP pool

    1.1 Add an IP pool to the settings file (settings.py)

    IPPOOL = [
        {"ipaddr": "61.129.70.131:8080"},
        {"ipaddr": "61.152.81.193:9100"},
        {"ipaddr": "120.204.85.29:3128"},
        {"ipaddr": "219.228.126.86:8123"},
        {"ipaddr": "61.152.81.193:9100"},
        {"ipaddr": "218.82.33.225:53853"},
        {"ipaddr": "223.167.190.17:42789"}
    ]

    1.2 Modify middlewares.py

    import random
    from scrapy import signals
    from youx.settings import IPPOOL


    class MyproxiesSpiderMiddleware(object):

        def __init__(self, ip=''):
            self.ip = ip

        def process_request(self, request, spider):
            # Pick a random entry from the pool and attach it to the outgoing request.
            thisip = random.choice(IPPOOL)
            print("this is ip:" + thisip["ipaddr"])
            request.meta["proxy"] = "http://" + thisip["ipaddr"]
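
    As a variation (my own sketch, not part of the original post), the middleware can read IPPOOL through crawler.settings in from_crawler instead of importing settings directly, which avoids the hard-coded youx.settings import:

    import random


    class MyproxiesSpiderMiddleware(object):
        """Variant that pulls IPPOOL from crawler settings rather than importing settings.py."""

        def __init__(self, ippool):
            self.ippool = ippool

        @classmethod
        def from_crawler(cls, crawler):
            # Scrapy calls this hook when it builds the middleware;
            # crawler.settings.get returns the IPPOOL list defined in settings.py.
            return cls(crawler.settings.get('IPPOOL', []))

        def process_request(self, request, spider):
            if self.ippool:
                thisip = random.choice(self.ippool)
                request.meta["proxy"] = "http://" + thisip["ipaddr"]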

    1.3 Configure DOWNLOADER_MIDDLEWARES in settings.py

    DOWNLOADER_MIDDLEWARES = {
        # 'youx.middlewares.MyCustomDownloaderMiddleware': 543,
        # disable the built-in proxy middleware (modern path; scrapy.contrib.* is the deprecated pre-1.0 location)
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
        'youx.middlewares.MyproxiesSpiderMiddleware': 125,
    }
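
    To sanity-check that requests really go out through the pool, one quick option (illustrative only, not from the original post; the spider name is made up) is a throwaway spider that asks an IP-echo service which address it sees:

    import scrapy


    class ProxyCheckSpider(scrapy.Spider):
        # Hypothetical throwaway spider, used only to confirm the proxy middleware is applied.
        name = "proxy_check"
        start_urls = ["https://httpbin.org/ip"]

        def parse(self, response):
            # httpbin echoes the origin IP it saw; it should match one of the pool entries.
            self.logger.info("proxy used: %s, origin seen by server: %s",
                             response.request.meta.get("proxy"), response.text)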

    2. Adding the IP pool directly in middlewares.py

    2.1 Code in the middlewares file

    import base64
    import random

    from scrapy import signals

    PROXIES = [
        {'ip_port': '61.160.233.8', 'user_pass': ''},
        {'ip_port': '125.93.149.186', 'user_pass': ''},
        {'ip_port': '58.38.86.181', 'user_pass': ''},
        {'ip_port': '119.142.86.110', 'user_pass': ''},
        {'ip_port': '124.161.16.89', 'user_pass': ''},
        {'ip_port': '61.160.233.8', 'user_pass': ''},
        {'ip_port': '101.94.131.237', 'user_pass': ''},
        {'ip_port': '219.157.162.97', 'user_pass': ''},
        {'ip_port': '61.152.89.18', 'user_pass': ''},
        {'ip_port': '139.224.132.192', 'user_pass': ''}
    ]
    
    class ProxyMiddleware(object):
        def process_request(self, request, spider):
            proxy = random.choice(PROXIES)
            request.meta['proxy'] = "http://%s" % proxy['ip_port']
            if proxy['user_pass']:
                # Proxies that require authentication expect a Basic auth header.
                # base64.encodestring was removed in Python 3; b64encode on bytes replaces it.
                encoded_user_pass = base64.b64encode(proxy['user_pass'].encode()).decode()
                request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
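
    If one of the proxies does require credentials, the entry carries them in user_pass as "user:password". The address and credentials below are invented purely for illustration:

    PROXIES = [
        # Hypothetical authenticated proxy: 'user_pass' holds "user:password".
        {'ip_port': '1.2.3.4:8080', 'user_pass': 'myuser:mypassword'},
        # Open proxy: an empty 'user_pass' skips the Proxy-Authorization header.
        {'ip_port': '61.160.233.8', 'user_pass': ''},
    ]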

    2.2 Code in settings.py

    DOWNLOADER_MIDDLEWARES = {
        # 'youx.middlewares.MyCustomDownloaderMiddleware': 543,
        'youx.middlewares.ProxyMiddleware': 700,
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    }

    3. Automatically updating the IP pool

    3.1 Write a class that fetches IPs automatically, proxies.py; run it once to save the fetched IPs to a txt file:

    # *-* coding:utf-8 *-*
    import json
    import random
    import time
    from multiprocessing import Process, Queue

    import requests
    import lxml  # the 'lxml' parser used by BeautifulSoup below must be installed
    from bs4 import BeautifulSoup


    class Proxies(object):
        """Scrape free proxies from xicidaili.com and keep only the ones that still respond."""

        def __init__(self, page=3):
            self.proxies = []
            self.verify_pro = []
            self.page = page
            self.headers = {
                'Accept': '*/*',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8'
            }
            self.get_proxies()
            self.get_proxies_nn()

        def get_proxies(self):
            # the "nt" pages list ordinary HTTP proxies
            page = random.randint(1, 10)
            page_stop = page + self.page
            while page < page_stop:
                url = 'http://www.xicidaili.com/nt/%d' % page
                html = requests.get(url, headers=self.headers).content
                soup = BeautifulSoup(html, 'lxml')
                ip_list = soup.find(id='ip_list')
                for odd in ip_list.find_all(class_='odd'):
                    protocol = odd.find_all('td')[5].get_text().lower() + '://'
                    self.proxies.append(protocol + ':'.join([x.get_text() for x in odd.find_all('td')[1:3]]))
                page += 1

        def get_proxies_nn(self):
            # the "nn" pages list high-anonymity proxies
            page = random.randint(1, 10)
            page_stop = page + self.page
            while page < page_stop:
                url = 'http://www.xicidaili.com/nn/%d' % page
                html = requests.get(url, headers=self.headers).content
                soup = BeautifulSoup(html, 'lxml')
                ip_list = soup.find(id='ip_list')
                for odd in ip_list.find_all(class_='odd'):
                    protocol = odd.find_all('td')[5].get_text().lower() + '://'
                    self.proxies.append(protocol + ':'.join([x.get_text() for x in odd.find_all('td')[1:3]]))
                page += 1

        def verify_proxies(self):
            # proxies that have not been verified yet
            old_queue = Queue()
            # proxies that passed verification
            new_queue = Queue()
            print('verify proxy........')
            works = []
            for _ in range(15):
                works.append(Process(target=self.verify_one_proxy, args=(old_queue, new_queue)))
            for work in works:
                work.start()
            for proxy in self.proxies:
                old_queue.put(proxy)
            for work in works:
                # one sentinel per worker tells it to stop
                old_queue.put(0)
            for work in works:
                work.join()
            self.proxies = []
            while 1:
                try:
                    self.proxies.append(new_queue.get(timeout=1))
                except:
                    break
            print('verify_proxies done!')

        def verify_one_proxy(self, old_queue, new_queue):
            while 1:
                proxy = old_queue.get()
                if proxy == 0:
                    break
                protocol = 'https' if 'https' in proxy else 'http'
                proxies = {protocol: proxy}
                try:
                    if requests.get('http://www.baidu.com', proxies=proxies, timeout=2).status_code == 200:
                        print('success %s' % proxy)
                        new_queue.put(proxy)
                except:
                    print('fail %s' % proxy)


    if __name__ == '__main__':
        a = Proxies()
        a.verify_proxies()
        print(a.proxies)
        proxie = a.proxies
        with open('proxies.txt', 'a') as f:
            for proxy in proxie:
                f.write(proxy + '\n')
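
    The post stops at writing proxies.txt, so here is one possible way (a sketch I am adding, not from the original author; the class and file names are assumptions) to feed that file back into Scrapy: a downloader middleware that loads the file once and picks a random line per request.

    import random


    class FileProxyMiddleware(object):
        """Hypothetical middleware: serve random proxies from the proxies.txt produced above."""

        def __init__(self, path='proxies.txt'):
            with open(path) as f:
                # each line already looks like "http://1.2.3.4:8080"
                self.proxies = [line.strip() for line in f if line.strip()]

        def process_request(self, request, spider):
            if self.proxies:
                request.meta['proxy'] = random.choice(self.proxies)

    It would then be enabled in DOWNLOADER_MIDDLEWARES the same way as the middlewares above, e.g. 'youx.middlewares.FileProxyMiddleware': 125.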
  • Original post: https://www.cnblogs.com/jiguangdongtaiip/p/13518923.html