zoukankan      html  css  js  c++  java
  • Flask开发系列之Flask+redis实现IP代理池

    Flask开发系列之Flask+redis实现IP代理池

    代理池的要求

    • 多站抓取,异步检测:多站抓取指的是我们需要从各大免费的IP代理网站,把它们公开的免费代理抓取下来;异步检测指的是把这些代理通过异步请求的方式去请求目标网站:如果能正常请求就证明代理可用,如果不能正常请求就证明代理不可用,这时就可以把这个代理剔除掉。异步的意思是:我们不需要一直阻塞等待代理请求网站的结果,等得到response之后再执行相应的操作就可以了,因此异步可以提高检测效率。

    • 定时筛选,持续更新:我们维护一个代理池,并且需要定时从里面拿出一部分代理来检测,剔除掉不可用的代理,这样可以保证池中的代理是可用的。

    • 提供接口,易于提取:代理实际上是维护在一个队列中,队列可以使用数据库存储,也可以使用一些数据结构来存储;但是如果要获取代理的话,需要提供一个简单的接口,最简单的是web形式的接口。本文主要演示利用python的flask包来提供接口,之后使用python请求对应的网址,就可以从返回的内容中拿到代理的信息了。

    代理池的架构

    • 获取器:从各大网站平台抓取代理:ip和端口

    • 过滤器:剔除掉不可用的代理

    • 将可用代理放到代理队列

    • 定时检测器:剔除不可用的代理

    • API:通过接口形式拿到代理对象,方便使用

    测试实现版

    import requests
    import re
    import time
    import redis
    from bloom_filter import BloomFilter
    import ast
    
    # Shared Redis connection pool; decode_responses=True makes replies come back as str.
    pool = redis.ConnectionPool(host='localhost',password='xxx', port=6379, decode_responses=True)
    # Redis client used by every function below to read and write the "ip_list" list.
    r = redis.Redis(connection_pool=pool)
    # In-process Bloom filter consulted by stone() to skip proxies it has already stored.
    # NOTE(review): error_rate=0.1 is presumably the target false-positive rate - confirm.
    bloombloom = BloomFilter(max_elements=10000, error_rate=0.1)
    # Pre-mark one proxy as seen, in the same str(dict) form that stone() checks and stores.
    bloombloom.add(str({'http': '117.91.232.53:9999'}))
    
    
    def get_ip(i):
        """Scrape one page of the kuaidaili free-proxy listing.

        Args:
            i: Zero-based page index; page ``i + 1`` of the site is requested.

        Returns:
            A list of ``"ip:port"`` strings found on the page (possibly empty).

        Raises:
            requests.RequestException: If the page cannot be fetched.
        """
        url = 'https://www.kuaidaili.com/free/inha/' + str(i + 1)
        # A timeout keeps one stalled connection from hanging the whole run.
        html = requests.get(url=url, timeout=10).text
        # Raw string with explicit \d and \. escapes: without the backslashes
        # the pattern looks for the literal letter "d" and never matches an IP.
        regip = r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>'
        # re.S lets ".*?" span the newlines between the IP cell and the port cell.
        ipstr = re.findall(regip, html, re.S)
        # Pause after each page request before the caller asks for the next one.
        time.sleep(1)
        ip_list = [host + ':' + port for host, port in ipstr]
        print('共收集到%d个代理ip' % len(ip_list))
        print(ip_list)
        return ip_list
    
    
    
    def valVer(proxys):
        """Probe freshly scraped proxies and return the ones that work.

        Each candidate is used to fetch http://www.baidu.com with a 2 second
        timeout. A 200 response counts as success; any other status or any
        exception counts as failure.

        Args:
            proxys: Iterable of ``"ip:port"`` strings.

        Returns:
            A list of ``{scheme: "ip:port"}`` dicts for the candidates that
            passed during this call.

        Side effects:
            Updates the module globals ``goodNum`` and ``badNum`` and appends
            every passing proxy to the module global ``good_list``.
        """
        global badNum, goodNum, good_list
        working = []
        for candidate in proxys:
            try:
                if 'https' in candidate:
                    scheme = 'https'
                else:
                    scheme = 'http'
                mapping = {scheme: candidate}
                print('现在正在测试的IP:', mapping)
                reply = requests.get('http://www.baidu.com', proxies=mapping, timeout=2)
                if reply.status_code == 200:
                    goodNum += 1
                    working.append(mapping)
                    good_list.append(mapping)
                    print(candidate, 'success proxy')
                else:
                    badNum += 1
                    print(candidate, 'bad proxy')
            except Exception as err:
                # Any failure (timeout, refused connection, bad input) marks
                # the candidate as bad and moves on to the next one.
                print(err)
                badNum += 1
        print('success proxy num : ', goodNum)
        print('bad proxy num : ', badNum)
        print("这次:", working)
        print("此时全部:", good_list)
        return working
    
    
    def time_valVer(proxys):
        """Re-check proxies already stored in Redis and drop the dead ones.

        Args:
            proxys: Iterable of raw "ip_list" entries, each the ``str()`` of a
                ``{scheme: "ip:port"}`` dict as written by ``stone``.

        Side effects:
            Entries that answer with a non-200 status, or whose request raises
            a ``requests`` error (timeout, refused connection, ...), are
            removed from the Redis list "ip_list". Working proxies are
            appended to the module global ``good_list``. Entries that cannot
            be parsed are reported and left untouched.
        """
        for raw in proxys:
            print('现在正在定时测试的IP:', raw)
            try:
                proxy = ast.literal_eval(raw)
            except (ValueError, SyntaxError, TypeError) as e:
                print(e)
                continue
            if not isinstance(proxy, dict):
                # requests needs a mapping for ``proxies``; skip anything else.
                print(raw, 'is not a proxy mapping')
                continue
            try:
                response = requests.get('http://www.baidu.com', proxies=proxy, timeout=2)
                alive = response.status_code == 200
            except requests.RequestException as e:
                # A dead proxy usually shows up as a timeout or connection
                # error, so an exception has to count as "bad" as well.
                print(e)
                alive = False
            if alive:
                good_list.append(proxy)
                print(proxy, 'success proxy')
            else:
                # Remove by the exact string stored in Redis (not the parsed
                # dict); redis-py 3+ takes lrem(name, count, value).
                r.lrem("ip_list", 1, raw)
                print(proxy, 'bad proxy')
    
    def stone(good):
        """Store newly validated proxies in Redis, skipping ones seen before.

        Args:
            good: Iterable of proxy mappings as returned by ``valVer``.

        Side effects:
            Each proxy not yet present in the module-level Bloom filter is
            added to it and pushed onto the tail of the Redis list "ip_list"
            in its ``str()`` form.
        """
        for IP in good:
            key = str(IP)
            if key in bloombloom:
                # Apply the format string; the original passed IP as a second
                # print argument and emitted a literal "%s".
                print("%s不能存储,有相同的IP" % key)
                continue
            print("存储的IP:", IP)
            bloombloom.add(key)
            r.rpush("ip_list", key)
    
    if __name__ == '__main__':
        # Module-level counters and the cumulative result list; valVer and
        # time_valVer read and update them as globals.
        badNum = 0
        goodNum = 0
        good_list = []
        for i in range(0, 10):
            if i % 10 == 0 and i != 0:
                # NOTE(review): with range(0, 10) this condition is never true,
                # so the periodic re-validation never runs in this version.
                # Fetch the whole list in one round trip instead of one LINDEX
                # per element, without reusing the outer loop variable.
                proxy_list = r.lrange("ip_list", 0, -1)
                time_valVer(proxy_list)
            else:
                ip_list = get_ip(i)
                good = valVer(ip_list)
                stone(good)
    from flask import Flask
    import redis   # 导入redis模块,通过python操作redis 也可以直接在redis主机的服务端操作缓存数据库
    
    
    # Redis client for the API process; decode_responses=True returns str instead of bytes.
    r = redis.Redis(host='localhost', port=6379,password='xxx',decode_responses=True)
    # Flask application that exposes the stored proxies over HTTP.
    app = Flask(__name__)
    @app.route('/ip/<int:index>')
    def reponse(index):
        """Return the proxy stored at position ``index`` of the Redis "ip_list".

        Args:
            index: Zero-based position in the list, taken from the URL.

        Returns:
            The stored proxy string, or a 404 response when the list has no
            element at that position.
        """
        print(index)
        # Read once so the logged value and the returned value always agree.
        proxy = r.lindex("ip_list", index)
        print(proxy)
        if proxy is None:
            # lindex yields None for an out-of-range index, and Flask rejects
            # a None return value with a server error; answer 404 instead.
            return 'no proxy at index %d' % index, 404
        return proxy
    if __name__ == '__main__':
        # Start Flask's built-in server. debug=True enables the reloader and the
        # interactive debugger, so this is suitable for local development only.
        app.run(debug=True)

     获取ip:

     改进版

    import requests
    import re
    import time
    import redis
    from bloom_filter import BloomFilter
    import ast
    
    
    # Shared Redis connection pool; decode_responses=True makes replies come back as str.
    pool = redis.ConnectionPool(host='localhost',password='XXX', port=6379, decode_responses=True)
    # Redis client used by every function below to read and write the "ip_list" list.
    r = redis.Redis(connection_pool=pool)
    # In-process Bloom filter of already-stored proxies; init() fills it from Redis at startup.
    # NOTE(review): error_rate=0.1 is presumably the target false-positive rate - confirm.
    bloombloom = BloomFilter(max_elements=10000, error_rate=0.1)
    
    
    def get_ip(i):
        """Scrape one page of the kuaidaili free-proxy listing.

        Args:
            i: Zero-based page index; page ``i + 1`` of the site is requested.

        Returns:
            A list of ``"ip:port"`` strings found on the page (possibly empty).

        Raises:
            requests.RequestException: If the page cannot be fetched.
        """
        url = 'https://www.kuaidaili.com/free/inha/' + str(i + 1)
        # A timeout keeps one stalled connection from hanging the whole run.
        html = requests.get(url=url, timeout=10).text
        # Raw string with explicit \d and \. escapes: without the backslashes
        # the pattern looks for the literal letter "d" and never matches an IP.
        regip = r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>'
        # re.S lets ".*?" span the newlines between the IP cell and the port cell.
        ipstr = re.findall(regip, html, re.S)
        # Pause after each page request before the caller asks for the next one.
        time.sleep(1)
        ip_list = [host + ':' + port for host, port in ipstr]
        print('共收集到%d个代理ip' % len(ip_list))
        print(ip_list)
        return ip_list
    
    
    def valVer(proxys):
        """Probe freshly scraped proxies and return the ones that work.

        Each candidate is used to fetch http://www.baidu.com with a 2 second
        timeout. A 200 response counts as success; any other status or any
        exception counts as failure.

        Args:
            proxys: Iterable of ``"ip:port"`` strings.

        Returns:
            A list of ``{scheme: "ip:port"}`` dicts for the candidates that
            passed during this call.

        Side effects:
            Updates the module globals ``goodNum`` and ``badNum`` and appends
            every passing proxy to the module global ``good_list``.
        """
        global badNum, goodNum, good_list
        working = []
        for candidate in proxys:
            try:
                if 'https' in candidate:
                    scheme = 'https'
                else:
                    scheme = 'http'
                mapping = {scheme: candidate}
                reply = requests.get('http://www.baidu.com', proxies=mapping, timeout=2)
                if reply.status_code == 200:
                    goodNum += 1
                    working.append(mapping)
                    good_list.append(mapping)
                else:
                    badNum += 1
            except Exception as err:
                # Any failure (timeout, refused connection, bad input) marks
                # the candidate as bad and moves on to the next one.
                print(err)
                badNum += 1
        print('success proxy num : ', goodNum)
        print('bad proxy num : ', badNum)
        print("这次:", working)
        print("此时全部:", good_list)
        return working
    
    
    def time_valVer(proxys):
        """Re-check proxies already stored in Redis and drop the dead ones.

        Args:
            proxys: Iterable of raw "ip_list" entries, each the ``str()`` of a
                ``{scheme: "ip:port"}`` dict as written by ``stone_redis``.

        Side effects:
            Entries that answer with a non-200 status, or whose request raises
            a ``requests`` error (timeout, refused connection, ...), are
            removed from the Redis list "ip_list". Entries that cannot be
            parsed are reported and left untouched.
        """
        for raw in proxys:
            print('现在正在定时测试的IP:', raw)
            try:
                proxy = ast.literal_eval(raw)
            except (ValueError, SyntaxError, TypeError) as e:
                print(e)
                continue
            if not isinstance(proxy, dict):
                # requests needs a mapping for ``proxies``; skip anything else.
                print(raw, 'is not a proxy mapping')
                continue
            try:
                response = requests.get('http://www.baidu.com', proxies=proxy, timeout=2)
                alive = response.status_code == 200
            except requests.RequestException as e:
                # A dead proxy usually shows up as a timeout or connection
                # error, so an exception has to count as "bad" as well.
                print(e)
                alive = False
            if not alive:
                # Remove by the exact string stored in Redis (not the parsed
                # dict); redis-py 3+ takes lrem(name, count, value).
                r.lrem("ip_list", 1, raw)
    
    def stone_redis(good):
        """Store newly validated proxies in Redis, skipping ones seen before.

        Args:
            good: Iterable of proxy mappings as returned by ``valVer``.

        Side effects:
            Each proxy not yet present in the module-level Bloom filter is
            added to it and pushed onto the tail of the Redis list "ip_list"
            in its ``str()`` form.
        """
        for IP in good:
            key = str(IP)
            if key in bloombloom:
                # Apply the format string; the original passed IP as a second
                # print argument and emitted a literal "%s".
                print("%s不能存储,有相同的IP" % key)
                continue
            print("存储的IP:", IP)
            bloombloom.add(key)
            r.rpush("ip_list", key)
    
    def init():
        """Seed the Bloom filter with every proxy already stored in Redis.

        Entries are printed and added in their stored string form, which is
        the same ``str(dict)`` form that ``stone_redis`` checks against.
        """
        # One LRANGE round trip replaces LLEN plus two LINDEX calls per element.
        for entry in r.lrange("ip_list", 0, -1):
            print(entry)
            bloombloom.add(entry)
    
    
    if __name__ == '__main__':
        # Module-level counters and the cumulative result list; valVer reads
        # and updates them as globals.
        badNum = 0
        goodNum = 0
        good_list = []
        # Load proxies stored by earlier runs into the Bloom filter first.
        init()
        for i in range(0, 10):
            if i % 2 == 0 and i != 0:
                # Passes 2, 4, 6 and 8 re-validate the stored proxies instead
                # of scraping, so page i + 1 is not fetched on those passes.
                # The list is read in one round trip, without reusing the
                # outer loop variable.
                proxy_list = r.lrange("ip_list", 0, -1)
                time_valVer(proxy_list)
            else:
                ip_list = get_ip(i)
                good = valVer(ip_list)
                stone_redis(good)
    from flask import Flask, abort, request, jsonify
    import redis   # 导入redis模块,通过python操作redis 也可以直接在redis主机的服务端操作缓存数据库
    
    # Redis client for the API process; decode_responses=True returns str instead of bytes.
    r = redis.Redis(host='localhost', port=6379,password='XXX',decode_responses=True)
    # Flask application that exposes the stored proxies over HTTP.
    app = Flask(__name__)
    @app.route('/ip/<int:index>', methods=['GET'])
    def reponse(index):
        """Return the proxy at position ``index`` of the Redis "ip_list" as JSON.

        Args:
            index: Zero-based position in the list, taken from the URL.

        Returns:
            A JSON response of the form ``{"ip": <stored string>}``; the value
            is ``null`` when the list has no element at that position.
        """
        print(index)
        # Read once so the logged value and the returned value always agree,
        # even if the list changes between calls.
        proxy = r.lindex("ip_list", index)
        print(proxy)
        return jsonify({"ip": proxy})
    if __name__ == '__main__':
        # Start Flask's built-in server. debug=True enables the reloader and the
        # interactive debugger, so this is suitable for local development only.
        app.run(debug=True)

    获取ip:

  • 相关阅读:
    【XSY3309】Dreamweaver 高斯消元 拉格朗日插值
    【LUOGU???】WD与地图 整体二分 线段树合并
    【CSA49F】【XSY3317】card 博弈论 DP
    【CSA72G】【XSY3316】rectangle 线段树 最小生成树
    【CSA49G】【XSY3315】jump DP
    【集训队作业2018】【XSY3372】取石子 DP
    【LUOGU???】WD与数列 sam 启发式合并
    【LUOGU???】WD与积木 NTT
    【AGC030F】Permutation and Minimum DP
    【AGC030D】Inversion Sum DP
  • 原文地址:https://www.cnblogs.com/-wenli/p/11002902.html
Copyright © 2011-2022 走看看