zoukankan      html  css  js  c++  java
  • [未完]构建代理池

    第一步: 通过 ProxyBroker 获取代理

    #Proxy-pool-gather.py
    
    import asyncio
    import datetime
    import logging
    import redis
    from proxybroker import Broker
    
    # Shared Redis client for the local server; decode_responses=True is
    # requested so replies are returned as str rather than bytes.
    r = redis.Redis(host="localhost", encoding="UTF-8", decode_responses=True)

    # Time-to-live, in seconds, applied to every stored proxy key (one day).
    expire_time_s = 24 * 60 * 60
    
    async def save(proxies):
        """Drain the proxy queue and store high-anonymity HTTP proxies in Redis.

        Items are read from ``proxies`` until a ``None`` sentinel arrives.
        A proxy is kept only when its ``types`` mapping has an ``"HTTP"``
        entry equal to ``"High"``; it is then printed and written to Redis
        under the key ``http://<host>:<port>`` with value ``0`` and a TTL of
        ``expire_time_s`` seconds. Every other proxy is skipped.
        """
        while True:
            candidate = await proxies.get()
            if candidate is None:
                # Sentinel: nothing more will be put on the queue.
                return

            kinds = candidate.types
            is_high_anon_http = "HTTP" in kinds and kinds["HTTP"] == "High"
            if not is_high_anon_http:
                continue

            print(candidate)
            key = '%s://%s:%d' % ("http", candidate.host, candidate.port)
            r.set(key, 0, ex=expire_time_s)
    
    # Run discovery rounds forever; every round uses a fresh queue and Broker.
    while True:
        proxies = asyncio.Queue()
        broker = Broker(proxies, timeout=2, max_tries=2, grab_timeout=3600)
        loop = asyncio.get_event_loop()
        # broker.find() fills the queue while save() consumes it; the round
        # ends once both coroutines have finished.
        tasks = asyncio.gather(
            broker.find(types=["HTTP", "HTTPS"]),
            save(proxies),
        )
        loop.run_until_complete(tasks)
    
    

    第二步: 启动 HTTP 服务器展示代理列表 (服务监听 0.0.0.0:8000, 本机可通过 http://127.0.0.1:8000/proxy.json 访问)

    #Proxy-http-server.py
    
    from flask import Flask
    from flask_restful import Resource,Api
    import redis
    
    # Flask application with a flask_restful Api wrapped around it.
    app = Flask(__name__)
    api = Api(app)

    # Redis client for the local server; decode_responses=True is requested
    # so replies are returned as str rather than bytes.
    r = redis.Redis(host="localhost", encoding="UTF-8", decode_responses=True)
    
    class Proxy(Resource):
        """REST resource that lists every key currently stored in Redis."""

        def get(self):
            """Return all Redis keys as a sorted list of strings.

            The keys are collected with incremental SCAN rather than KEYS so
            that a large keyspace does not block the Redis server for the
            whole call. SCAN may report the same key more than once, so the
            results are de-duplicated before being sorted.
            """
            return sorted(set(r.scan_iter(match="*")))
    
    # Serve the Proxy resource at /proxy.json.
    api.add_resource(Proxy, "/proxy.json")

    if __name__ == "__main__":
        # Listen on all interfaces, port 8000.
        app.run(port=8000, host="0.0.0.0")
    
    

    第三步: 结果测试

    #Demo.py
    
    import time
    import json
    import requests
    
    class test_proxy(object):
        """Fetch proxies from the local pool server and try a request through each."""

        def __init__(self):
            # 127.0.0.1 instead of 0.0.0.0: the latter is a bind address and
            # is not a valid connection target on every platform.
            self.pro_url="http://127.0.0.1:8000/proxy.json"
            self.test_url="http://httpbin.org/get"
            self.headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}

        def get_proxy(self):
            """Yield one requests-style ``proxies`` mapping per pool entry.

            The pool server is expected to answer with a JSON list of strings
            shaped like ``scheme://host:port``. Each yielded mapping has the
            form ``{"http": "host:port"}``. Entries without a ``://``
            separator are skipped instead of raising ``IndexError``.
            """
            # headers must be passed by keyword: the second positional
            # parameter of requests.get() is ``params``, not ``headers``.
            response = requests.get(self.pro_url, headers=self.headers, timeout=10)
            proxy_list = json.loads(response.content.decode())
            for entry in proxy_list:
                _scheme, separator, address = entry.partition("://")
                if not separator:
                    continue
                yield {"http": str(address)}

        def delete_proxy(self,proxy):
            """Send a delete request for ``proxy`` to the service on port 5010.

            NOTE(review): this endpoint (127.0.0.1:5010) is not the pool
            server addressed by ``pro_url``; confirm that such a service is
            actually running before relying on this method.
            """
            requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))

        def get_html(self, proxies, retry_count=5, timeout=10):
            """Fetch ``self.test_url`` through ``proxies``, retrying on failure.

            The target is the plain-HTTP ``test_url`` because ``proxies`` only
            maps the ``"http"`` scheme; an HTTPS target would bypass the proxy.

            Args:
                proxies: mapping passed to ``requests.get(proxies=...)``.
                retry_count: maximum number of attempts.
                timeout: per-attempt timeout in seconds, so that a dead proxy
                    cannot block forever.

            Returns:
                The response of the first successful attempt, or ``None`` when
                every attempt raised.
            """
            while retry_count > 0:
                try:
                    return requests.get(self.test_url, proxies=proxies, timeout=timeout)
                except Exception:
                    # Best effort: any failure counts as one failed attempt.
                    retry_count -= 1
            # All attempts failed; the proxy is NOT removed from the pool here.
            return None

        def start(self):
            """Print each proxy and, when the request succeeds, the response body."""
            for p in self.get_proxy():
                print(p)
                result=self.get_html(p)
                # get_html() returns None when the proxy never answered.
                if result is not None and result.status_code==200:
                    print(result.text)
    
    # Script entry point: run the proxy check over the whole pool.
    if __name__ == "__main__":
        checker = test_proxy()
        checker.start()
    
    

    增加国内代理: 在 ProxyBroker 的 providers.py 中新增一个 Provider 子类, 并把它的实例加入 PROVIDERS 列表

    class Kuaidaili(Provider):
        """Provider for the free proxy listings on kuaidaili.com."""

        domain = "kuaidaili.com"

        async def _pipe(self):
            """Scan pages 1-20 of the ``inha`` listing, then pages 1-20 of ``intr``."""
            page_numbers = range(1, 21)
            inha_pages = ["http://www.kuaidaili.com/free/inha/%d" % page for page in page_numbers]
            intr_pages = ["http://www.kuaidaili.com/free/intr/%d" % page for page in page_numbers]
            await self._find_on_pages(inha_pages + intr_pages)
    
    # Provider instances to use; only the Kuaidaili provider is listed.
    PROVIDERS = [Kuaidaili()]
    
    
  • 相关阅读:
    json转换字符串
    windows下Xshell远程访问虚拟机
    win7去箭头指令
    n核CPU为什么计算速度达不到单核n倍
    vim字符串的替换
    转发的别人的vim编码和终端编码的设置
    音频操作
    scanf函数
    文字常量区和栈区区别
    Linux 进程
  • 原文地址:https://www.cnblogs.com/hankleo/p/11747180.html
Copyright © 2011-2022 走看看