第一步: 通过 ProxyBroker 获取代理
# Proxy-pool-gather.py
# Step 1: collect free proxies with ProxyBroker and store them in Redis.
import asyncio
import datetime
import logging
import redis
from proxybroker import Broker

# Shared Redis connection; decode_responses makes all values plain str.
r=redis.Redis(host='localhost',encoding="UTF-8",decode_responses=True)
expire_time_s=60*60*24  # keys expire after one day so dead proxies age out
async def save(proxies):
    """Drain proxies from the queue and persist HTTP high-anonymity ones in Redis.

    The Broker pushes a ``None`` sentinel when it is done; we stop there.
    Each stored key is the proxy URL, with a value of 0 and a one-day TTL.
    """
    while True:
        found = await proxies.get()
        if found is None:
            # End-of-stream sentinel from the Broker.
            break
        if "HTTP" not in found.types:
            continue
        # Keep only high-anonymity (elite) HTTP proxies.
        if found.types["HTTP"] == "High":
            print(found)
            key = "%s://%s:%d" % ("http", found.host, found.port)
            r.set(key, 0, ex=expire_time_s)
# Run the broker forever: each pass grabs proxies for up to an hour
# (grab_timeout=3600), then a fresh Broker/queue pair is created.
while True:
    proxy_queue = asyncio.Queue()
    broker = Broker(proxy_queue, timeout=2, max_tries=2, grab_timeout=3600)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(
        asyncio.gather(broker.find(types=["HTTP", "HTTPS"]), save(proxy_queue))
    )
第二步: HTTP服务器展示代理列表: http://0.0.0.0:8000/proxy.json
# Proxy-http-server.py
# Step 2: serve the stored proxy list at http://0.0.0.0:8000/proxy.json
from flask import Flask
from flask_restful import Resource,Api
import redis

app=Flask(__name__)
api=Api(app)
# Same Redis instance the gatherer writes into; every key is a proxy URL.
r=redis.Redis(host="localhost",encoding="UTF-8",decode_responses=True)
class Proxy(Resource):
    """REST resource returning every stored proxy URL as a JSON array."""

    def get(self):
        # Use SCAN instead of KEYS: KEYS blocks Redis while it walks the
        # whole keyspace, which would stall the gatherer writing into the
        # same instance. scan_iter streams keys without blocking.
        return list(r.scan_iter("*"))
# Expose the proxy list at /proxy.json.
api.add_resource(Proxy,"/proxy.json")
if __name__ == "__main__":
    # Listen on all interfaces so other hosts can fetch the list.
    app.run(host="0.0.0.0",port=8000)
第三步: 结果测试
#Demo.py
import time
import json
import requests
class test_proxy(object):
    """Fetch proxies from the pool server and verify them against a live site.

    Pulls the proxy list from the Flask server (step 2), then tries each
    proxy against https://www.example.com, printing the page on success.
    """

    def __init__(self):
        # URL of the proxy-list server started in step 2.
        self.pro_url = "http://0.0.0.0:8000/proxy.json"
        self.test_url = "http://httpbin.org/get"
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}

    def get_proxy(self):
        """Yield one requests-style proxies dict per pooled proxy URL.

        BUG FIX: the original passed ``self.headers`` positionally, which
        binds it to ``params`` (the second positional parameter of
        ``requests.get``) instead of ``headers``.
        """
        res = requests.get(self.pro_url, headers=self.headers).content.decode()
        proxy_list = json.loads(res)
        for p in proxy_list:
            host_port = str(p.split("://")[1])
            # BUG FIX: map both schemes. The original dict only had an
            # "http" entry, so the https:// test URL bypassed the proxy
            # entirely and the "test" always hit the site directly.
            yield {"http": host_port, "https": host_port}

    def delete_proxy(self, proxy):
        """Ask the pool server to drop a dead proxy."""
        requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))

    def get_html(self, proxies):
        """Fetch a page through ``proxies``.

        Returns the Response on success, or None after 5 failed attempts
        (in which case the proxy is reported dead via delete_proxy).
        """
        retry_count = 5
        while retry_count > 0:
            try:
                # timeout added so a dead proxy fails fast instead of hanging.
                return requests.get('https://www.example.com',
                                    proxies=proxies, timeout=10)
            except requests.RequestException:
                retry_count -= 1
        # Failed 5 times: remove the proxy from the pool, as the original
        # comment intended but never actually did.
        self.delete_proxy(proxies.get("http"))
        return None

    def start(self):
        for p in self.get_proxy():
            print(p)
            result = self.get_html(p)
            # BUG FIX: get_html returns None on exhaustion; the original
            # dereferenced result.status_code unconditionally and crashed.
            if result is not None and result.status_code == 200:
                print(result.text)
if __name__=='__main__':
    # Run the proxy smoke test when executed directly.
    test_proxy().start()
增加国内代理
class Kuaidaili(Provider):
    """ProxyBroker provider scraping kuaidaili.com's free proxy lists."""

    domain = "kuaidaili.com"

    async def _pipe(self):
        # Pages 1-20 of the domestic ("inha") list followed by pages 1-20
        # of the foreign ("intr") list — same order as listing each range
        # separately.
        pages = [
            "http://www.kuaidaili.com/free/%s/%d" % (section, page)
            for section in ("inha", "intr")
            for page in range(1, 21)
        ]
        await self._find_on_pages(pages)
# Register the extra provider with ProxyBroker.
PROVIDERS=[
    Kuaidaili(),
]