在中间件middlewares.py中写入一个类,然后在settings.py中的DOWNLOADER_MIDDLEWARES = {}里开启这个中间件
具体代码是
import random

import requests
from scrapy.http import HtmlResponse
ip_pool = []
pro_addr = ''
class proxyMiddleware(object):
def process_request(self, request, spider):
global pro_addr,ip_pool
if "jdzgb" in spider.name:
while 1:
if len(ip_pool) < 3:
get_ip_url = "http://d.jghttp.golangapi.com/getipxxxxxx" #获取ip的url
ips = requests.get(get_ip_url).text.split('
')
for i in ips[:-1]:
ip_pool.append(i.strip())
break
else:
break
if not pro_addr:
pro_addr = random.choice(ip_pool)
while 1:
url = 'https://www.baidu.com'
proxies = {
"http": pro_addr,
}
try:
s = requests.session()
s.keep_alive = False # 关闭多余连接
response = s.get(url=url,proxies=proxies,timeout=4, verify=False)
code = response.status_code
# res = requests.get(url, proxies=proxies,timeout=4)
# code = res.status_code
except Exception as e:
print(e)
code = '0'
print(code,pro_addr)
# print(1, ip_pool)
if code == 200 or code == 304:
request.meta['proxy'] = "http://" + pro_addr
#pro_addr = random.choice(ip_pool) #这里的意思是每次访问的ip都不一样,如果把这里关闭,那么就是一个ip如果不过期,就会一直使用这个ip
break
else:
if pro_addr in ip_pool:
ip_pool.remove(pro_addr)
while 1:
if len(ip_pool) < 3:
get_ip_url = "http://d.jghttp.golangapi.com/getipxxxxxxx"#获取ip的url
ips = requests.get(get_ip_url).text.split('
')
for i in ips[:-1]:
ip_pool.append(i.strip())
break
else:
break
pro_addr = random.choice(ip_pool)