import telnetlib
import urllib.request
from bs4 import BeautifulSoup
# Version 1: scrape free proxies from xicidaili.com (pages 1-2), keep the
# ones whose reported speed is under 0.2 s AND that accept a telnet
# connection, and append them to proxy_list.txt.
for d in range(1, 3):  # scrape pages 1 to 2
    scrapeUrl = 'http://www.xicidaili.com/nn/%d/' % d
    req = urllib.request.Request(scrapeUrl)
    # The site blocks the default urllib UA, so pretend to be a browser.
    req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
    response = urllib.request.urlopen(req)
    html = response.read()
    bsObj = BeautifulSoup(html, "html.parser")
    # Hoist the <td> list: the page is one table with 10 cells per proxy row,
    # so cell indices below are row-relative offsets (1=ip, 2=port, 6=speed).
    cells = bsObj.select('td')
    for i in range(100):
        # Speed cell holds a title like "0.123秒"; strip the unit and parse.
        speed = float(cells[6 + i * 10].div.get('title').replace('秒', ''))
        if speed < 0.2:  # keep only proxies reported faster than 0.2 s
            ip = cells[1 + i * 10].get_text()
            port = cells[2 + i * 10].get_text()
            ip_address = 'http://' + ip + ':' + port
            try:
                # Verify the proxy actually answers with a short telnet connect.
                telnetlib.Telnet(ip, port=port, timeout=2)
            except Exception:  # narrowed from a bare except
                print('fail')
            else:
                print('success:' + ip_address)
                # Context manager guarantees the handle is closed (the
                # original leaked it on any write error); fixed the '\n'
                # literal that was broken across two lines in the paste.
                with open('proxy_list.txt', 'a') as f:
                    f.write(ip_address + '\n')
# ---- Version 2 (版本二) ----
import threading
import time
import json
import telnetlib
class TestProxy(object):
    """Read proxy records (one JSON object per line) from today's dated
    file and probe each with a telnet connect; working proxies are written
    to ``alive.txt``.

    NOTE: constructing an instance runs the whole check immediately
    (``__init__`` calls ``run()``) — kept for backward compatibility.
    """

    def __init__(self):
        # Source file is named after today's date, e.g. "20240101.txt".
        today = time.strftime('%Y%m%d', time.localtime())
        self.filename = today + '.txt'
        self.sFile = self.filename
        self.dFile = r'alive.txt'
        self.URL = r'http://www.baidu.com'
        self.threads = 10   # max concurrent probe threads
        self.timeout = 3    # telnet connect timeout in seconds
        self.aliveList = [] # servers that answered; list.append is atomic under the GIL
        self.run()

    def run(self):
        """Probe every proxy line in batches of ``self.threads`` threads,
        then dump the alive servers to ``self.dFile``.

        Fixes vs. the original: raw lines are handed to ``linkWithProxy``
        (the original json.loads'ed the first line twice → TypeError), no
        line is probed more than once (the original re-spawned the last
        line when the list ran dry), and every batch is joined before the
        result file is written (the original wrote it while workers were
        still appending).
        """
        with open(self.sFile, 'r', encoding='utf-8') as f:
            lines = [ln for ln in f.read().splitlines() if ln.strip()]
        for start in range(0, len(lines), self.threads):
            workers = [
                threading.Thread(target=self.linkWithProxy, args=(ln,))
                for ln in lines[start:start + self.threads]
            ]
            for t in workers:
                t.start()
            for t in workers:
                t.join()  # wait so aliveList is complete before writing
        with open(self.dFile, 'w') as f:
            for server in self.aliveList:
                f.write(server + '\n')

    def linkWithProxy(self, line):
        """Parse one JSON proxy record (str) and test it via telnet.

        Appends ``protocol://ip:port`` to ``self.aliveList`` on success.
        """
        line = json.loads(line)
        protocol = line['protocol'].lower()
        ip = line['ip']
        port = line['port']
        server = protocol + '://' + line['ip'] + ':' + line['port']
        print(server)
        try:
            # Connection object is not needed; we only care whether it opens.
            telnetlib.Telnet(ip, port=port, timeout=self.timeout)
        except Exception:  # narrowed from a bare except
            print('%s 链接失败' % server)
            return
        else:
            print('%s 链接成功!' % server)
            self.aliveList.append(server)
            print(self.aliveList)
if __name__ == '__main__':
    # Script entry point: constructing TestProxy kicks off the whole
    # check immediately (its __init__ calls run()).
    TP = TestProxy()