import requests
from bs4 import BeautifulSoup
requests.packages.urllib3.disable_warnings()
from pytesseract import image_to_string
from string import ascii_lowercase
from random import choice
# Listing page that is scraped (the ``in_socks`` view of proxy.mimvp.com).
url = 'https://proxy.mimvp.com/free.php?proxy=in_socks'
# Browser-like User-Agent sent with the page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
# NOTE: verify=False turns off TLS certificate verification for this request.
# The timeout stops a stalled server from hanging the script indefinitely;
# requests waits forever when no timeout is given.
content = requests.get(url, headers=headers, verify=False, timeout=10).text
soup = BeautifulSoup(content, 'lxml')
# Elements selected by their tbl-proxy-* CSS class, one list per column.
# getIP(), getPort() and getProxyType() read these module-level lists.
ips = soup.select('.tbl-proxy-ip')
proxyType = soup.select('.tbl-proxy-type')
ports = soup.select('.tbl-proxy-port')
def getIP():
    """Return the first child node of every ``ips`` element except the first.

    The leading element of the module-level ``ips`` list is skipped
    (presumably the table header cell -- confirm against the page markup).

    Returns:
        list: ``contents[0]`` of each remaining element, in page order.
    """
    return [cell.contents[0] for cell in ips[1:]]
def _readPortImage(imageUrl, tessConfig):
    """Download one port image and return the text Tesseract reads from it.

    The image bytes are written to a temporary file, which is removed again
    once OCR has run, whether or not OCR succeeded.
    """
    import os
    import tempfile

    # The timeout keeps one stalled image request from hanging the scrape.
    imgBin = requests.get(imageUrl, timeout=10).content
    fd, imgFile = tempfile.mkstemp(suffix='.jpg')
    try:
        with os.fdopen(fd, 'wb') as imgFileHandler:
            imgFileHandler.write(imgBin)
        # Strip surrounding whitespace so the value can be embedded in a line.
        return image_to_string(imgFile, lang='eng', config=tessConfig).strip()
    finally:
        os.remove(imgFile)


def getPort(tessdata_dir=r'C:\Program Files (x86)\Tesseract-OCR\tessdata'):
    """OCR the port images found in the module-level ``ports`` elements.

    Each element that contains an ``<img>`` with a ``src`` attribute yields
    exactly one entry in the result, so the list stays positionally aligned
    with the rows of the page. Elements without such an image are skipped.

    Args:
        tessdata_dir: Directory passed to Tesseract via ``--tessdata-dir``.
            The default is a raw string: in a normal string literal the
            ``\t`` of ``\tessdata`` would be read as a TAB character and
            corrupt the path.

    Returns:
        list: One OCR result string per port image, in page order. An empty
        string marks an image that could not be downloaded or recognised.
    """
    import sys

    hostUrl = 'https://proxy.mimvp.com/'
    tessConfig = '--tessdata-dir "%s"' % tessdata_dir
    portAll = []
    for portHtml in ports:
        img = portHtml.img
        src = img.get('src') if img is not None else None
        if not src:
            # Nothing to OCR in this element.
            continue
        try:
            portAll.append(_readPortImage(hostUrl + src, tessConfig))
        except Exception as exc:
            # Report the failure and keep a placeholder; silently dropping the
            # entry would pair every following port with the wrong IP.
            sys.stderr.write('port OCR failed for %s: %s\n' % (src, exc))
            portAll.append('')
    return portAll
def getProxyType():
    """Collect the child nodes of every element in ``proxyType``.

    Returns:
        list: One entry per element of the module-level ``proxyType`` list,
        each entry being that element's ``.contents`` list.
    """
    return [cell.contents for cell in proxyType]
# Scrape the three columns and print one "ip:port,type" line per proxy.
allPort = getPort()
allIp = getIP()
types = getProxyType()
# getIP() drops the first ``ips`` element while getProxyType() keeps every
# element, so ``types`` may carry one extra leading entry (presumably the
# table header). Pairing from the end lines the rows up in either case.
rowTypes = types[-len(allIp):] if allIp else []
# zip() stops at the shortest column instead of raising IndexError when
# fewer ports than IPs were read.
for ip, port, typeNodes in zip(allIp, allPort, rowTypes):
    # Each getProxyType() entry is a ``.contents`` list; use its first child
    # node, mirroring what getIP() does for the IP column.
    typeLabel = typeNodes[0] if typeNodes else ''
    print('%s:%s,%s' % (ip, port, typeLabel))
# (1)需要安装 Tesseract-OCR;
# (2)需要为 Tesseract-OCR 设置环境变量。
# 注意:代码中的代理网站以图片形式提供端口号,必须通过 OCR 识别,因此会有一定误差——约有 20% 的概率会把端口识别成字母,即出错率约为 20%。