"""Scrape the proxy list at proxy.mimvp.com (``proxy=in_socks``) and print it.

The port cells of the page carry an ``<img>`` rather than text, so every port
image is downloaded, saved to a randomly named ``.jpg`` in the current
directory and read back with Tesseract OCR through ``pytesseract``.

Output is one ``ip:port,type`` line per proxy on stdout; OCR/download problems
are reported on stderr.
"""
import sys
from random import choice
from string import ascii_lowercase

import requests
from bs4 import BeautifulSoup
from pytesseract import image_to_string

# The listing request below is sent with verify=False; silence the warning
# urllib3 would otherwise emit for it.
requests.packages.urllib3.disable_warnings()

url = 'https://proxy.mimvp.com/free.php?proxy=in_socks'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

content = requests.get(url, headers=headers, verify=False).text
soup = BeautifulSoup(content, 'lxml')

# One list of cells per table column, each in page order.
ips = soup.select('.tbl-proxy-ip')
proxyType = soup.select('.tbl-proxy-type')
ports = soup.select('.tbl-proxy-port')


def getIP():
    """Return the first child node of each ``.tbl-proxy-ip`` cell.

    The first matched cell is skipped.
    NOTE(review): presumably that cell is the table header -- confirm against
    the page markup.
    """
    return [ipHtml.contents[0] for ipHtml in ips[1:]]


def getPort():
    """Return the OCR-decoded port of every ``.tbl-proxy-port`` cell with an image.

    Cells without an ``<img>`` child are skipped.  A cell whose image cannot
    be downloaded or decoded contributes an empty string instead of being
    dropped, so the returned list keeps one entry per image cell and stays
    aligned with the other columns.

    Side effect: each downloaded image is left in the current working
    directory as ``<5 random lowercase letters>.jpg``.
    """
    portAll = []
    hostUrl = 'https://proxy.mimvp.com/'
    # Raw string on purpose: in a normal literal the "\t" of "\tessdata" is a
    # TAB character, which corrupts the path handed to Tesseract.
    tessdata_dir_config = r'--tessdata-dir "C:\Program Files (x86)\Tesseract-OCR\tessdata"'

    for portHtml in ports:
        img = portHtml.img
        if img is None:
            # No image in this cell, so there is nothing to OCR.
            continue

        verifyCode = ''
        src = img.get('src')
        try:
            if not src:
                raise ValueError('port image has no src attribute')
            completUrl = hostUrl + src
            imgBin = requests.get(completUrl).content
            imgFile = ''.join(choice(ascii_lowercase) for _ in range(5)) + '.jpg'
            # "with" guarantees the handle is closed even if the write fails.
            with open(imgFile, 'wb') as imgFileHandler:
                imgFileHandler.write(imgBin)
            # strip(): OCR output may carry trailing whitespace/newlines that
            # would break the "ip:port,type" line.
            verifyCode = image_to_string(
                imgFile, lang='eng', config=tessdata_dir_config).strip()
        except Exception as err:
            # Best effort: report the failure and keep the row's slot so the
            # following ports are not shifted onto the wrong IPs.
            sys.stderr.write('port OCR failed for %r: %s\n' % (src, err))
        portAll.append(verifyCode)
    return portAll


def getProxyType():
    """Return the text of every ``.tbl-proxy-type`` cell, in page order.

    Plain strings are returned (rather than the cells' ``.contents`` lists)
    because the caller concatenates each entry into an output line.
    """
    return [typeHtml.get_text(strip=True) for typeHtml in proxyType]


allPort = getPort()
allIp = getIP()
types = getProxyType()

# getIP() drops the first IP cell while getProxyType() keeps every type cell.
# When the type column is exactly one cell longer, drop its first cell too so
# the columns line up.
# NOTE(review): assumes the extra leading cell is the table header -- confirm.
if len(types) == len(allIp) + 1:
    types = types[1:]

if not (len(allIp) == len(allPort) == len(types)):
    sys.stderr.write(
        'column lengths differ (ip=%d, port=%d, type=%d); extra cells are ignored\n'
        % (len(allIp), len(allPort), len(types)))

# zip() stops at the shortest column instead of raising IndexError.
for ip, port, proxy_kind in zip(allIp, allPort, types):
    if not port:
        # The port image could not be decoded; an address without a port is useless.
        continue
    print(ip + ':' + port + ',' + proxy_kind)
(1)需要安装:Tesseract-OCR;
(2)为Tesseract-OCR设置环境变量
识别时约有 20% 的概率会把端口(port)误识别成字母,即出错率约为 20%。这是因为代码所抓取的代理网站以图片形式提供端口号,而 OCR 识别难免存在一定误差。