zoukankan html css js c++ java

Python获取免费代理

import requests
from bs4 import BeautifulSoup
requests.packages.urllib3.disable_warnings()
from pytesseract import image_to_string
from string import ascii_lowercase
from random import choice

# Listing page that is scraped for proxies (the "in_socks" view of free.php).
url = 'https://proxy.mimvp.com/free.php?proxy=in_socks'
# A desktop-browser User-Agent is sent with the listing request.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
# NOTE(review): verify=False turns off TLS certificate verification for this
# request; keep it only if the site's certificate really cannot be validated.
# The timeout stops the script from blocking forever when the server stops
# responding (requests waits indefinitely when no timeout is given).
content = requests.get(url,headers=headers,verify=False,timeout=30).text
soup = BeautifulSoup(content,'lxml')
# Each selection holds one element per node carrying the given CSS class.
# getIP(), getPort() and getProxyType() read these module-level lists.
ips = soup.select('.tbl-proxy-ip')
proxyType = soup.select('.tbl-proxy-type')
ports = soup.select('.tbl-proxy-port')

def getIP():
    """Return the first child node of every IP cell except the first match.

    Reads the module-level ``ips`` selection. The leading element is skipped
    (presumably the table's header cell -- confirm against the page markup).

    Returns:
        list: ``cell.contents[0]`` for each remaining element, in page order.

    Raises:
        IndexError: if one of the remaining cells has no child nodes.
    """
    return [cell.contents[0] for cell in ips[1:]]

def getPort():
    """Download every port image and OCR it into a port string.

    The site serves each port as an image, so for every element of the
    module-level ``ports`` selection that contains an ``<img>`` the image is
    fetched, written to a randomly named ``.jpg`` file in the current working
    directory (the file is left on disk) and passed to Tesseract.

    Cells without an ``<img>`` are skipped, and so are cells whose download or
    OCR step fails. The returned list can therefore be shorter than the list
    of IP cells, and OCR may misread digits as letters.

    Returns:
        list: whitespace-stripped OCR text for each cell that could be read,
        in page order.
    """
    portAll = []
    hostUrl = 'https://proxy.mimvp.com/'
    # Raw string on purpose: in a plain literal the "\t" of "\tessdata" is read
    # as a TAB character, which corrupts the tessdata path given to Tesseract.
    tessdataDirConfig = r'--tessdata-dir "C:\Program Files (x86)\Tesseract-OCR\tessdata"'
    for portHtml in ports:
        img = portHtml.img
        if img is None:
            # No image in this cell (presumably the header cell), nothing to OCR.
            continue
        try:
            completUrl = hostUrl+img['src']
            # Timeout so that one stalled image cannot hang the whole run.
            imgBin = requests.get(completUrl,timeout=30).content
            imgFile = ''.join(choice(ascii_lowercase) for _ in range(5))+'.jpg'
            # "with" closes the handle even when the write fails.
            with open(imgFile,'wb') as imgFileHandler:
                imgFileHandler.write(imgBin)
            # Strip surrounding whitespace so the text can be used directly
            # in an "ip:port" string.
            verifyCode = image_to_string(imgFile,lang='eng',config=tessdataDirConfig).strip()
        except Exception:
            # Best effort: a cell that cannot be downloaded or recognised is
            # skipped. Catching Exception (not a bare except) lets Ctrl-C and
            # SystemExit still stop the script.
            continue
        portAll.append(verifyCode)
    return portAll

def getProxyType():
    """Return the proxy type text of every type cell except the first match.

    Reads the module-level ``proxyType`` selection. The first element is
    skipped in the same way getIP() skips the first IP cell, so that index
    ``i`` refers to the same table row in both lists (presumably the first
    match is the header cell -- confirm against the page markup).

    Each entry is the cell's first child node rather than the whole
    ``contents`` list, because the result is concatenated with strings when
    the proxies are printed.

    Returns:
        list: one entry per remaining cell, in page order; ``''`` for a cell
        that has no child nodes.
    """
    proxyTypes = []
    for typeHtml in proxyType[1:]:
        children = typeHtml.contents
        # Keep one entry per cell, even an empty one, so positions stay
        # lined up with the IP list.
        proxyTypes.append(children[0] if children else '')
    return proxyTypes

allPort = getPort()
allIp = getIP()
types = getProxyType()
# zip() stops at the shortest list, so a shorter port list no longer raises
# IndexError. NOTE: getPort() skips cells it cannot read, so after such a skip
# the remaining ports are paired with the wrong IPs -- treat the output as
# best effort.
for ip, port, proxyKind in zip(allIp, allPort, types):
    # A single parenthesised expression prints identically on Python 2 and 3.
    print(ip+':'+port+','+proxyKind)

　　（1）需要安装：Tesseract-OCR；

　　（2）为Tesseract-OCR设置环境变量

由于该代理网站的端口是以图片形式提供的，代码需要通过OCR来识别端口；识别时大约有20%的概率会把端口中的数字误识别为字母，也就是说出错率约为20%。

查看全文

相关阅读:
用图片来代替字符串
 下载网页时的有gzip压缩的处理
 位置不固定验证码的识别
 CookieContainer 与 Session
Thread Pool 备忘
 用 SGMLReader把子HTML 转 XML
非asp.net控件实现回发 button
ajax.net ??= 回车
 如何写需求分析
 jsp中地址

原文地址：https://www.cnblogs.com/websec/p/9364906.html