zoukankan      html  css  js  c++  java
  • python获取免费代理

    import requests
    from bs4 import BeautifulSoup
    requests.packages.urllib3.disable_warnings()
    from pytesseract import image_to_string
    from string import ascii_lowercase
    from random import choice
    
    # Listing page that is scraped for proxy entries.
    url = 'https://proxy.mimvp.com/free.php?proxy=in_socks'
    # Browser-style User-Agent header sent with the listing request.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    # NOTE(review): verify=False disables TLS certificate validation for this
    # request; kept as in the original script -- confirm it is still required.
    # The timeout keeps the script from blocking forever if the server accepts
    # the connection but never answers (requests has no default timeout).
    content = requests.get(url, headers=headers, verify=False, timeout=30).text
    soup = BeautifulSoup(content,'lxml')
    # The table cells are selected by CSS class; the three result lists are
    # read by getIP(), getProxyType() and getPort() respectively.
    ips = soup.select('.tbl-proxy-ip')
    proxyType = soup.select('.tbl-proxy-type')
    ports = soup.select('.tbl-proxy-port')
    
    def getIP():
        """Return the first child node of every IP cell except the first.

        Reads the module-level ``ips`` list; its first element is skipped.
        """
        return [cell.contents[0] for cell in ips[1:]]
    
    def getPort():
        """Return the OCR-decoded port text for the cells in ``ports``.

        For every cell of the module-level ``ports`` list, the ``src`` of its
        ``<img>`` child is appended to the site root, the image is downloaded
        and written to a randomly named ``.jpg`` file in the current working
        directory, and that file is passed to ``image_to_string``.

        Cells for which any step fails (for example a cell without an
        ``<img>`` child) are skipped, so the returned list can be shorter
        than ``ports``. The downloaded image files are left on disk.

        Returns:
            list of str: recognised text, one entry per successfully
            processed cell, with surrounding whitespace removed.
        """
        portAll = []
        hostUrl = 'https://proxy.mimvp.com/'
        # Raw string on purpose: in a normal literal the "\t" of "\tessdata"
        # is a TAB character, which corrupts the tessdata directory path.
        tessdataDirConfig = r'--tessdata-dir "C:\Program Files (x86)\Tesseract-OCR\tessdata"'
        for portHtml in ports:
            try:
                imgSrc = portHtml.img['src']
                # Timeout so a stalled image download cannot hang the loop.
                imgBin = requests.get(hostUrl + imgSrc, timeout=30).content
                imgFile = ''.join(choice(ascii_lowercase) for _ in range(5))+'.jpg'
                # "with" closes the handle even if the write fails.
                with open(imgFile, 'wb') as imgFileHandler:
                    imgFileHandler.write(imgBin)
                verifyCode = image_to_string(imgFile, lang='eng', config=tessdataDirConfig)
            except Exception:
                # Best effort: skip cells that cannot be fetched or decoded.
                # Unlike a bare "except:", this lets KeyboardInterrupt and
                # SystemExit propagate.
                continue
            # Drop whitespace/newlines the OCR engine may add around the text.
            portAll.append(verifyCode.strip())
        return portAll
    
    def getProxyType():
        """Return the ``contents`` list of every cell in ``proxyType``.

        Reads the module-level ``proxyType`` list; each returned element is
        the full child list of one cell, not a single string.
        """
        return [cell.contents for cell in proxyType]
    
    # Collect the three columns, then print one "ip:port,type" line per row.
    allPort = getPort()
    allIp = getIP()
    types = getProxyType()
    # zip() stops at the shortest list: getPort() skips cells it cannot
    # decode, so indexing allPort by the length of allIp could raise
    # IndexError.
    # NOTE(review): because failed cells are skipped rather than padded, rows
    # after a skipped port may be paired with the wrong IP -- verify output.
    for ip, port, typeParts in zip(allIp, allPort, types):
        # Each entry of `types` is a list of child nodes, which cannot be
        # concatenated to a string directly; render the nodes and join them.
        typeText = ''.join('%s' % part for part in typeParts)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('%s:%s,%s' % (ip, port, typeText))
    

      (1)需要安装:Tesseract-OCR;

      (2)为Tesseract-OCR设置环境变量

    识别时约有20%的概率会把端口号误识别为字母,即出错率约为20%。原因是该代理网站以图片形式提供端口号,只能通过OCR识别,而OCR识别难免出现一定的误差。

  • 相关阅读:
    练习jQuery
    Highcharts的应用步骤
    CSS中的数量查询
    何时使用 Em 与 Rem
    不错的教学网站
    HTML5中新增的语义化标签,及在IE5.5~9(IE9已经开始支持部分HTML5新标签了)支持这些新标签的兼容性处理。
    【洛谷P4139】上帝与集合的正确用法
    【洛谷P1357】花园
    【洛谷P1939】矩阵加速(数列)
    【洛谷P1962】斐波那契数列
  • 原文地址:https://www.cnblogs.com/websec/p/9364906.html
Copyright © 2011-2022 走看看