zoukankan      html  css  js  c++  java
  • python3 爬取代理池

    import  re
    import requests
    from bs4 import BeautifulSoup as bs
    import _thread
    import time
    # Browser-like HTTP headers sent with the proxy-list page requests below.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/62.0.3202.94 Safari/537.36"
        ),
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/webp,image/apng,*/*;q=0.8"
        ),
        "Upgrade-Insecure-Requests": "1",
    }
    def iplist():
        """Scrape proxy addresses from the paginated listing on www.66ip.cn.

        Pages 1 through 29 are requested in turn. On each page, dotted-quad
        IP addresses and ``<td>port</td>`` cells are extracted separately and
        paired up by position.

        Returns:
            list[str]: proxy entries formatted as ``"ip:port"``. Pages that
            cannot be fetched are skipped, so the list may be empty.
        """
        # Compile once outside the page loop. The dots are escaped and the
        # port uses \d so the patterns match real addresses/ports rather than
        # "any character" and the literal letter "d".
        ip_pattern = re.compile(r"((?:[0-9]{1,3}\.){3}[0-9]{1,3})")
        port_pattern = re.compile(r"<td>(\d{1,5})</td>")

        ip_port = []
        for x in range(1, 30):
            url = r'http://www.66ip.cn/{}.html'.format(x)
            try:
                response = requests.get(url=url, headers=headers, timeout=10)
            except requests.RequestException:
                # One unreachable page should not abort the whole crawl.
                print('Skip:' + url)
                continue
            # Decode the raw bytes as GBK to avoid mojibake in the Chinese
            # text; errors='replace' keeps the ASCII ip/port data usable even
            # if some bytes are not valid GBK.
            html = response.content.decode('gbk', errors='replace')
            re_ip = ip_pattern.findall(html)
            re_port = port_pattern.findall(html)
            # Pair the n-th IP with the n-th port; a dict keeps a single
            # entry per IP within one page.
            for ip, port in dict(zip(re_ip, re_port)).items():
                ip_port.append(ip + ":" + port)
        return ip_port
    
    def iplist2():#https://www.89ip.cn/
        """Fetch up to 100 proxies from the www.89ip.cn plain-text API.

        Returns:
            list[str]: every ``"ip:port"`` substring found in the response
            body, in the order it appears.

        Raises:
            requests.RequestException: if the API request fails or times out.
        """
        url = 'https://www.89ip.cn/tqdl.html?api=1&num=100'
        iplist2get = requests.get(url=url, headers=headers, timeout=10).text
        # Escaped dots and \d (instead of "." and a literal "d") so that only
        # real "a.b.c.d:port" tokens are matched.
        return re.findall(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}:\d{1,5}', iplist2get)
    
    def _check_proxy(ip, url, timeout):
        """Return True when *url* answers with HTTP 200 through proxy *ip*."""
        proxies = {
            'http': 'http://{}'.format(ip),
            'https': 'https://{}'.format(ip),
        }
        try:
            res = requests.get(url=url, proxies=proxies, timeout=timeout)
        except Exception:
            # Deliberately broad: any failure (refused connection, timeout,
            # malformed proxy address, ...) simply means "proxy unusable".
            return False
        return res.status_code == 200


    def ver(ips, timeout=5, workers=20):
        """Check which proxies work and record the working ones.

        Each proxy is used to request http://www.baidu.com. The checks run
        concurrently in a thread pool; results are reported in input order.
        Working proxies are printed as ``Success:<ip>`` and appended, one per
        line, to ``Success.txt``; the rest are printed as ``Failed:<ip>``.

        Args:
            ips: iterable of ``"ip:port"`` strings.
            timeout: per-request timeout in seconds (must be positive).
            workers: maximum number of concurrent checks.
        """
        # Local import so this block does not depend on the file's import list.
        from concurrent.futures import ThreadPoolExecutor

        ips = list(ips)
        if not ips:
            return

        url = 'http://www.baidu.com'
        with ThreadPoolExecutor(max_workers=workers) as pool:
            outcomes = pool.map(lambda ip: _check_proxy(ip, url, timeout), ips)
            # Printing and file writes stay in the calling thread so output
            # lines are never interleaved.
            for ip, ok in zip(ips, outcomes):
                if ok:
                    print('Success:'+ip)
                    with open('Success.txt','a+') as f:
                        f.write(ip+'\n')
                else:
                    print('Failed:'+ip)
    
    
    def result():
        """Fetch each proxy list and validate it: iplist2() first, then iplist()."""
        for fetch in (iplist2, iplist):
            ver(fetch())
    
    
    if __name__=='__main__':
        # time.clock() was removed in Python 3.8; time.perf_counter() is the
        # supported way to measure an elapsed interval.
        start=time.perf_counter()
        result()
        end=time.perf_counter()
        print ('time: %s'%(end-start))
    
    安安静静变优秀。 --胖丫
  • 相关阅读:
    获取网络上的北京时间,如果大于设定的过期时间就...
    MYSQL注释
    mysql的perror
    Spring + CXF(REST):webservice not found
    vim 学习笔记
    mysql存储过程controller的not found造成混乱的解决办法
    pt-query-digest 安装及使用
    MYSQL预处理传参不区分大小写解决办法
    解压版mysql安装--windows系统
    sql plus 和 pl/sql无法连接远程oracle数据库
  • 原文地址:https://www.cnblogs.com/pangya/p/15061666.html
Copyright © 2011-2022 走看看