zoukankan      html  css  js  c++  java
  • requests 使用免费的代理ip爬取网站

    import requests
    import queue
    import threading
    from lxml import etree
    
    # URL of the site to crawl.
    url = "http://xxxxx"
    
    # Free proxy-IP listing site; the {page} placeholder is filled in via
    # str.format(page=...) with the listing page number.
    proxy_url = "https://www.kuaidaili.com/free/inha/{page}/"
    
    class MyThreadPool:
        """Bounded pool of thread "slots" backed by a ``queue.Queue``.

        Every slot holds the ``threading.Thread`` class itself (not an
        instance). A caller takes a slot with ``get_thread`` and may give one
        back with ``add_thread``; ``get_thread`` blocks while the queue is
        empty.
        """

        def __init__(self, maxsize):
            """Create the queue with capacity ``maxsize`` and fill every slot."""
            self.maxsize = maxsize
            self._pool = queue.Queue(maxsize)
            for slot in [threading.Thread] * maxsize:
                self._pool.put(slot)

        def get_thread(self):
            """Remove one slot from the pool and return the class stored in it."""
            slot = self._pool.get()
            return slot

        def add_thread(self):
            """Put one ``threading.Thread`` slot back into the pool."""
            self._pool.put(threading.Thread)
    
    
    def get_url(url, timeout=10):
        """Fetch ``url`` directly (no proxy) and return the response body as text.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up. Without a
                timeout ``requests`` waits indefinitely, so one stalled server
                would hang the whole script.

        Returns:
            The decoded response body (``response.text``).

        Raises:
            requests.exceptions.RequestException: On connection errors or when
                the timeout expires.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.text
    
    
    def proxy_get_url(url, prox):
        """Fetch ``url`` through the proxy ``prox`` and return the body as text.

        Args:
            url: Address to request (``http://`` or ``https://``).
            prox: Proxy address, either ``"host:port"`` or a full proxy URL.

        Returns:
            The decoded response body (``response.text``).

        Raises:
            requests.exceptions.RequestException: If the proxy or the target
                cannot be reached within the 3-second timeout.
        """
        # requests expects proxy URLs to carry a scheme; add one to bare host:port.
        if "://" not in prox:
            prox = "http://" + prox
        # Register the proxy for both schemes. With only the "http" key set, an
        # https:// URL is fetched directly and the proxy is never exercised.
        proxies = {"http": prox, "https": prox}
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
        }
        response = requests.get(url, headers=headers, proxies=proxies, timeout=3)
        return response.text
    
    def ip_proxy(html_str):
        """Extract ``"ip:port"`` strings from a proxy listing page.

        The page is expected to contain table rows whose cells are tagged with
        ``data-title="IP"`` and ``data-title="PORT"``.

        Args:
            html_str: HTML source of one listing page.

        Returns:
            A list of ``"ip:port"`` strings, one per table row that has both an
            IP cell and a PORT cell with text. Empty if nothing matches.
        """
        # An empty document cannot be parsed into a tree; there is nothing to extract.
        if not html_str or not html_str.strip():
            return []
        html = etree.HTML(html_str)
        if html is None:
            return []

        http_list = []
        # Read IP and port from the same row so they always belong together.
        # Collecting two independent lists and pairing them by index mispairs
        # (or raises IndexError) as soon as one cell has no text.
        for row in html.xpath('//tr'):
            ip_cells = row.xpath('./td[@data-title="IP"]/text()')
            port_cells = row.xpath('./td[@data-title="PORT"]/text()')
            if not ip_cells or not port_cells:
                continue  # header row or incomplete entry
            http_list.append(ip_cells[0].strip() + ":" + port_cells[0].strip())
        return http_list
    
    
    def available_ip(ip_list):
        """Probe each proxy in ``ip_list`` and record the ones that respond.

        Each candidate is used for a request to https://www.baidu.com/ via
        ``proxy_get_url``. Candidates whose request raises are skipped; the
        others are appended to the module-level ``IP_LIST``.

        Args:
            ip_list: Iterable of proxy addresses to probe.
        """
        for candidate in ip_list:
            try:
                proxy_get_url('https://www.baidu.com/', candidate)
            except Exception:
                # Any failure means this proxy is unusable; move on to the next.
                pass
            else:
                IP_LIST.append(candidate)
    
    
    
    if __name__ == "__main__":
        IP_LIST = []
        pool = MyThreadPool(20)  # number of slots in the thread pool
        workers = []

        # Validate the proxy IPs: one worker thread per listing page.
        # NOTE: slots taken here are never handed back with add_thread(), so the
        # number of pages must not exceed the pool size or get_thread() blocks.
        for i in range(1, 20):  # listing pages 1..19
            page_ip = get_url(proxy_url.format(page=i))
            ip_list = ip_proxy(page_ip)
            t = pool.get_thread()
            obj = t(target=available_ip, args=(ip_list,))
            obj.start()
            workers.append(obj)

        # Wait for every validation thread to finish. Without this the crawl
        # loop below reads IP_LIST while it is still empty or being filled.
        for obj in workers:
            obj.join()

        # Crawl the target site once through each validated proxy.
        for ip in IP_LIST:
            try:
                proxy_get_url(url, ip)
            except Exception:
                # This proxy failed for the target site; try the next one.
                continue
            print(ip)
    # Crawl the site through a single proxy IP; if that IP stops working,
    # delete it and carry on with the next one in IP_LIST.
    # NOTE(review): a successful request does not leave the loop, so the same
    # proxy keeps being used until it fails - confirm this is intended.
    while IP_LIST:
        try:
            print(IP_LIST[0])
            proxy_get_url(url, IP_LIST[0])
        except Exception:
            # The current proxy is no longer usable: drop it and retry.
            del IP_LIST[0]
            continue

      

  • 相关阅读:
    可持久化线段树学习笔记
    GDI+学习之路
    tcpdump——分析tcp关闭4次过程
    nasm过程调用
    ios学习:NSURLConnection 和 Json数据解析
    ios学习:文件简单读写
    JSONP原理及其简单封装
    JSP使用JSTL
    JDBC
    Apache无法正常启动的原因
  • 原文地址:https://www.cnblogs.com/zhangb8042/p/10036754.html
Copyright © 2011-2022 走看看