zoukankan      html  css  js  c++  java
  • 采集代理ip 地址【西刺,快代理】

    # 嗯,。。。因为经常需要使用代理去抓一点东西,就有了下面一段代码,第一版不是很好,后面用到了再来优化

    # --- Version 1: scrape xicidaili "wt" (HTTP) proxy listings into MySQL. ---
    # The pasted original was collapsed onto a single line and every backslash
    # had been stripped from the regex ("s+" for "\s+", "d+" for "\d+", ...),
    # so it could never match a row.  Reformatted and repaired here.
    import re, pymysql, time, redis
    from urllib.request import Request
    from urllib.request import urlopen

    # Browser-like User-Agent so the listing pages answer normally.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    url = 'http://www.xicidaili.com/wt/'
    db = pymysql.connect(host='127.0.0.1', user='root', password="liu",
                         database='test', port=3306, charset='utf8')
    cur = db.cursor()

    def url_response(url, cur):
        """Fetch one listing page and insert each fast (<1s) proxy into MySQL.

        Captures (ip, port, type, latency-seconds) for every high-anonymity
        (高匿) row; rows already present in the `ip` table are skipped.
        """
        response = urlopen(Request(url, headers=headers)).read()
        response = response.decode()
        pattern = (r'<td>(.*?)</td>\s+<td>(\d+)</td>\s+<td>\s+<a href="/.*?">'
                   r'[\u4e00-\u9fa5]+</a>\s+</td>\s+<td class="country">高匿</td>\s+'
                   r'<td>(\w+)</td>\s+<td class="country">\s+<div title="(\d.\d+)秒"')
        ip_list = re.compile(pattern).findall(response)
        for i in ip_list:
            out_time = float(i[3])
            ip_ = i[0] + ':' + i[1]
            # Parameterized queries instead of the original string-built SQL:
            # the values come from scraped (untrusted) HTML.
            cur.execute('select ip_ from ip where ip_ = %s', (ip_,))
            if cur.fetchone():
                print('重复数据跳过')
                continue
            if out_time < 1:
                cur.execute('insert into ip(ip_,time_,xy_) values(%s,%s,%s)',
                            (ip_, out_time, i[2]))
                print('插入成功,', i)

    # Fetch listing pages 1 and 2, committing after each page.
    for i in range(1, 3):
        _ = url + str(i)
        url_response(_, cur)
        db.commit()
        time.sleep(2)
    # 第一版使用 mysql存着也没啥用处 ,然后就给来了redis
    # 加入线程池的使用,让抓取更加速度
    
    # --- Version 2: same xicidaili scraper, but caching into redis instead of
    # MySQL, with page fetches fanned out over a thread pool. ---
    import re,time,redis
    from concurrent.futures import ThreadPoolExecutor
    from urllib.request import Request
    from urllib.request import urlopen
    # Browser-like User-Agent so the listing pages don't reject the request.
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    # Base URL of xicidaili's "wt" (HTTP proxy) listing; the page number is appended.
    url = 'http://www.xicidaili.com/wt/'
    class R(object):
        """Thin redis facade used as the proxy-IP cache (key = "ip:port")."""
        def __init__(self):
            # Local redis, db 0, no auth.
            r_pool =redis.ConnectionPool(host='127.0.0.1',db=0,password=None,
                                         port=6379)
            self.redis_obj = redis.Redis(connection_pool=r_pool)
        def setex(self,name,value,time):
            """Store *value* under *name* with a TTL of *time* seconds.

            BUG FIX: redis-py 3.x's signature is setex(name, time, value); the
            original forwarded (name, value, time), swapping TTL and value
            (the post's 2018-12-17 errata applies this same correction).
            """
            return self.redis_obj.setex(name, time, value)

        def get(self,name):
            # Returns the stored bytes, or None when the key is absent/expired.
            return self.redis_obj.get(name)
    
    def url_response(url,redis_obj):
        response = urlopen(Request(url,headers=headers)).read()
        response = response.decode()
        pattern='<td>(.*?)</td>s+<td>(d+)</td>s+<td>s+<a href="/.*?">[u4e00-u9fa5]+</a>s+</td>s+<td class="country">高匿</td>s+<td>(w+)</td>s+<td class="country">s+<div title="(d.d+)秒"'
        regex = re.compile(pattern)
        ip_list = regex.findall(response)
        for i in ip_list:
            out_time = float(i[3])
            ip_ = i[0]+':'+i[1]
            if redis_obj.get(ip_):
                print('重复数据跳过')
                continue
            if out_time < 1:
                redis_obj.setex(ip_,1,60*30)
                print('插入成功,',ip_)
            else:
                pass
    # Module-level driver: fetch listing pages 1-4 concurrently on 4 workers.
    r = R()
    T = ThreadPoolExecutor(4)
    for i in range(1,5):
        _ = url+str(i)
        T.submit(url_response,_,r)
    # NOTE(review): this prints before the pool drains; shutdown() below is
    # what actually waits for the submitted fetches to finish.
    print('执行完成 ')
    T.shutdown()
    更新第二版:使用 redis 存储(西刺代理)


    2018-12-17:

      第二版出错更新: 15行 :return self.redis_obj.setex(name,time,value)  #此处已改正

    # python 3.7
    
    from lxml import etree
    import requests,time,redis
    
    class Kuai_IP(object):
        """Scrape kuaidaili.com free-proxy listing pages and cache them in redis.

        Keys look like "TYPE://ip:port" (e.g. "HTTP://1.2.3.4:80") and expire
        after 30 seconds.
        """
        def __init__(self):
            # Static headers; Referer is rewritten per page in getPage().
            self.headers = {
                'Host': 'www.kuaidaili.com',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
                'Referer': 'https://www.kuaidaili.com/free/inha/1/'
                }
            # Listing URL template; %s is the 1-based page number.
            self.static='https://www.kuaidaili.com/free/inha/%s/'
            # NOTE(review): default pool settings (port 6379, db 0) -- confirm.
            Con_pool = redis.ConnectionPool(host='127.0.0.1')
            self.r = redis.Redis(connection_pool=Con_pool)
    
        def getPage(self,page_index):
            # Fake a Referer pointing at the previous listing page.
            if page_index==1:
                self.headers['Referer']='https://www.kuaidaili.com/free/inha/'
            else:
                self.headers['Referer']='https://www.kuaidaili.com/free/inha/'+str(page_index-1)+'/'
            res = requests.get(url=self.static%page_index,headers=self.headers)
            self.parse(res.text)
    
        def parse(self,res):
            # All <td> texts in document order; each table row contributes 7 cells.
            html = etree.HTML(res)
            r_list = html.xpath('//tbody/tr/td/text()')
            if r_list:
                # NOTE(review): starting at i=1 skips the first 7-cell group and
                # drops any trailing partial group -- confirm that is intended.
                for i in range(1, int(len(r_list) / 7)):
                    _ =r_list[i * 7:(i + 1) * 7]
                    # NOTE(review): with redis-py 3.x this is setex(name, time=30,
                    # value=_[3]); with redis-py 2.x the argument order was
                    # (name, value, time) and this call would misbehave -- confirm
                    # the installed library version.
                    self.r.setex(_[3]+'://'+_[0]+':'+_[1],30,_[3])
                    print(_)
            else:
                print(r_list)
    
        def work_on(self):
            page_index = 2  # number of pages to crawl
            for i in range(1,page_index+1):
                self.getPage(i)
                print(i, '---------')
                time.sleep(2)  # polite delay between pages
    
    # Entry point: crawl the configured pages once and exit.
    if __name__ == '__main__':
        ip = Kuai_IP()
        ip.work_on()
    快代理 IP 抓取——没做过滤


    2018-12-20

      嗯,这次由于用到的代理比较多,就把西刺和快代理的代码合到了一起,没做什么大的改进,

      1 代理ip格式全部成为 requests代理的形式{'http':'xxx://xx.xx.xx.xx:xxx'} 方便requests的调用

     

    # -*- coding:utf-8 -*-
    # @time:2018-12-20 22:23
    
    
    # --- Version 3: xicidaili + kuaidaili combined; results cached in redis. ---
    import re,redis,time,requests
    from concurrent.futures import ThreadPoolExecutor
    from urllib.request import Request
    from urllib.request import urlopen
    
    page = 10 # pages to scrape; kuaidaili serves few IPs per page, so Kuai_IP.work_on crawls page+10 pages
    
    # Browser-like User-Agent so the listing pages answer normally.
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    # xicidaili "wt" (HTTP proxy) listing base URL; the page number is appended.
    url = 'http://www.xicidaili.com/wt/'
    class R(object):
        """Thin redis facade used as the proxy cache (key = proxy URL)."""
        def __init__(self):
            # Local redis, db 0, no auth.
            r_pool =redis.ConnectionPool(host='127.0.0.1',db=0,password=None,
                                         port=6379)
            self.redis_obj = redis.Redis(connection_pool=r_pool)
        def setex(self,name,value,time):
            """Store *value* under *name* with a TTL of *time* seconds.

            BUG FIX: redis-py 3.x's signature is setex(name, time, value); the
            original forwarded (name, value, time), swapping TTL and value.
            The post's final (2018-12-24) version applies this same correction.
            """
            return self.redis_obj.setex(name, time, value)

        def get(self,name):
            # Returns the stored bytes, or None when the key is absent/expired.
            return self.redis_obj.get(name)
    
    def url_response(url,redis_obj):
        response = urlopen(Request(url,headers=headers)).read()
        response = response.decode()
        pattern='<td>(.*?)</td>s+<td>(d+)</td>s+<td>s+<a href="/.*?">[u4e00-u9fa5]+</a>s+</td>s+<td class="country">高匿</td>s+<td>(w+)</td>s+<td class="country">s+<div title="(d.d+)秒"'
        regex = re.compile(pattern)
        ip_list = regex.findall(response)
        for i in ip_list:
            out_time = float(i[3])
            ip_ = i[0]+':'+i[1]
            if redis_obj.get(ip_):
                print('重复数据跳过')
                continue
            if out_time < 1:
                ip_ = "HTTP://"+str(ip_)
                redis_obj.setex(ip_,1,60*30*20)
                print('插入成功,',ip_)
            else:
                pass
    
    
    from lxml import etree
    class Kuai_IP(object):
        """Scrape kuaidaili.com free-proxy pages and cache entries in redis.

        Keys look like "TYPE://ip:port" (e.g. "HTTP://1.2.3.4:80") and expire
        after 30*60*24 seconds.
        """
        def __init__(self):
            # Static headers; Referer is rewritten per page in getPage().
            self.headers = {
                'Host': 'www.kuaidaili.com',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
                'Referer': 'https://www.kuaidaili.com/free/inha/1/'
                }
            # Listing URL template; %s is the 1-based page number.
            self.static='https://www.kuaidaili.com/free/inha/%s/'
            Con_pool = redis.ConnectionPool(host='127.0.0.1',port=6379,db=0)
            self.r = redis.Redis(connection_pool=Con_pool)

        def getPage(self,page_index):
            """Fetch listing page *page_index*, faking a Referer from the previous page."""
            if page_index==1:
                self.headers['Referer']='https://www.kuaidaili.com/free/inha/'
            else:
                self.headers['Referer']='https://www.kuaidaili.com/free/inha/'+str(page_index-1)+'/'
            res = requests.get(url=self.static%page_index,headers=self.headers)
            self.parse(res.text)

        def parse(self,res):
            """Split the page's <td> texts into 7-cell rows and cache each proxy."""
            html = etree.HTML(res)
            r_list = html.xpath('//tbody/tr/td/text()')
            if r_list:
                # NOTE(review): starting at i=1 skips the first 7-cell group and
                # drops any trailing partial group -- confirm that is intended.
                for i in range(1, int(len(r_list) / 7)):
                    _ =r_list[i * 7:(i + 1) * 7]
                    # BUG FIX: the original called setex(key, _[3], 30*60*24),
                    # which under redis-py 3.x passes the protocol string (_[3],
                    # e.g. 'HTTP') as the TTL -- the exact "value is not an
                    # integer or out of range" crash the author hit on
                    # 2018-12-24.  The TTL must be the numeric argument.
                    self.r.setex(_[3]+'://'+_[0]+':'+_[1], 30*60*24, _[3])
                    print(_)
            else:
                print(r_list)

        def work_on(self):
            """Crawl page+10 listing pages with a polite 2s delay between pages."""
            page_index = page+10  # pages to crawl
            for i in range(1,page_index+1):
                self.getPage(i)
                print(i, '---------')
                time.sleep(2)
    
    if __name__ == '__main__':
        # Phase 1: scrape xicidaili pages 1..page-1 on a 4-worker thread pool.
        r = R()
        T = ThreadPoolExecutor(4)
        for i in range(1,page):
            _ = url+str(i)
            T.submit(url_response,_,r)
        # NOTE(review): printed before the pool drains; shutdown() is the wait.
        print('执行完成 ')
        T.shutdown()
        # Phase 2: crawl kuaidaili sequentially.
        ip = Kuai_IP()
        ip.work_on()
    第三版融合,总共将近 500 个 ip,应该是够用的


    2018-12-24

      嗯,上面代理在80行出现错误,因为快代理的这个超时时间有时出现的是数字+文字,然后在设置缓存的时候就

      出现了【 value is not an integer or out of range】

    # -*- coding:utf-8 -*-
    # @time:2018-12-18 22:23
    # @Auther:1043453579@qq.com
    
    # --- Version 4 (final): xicidaili + kuaidaili combined, with the setex
    # argument-order and TTL bugs of version 3 corrected. ---
    import re,redis,time,requests
    from concurrent.futures import ThreadPoolExecutor
    from urllib.request import Request
    from urllib.request import urlopen
    
    # Base page count; Kuai_IP.work_on crawls page+10 pages.
    page = 10
    # Browser-like User-Agent so the listing pages answer normally.
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    # xicidaili "wt" (HTTP proxy) listing base URL; the page number is appended.
    url = 'http://www.xicidaili.com/wt/'
    class R(object):
        """Minimal redis facade for the proxy cache: setex + get only."""

        def __init__(self):
            # One shared connection pool against the local redis, db 0, no auth.
            pool = redis.ConnectionPool(
                host='127.0.0.1', port=6379, db=0, password=None)
            self.redis_obj = redis.Redis(connection_pool=pool)

        def setex(self, name, value, time):
            # redis-py >= 3 expects setex(name, time, value), hence the swap.
            return self.redis_obj.setex(name, time, value)

        def get(self, name):
            # Stored bytes, or None when the key is absent or has expired.
            return self.redis_obj.get(name)
    
    def url_response(url,redis_obj):
        response = urlopen(Request(url,headers=headers)).read()
        response = response.decode()
        pattern='<td>(.*?)</td>s+<td>(d+)</td>s+<td>s+<a href="/.*?">[u4e00-u9fa5]+</a>s+</td>s+<td class="country">高匿</td>s+<td>(w+)</td>s+<td class="country">s+<div title="(d.d+)秒"'
        regex = re.compile(pattern)
        ip_list = regex.findall(response)
        for i in ip_list:
            out_time = float(i[3])
            ip_ = i[0]+':'+i[1]
            if redis_obj.get(ip_):
                print('重复数据跳过')
                continue
            if out_time < 1:
                ip_ = "HTTP://"+str(ip_)
                redis_obj.setex(ip_,1,60*30*20)
                print('插入成功,',ip_)
            else:
                pass
    # Module-level driver: scrape xicidaili pages 1..page-1 on 4 worker threads.
    r = R()
    T = ThreadPoolExecutor(4)
    for i in range(1,page):
        _ = url+str(i)
        T.submit(url_response,_,r)
    # NOTE(review): this prints before the pool drains; shutdown() below is
    # what actually waits for the submitted fetches to finish.
    print('执行完成 ')
    T.shutdown()
    
    
    from lxml import etree
    class Kuai_IP(object):
        """Scrape kuaidaili.com free-proxy pages and cache entries in redis.

        Keys look like "TYPE://ip:port" (e.g. "HTTP://1.2.3.4:80") and expire
        after 30*60*24 seconds.
        """
        def __init__(self):
            # Static headers; Referer is rewritten per page in getPage().
            self.headers = {
                'Host': 'www.kuaidaili.com',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
                'Referer': 'https://www.kuaidaili.com/free/inha/1/'
                }
            # Listing URL template; %s is the 1-based page number.
            self.static='https://www.kuaidaili.com/free/inha/%s/'
            Con_pool = redis.ConnectionPool(host='127.0.0.1',port=6379,db=0)
            self.r = redis.Redis(connection_pool=Con_pool)
    
        def getPage(self,page_index):
            # Fake a Referer pointing at the previous listing page.
            if page_index==1:
                self.headers['Referer']='https://www.kuaidaili.com/free/inha/'
            else:
                self.headers['Referer']='https://www.kuaidaili.com/free/inha/'+str(page_index-1)+'/'
            res = requests.get(url=self.static%page_index,headers=self.headers)
            self.parse(res.text)
    
        def parse(self,res):
            # All <td> texts in document order; each table row contributes 7 cells.
            html = etree.HTML(res)
            r_list = html.xpath('//tbody/tr/td/text()')
            if r_list:
                # NOTE(review): starting at i=1 skips the first 7-cell group and
                # drops any trailing partial group -- confirm that is intended.
                for i in range(1, int(len(r_list) / 7)):
                    _ =r_list[i * 7:(i + 1) * 7]
                    print('1',_)
                    #exit()
                    # TTL is the second argument (redis-py 3.x).  The stored
                    # value is also 30*60*24; consumers only test key existence,
                    # so the value appears unused -- confirm before changing.
                    self.r.setex(_[3]+'://'+_[0]+':'+_[1],30*60*24,30*60*24)
                    # print(_)
            else:
                print(r_list)
    
        def work_on(self):
            page_index = page+10  # pages to crawl
            for i in range(1,page_index+1):
                self.getPage(i)
                print(i, '---------')
                time.sleep(2)  # polite delay between pages
    
    # Runs at import time -- note this final paste has no __main__ guard.
    ip = Kuai_IP()
    ip.work_on()
    第四版融合,将近 500+ ip,应该是够用的
  • 相关阅读:
    linux curses函数库
    在Android library中不能使用switch-case语句访问资源ID的原因分析及解决方案
    Android Support ;v4、v7、v13的区别
    background-position
    java web 之 web.xml篇
    javaweb之Cookie篇
    Enumeration 接口
    Java Bad version number in .class file
    使用AppCan自带的升级功能实现移动端升级
    obj.offsetHeight与obj.style.height区别
  • 原文地址:https://www.cnblogs.com/Skyda/p/9706315.html
Copyright © 2011-2022 走看看