zoukankan      html  css  js  c++  java
  • 面试题之获取IP地址

    #方法一
    import base64
    import re

    # Page that serves the obfuscated IP list once the c1/c2 cookies are present.
    IP_PAGE_URL = 'http://datamining.comratings.com/exam3'


    def myRequest(url, retries=3, timeout=10):
        """Fetch the IP page after completing the cookie handshake.

        A GET on ``url`` is expected to set a ``session`` cookie. The ``c1`` and
        ``c2`` cookies are derived from it (see :func:`cookies`) and sent along
        with the request for the IP page.

        :param url: entry page that hands out the ``session`` cookie
        :param retries: how many times to request ``url`` while waiting for the
            ``session`` cookie to show up
        :param timeout: per-request timeout in seconds
        :return: HTML text of the IP page (also printed, as before)
        :raises RuntimeError: if no ``session`` cookie was received
        """
        # Imported here so the pure helpers of this script stay importable
        # without the scraping dependencies installed.
        import requests

        # ``response.cookies`` is a cookie jar and never ``None``, so the old
        # ``while cookiejar is None`` test could not trigger a retry. Check for
        # the cookie that is actually needed, a bounded number of times.
        for _ in range(retries):
            response = requests.get(url, timeout=timeout)
            if response.cookies.get('session') is not None:
                break
        else:
            raise RuntimeError('no session cookie received from {}'.format(url))

        cookiejar = cookies(response.cookies)
        response = requests.get(IP_PAGE_URL, cookies=cookiejar, timeout=timeout)
        print(response.text)
        return response.text


    def cookie_part(sessionid):
        """Return the base64 encoding of ``sessionid`` as a string.

        Each character contributes its low 8 bits, which is what the original
        hand-written encoder computed; the standard library produces the same
        output, including the ``=`` padding.

        :param sessionid: value (or slice) of the ``session`` cookie
        :return: base64 text used as a cookie value
        """
        raw = bytes(ord(ch) & 0xff for ch in sessionid)
        return base64.b64encode(raw).decode('ascii')


    def cookies(cookiejar):
        """Add the derived ``c1`` and ``c2`` cookies to ``cookiejar``.

        ``c1`` encodes characters 1..3 of the session id, ``c2`` the whole id.

        :param cookiejar: mapping-like cookie container holding ``session``
        :return: the same container, updated in place
        :raises ValueError: if the container has no ``session`` entry
        """
        sessionid = cookiejar.get('session')
        if sessionid is None:
            raise ValueError("cookie jar has no 'session' cookie")
        cookiejar['c1'] = cookie_part(sessionid[1:4])
        cookiejar['c2'] = cookie_part(sessionid)
        return cookiejar


    def get_ip(html_content):
        """Extract the visible text fragments that make up the IP addresses.

        CSS classes declared as ``display:none`` in the page are collected and
        spans carrying those classes are left out of the result.

        :param html_content: HTML text of the IP page
        :return: list of text fragments (digits and dots) in document order
        """
        from lxml import html

        # Line breaks are removed so every rule and tag sits on one line.
        # (The published listing shows a space here because the escaped
        # newline was lost in rendering; stripping spaces would fuse
        # ``<span class=`` into a tag the parser cannot recognise.)
        html_content = str(html_content).replace('\n', '')

        # Names of the classes hidden through the style sheet.
        hidden = re.findall(r'\.(\w+)\{display:none\}', html_content, re.I)

        # Keep spans that are explicitly inline, or that carry a class which is
        # not one of the hidden ones. With no hidden class at all, any span
        # with a class attribute qualifies (the old code built invalid XPath).
        if hidden:
            class_test = ' and '.join('@class!="{}"'.format(name) for name in hidden)
        else:
            class_test = '@class'
        query = '//body/text()|//span[@style="display:inline" or {}]/text()'.format(class_test)

        content = html.fromstring(html_content)
        result = content.xpath(query)
        # The first text node is dropped as in the original script (presumably
        # the page heading - confirm against the live page).
        if result:
            result.pop(0)
        print(result)
        return result


    def ip_format(result):
        """Group the scraped fragments into dotted IPv4 strings and print them.

        :param result: fragments returned by :func:`get_ip`
        :return: list of IP strings, four numeric parts each (the last one may
            be shorter if the input is incomplete)
        """
        ip_num = []
        # Header text means: "the 10 IPs on this page are as follows".
        print('========该页面10个IP如下========')
        for fragment in result:
            if fragment.isdigit():
                ip_num.append(fragment)
            elif '.' in fragment:
                # Build a filtered list instead of removing items from the list
                # being iterated, which skipped elements and let empty strings
                # through.
                ip_num.extend(part for part in str(fragment).split('.') if part.isdigit())

        ips = ['.'.join(ip_num[start:start + 4]) for start in range(0, len(ip_num), 4)]
        for ip in ips:
            print(ip)
        return ips


    if __name__ == '__main__':
        html_content = myRequest('http://datamining.comratings.com/exam')
        result = get_ip(html_content)
        ip_format(result)
    #方法 2
    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    import base64
    import re

    COOKIE_URL = "http://datamining.comratings.com/exam"
    IP_URL = "http://datamining.comratings.com/exam3"
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0"

    # ".NAME{display:none}" rules in the page's style sheet.
    _HIDDEN_CLASS_RE = re.compile(r'\.(\w+)\s*\{\s*display\s*:\s*none\s*;?\s*\}')
    _NUMBER_RE = re.compile(r'\d+')


    def f1(a):
        """Return the base64 encoding of ``a`` (port of the page's JavaScript).

        Each character contributes its low 8 bits, exactly like the original
        hand-written loop; the standard library yields the same text.
        """
        raw = bytes(ord(ch) & 0xff for ch in a)
        return base64.b64encode(raw).decode('ascii')


    def hidden_classes(style_text):
        """Return the class names that the style sheet sets to ``display:none``.

        :param style_text: text content of the page's ``<style>`` element
            (``None`` is treated as empty)
        """
        return _HIDDEN_CLASS_RE.findall(style_text or '')


    def extract_ips(html_str, hidden):
        """Rebuild the IP addresses from the obfuscated page markup.

        The page is cut at every ``<br>``; the first piece holds the heading and
        is ignored. Inside each remaining piece, lines that are empty, contain
        ``none``, contain a dot, or mention one of the ``hidden`` classes are
        skipped, and the numbers found on the other lines form one address.

        :param html_str: HTML text of the IP page
        :param hidden: class names whose elements are not displayed
        :return: list of dotted IP strings, one per piece with at least four
            numbers (pieces with fewer, such as trailing markup, are skipped)
        """
        ips = []
        for chunk in html_str.split('<br>')[1:]:
            numbers = []
            for line in chunk.splitlines():
                if not line or 'none' in line or '.' in line:
                    continue
                if any(name in line for name in hidden):
                    continue
                numbers.extend(_NUMBER_RE.findall(line))
            if len(numbers) >= 4:
                ips.append('.'.join(numbers[:4]))
        return ips


    def main():
        """Run the cookie handshake, download the IP page and print the IPs."""
        # Imported here so the pure helpers above can be used without the
        # scraping dependencies installed.
        import requests
        from lxml import etree

        headers = {"User-Agent": USER_AGENT}
        sess = requests.session()

        # The first request hands out the ``session`` cookie.
        response = sess.get(COOKIE_URL, headers=headers)
        session_id = response.cookies.get_dict()['session']

        # c1/c2 are derived from the session id and sent as a raw Cookie header.
        headers['Cookie'] = 'session={}; c1={}; c2={}; path=/'.format(
            session_id, f1(session_id[1:4]), f1(session_id))

        html_str = sess.get(IP_URL, headers=headers).text

        # The hidden class names live in the page's <style> element.
        style_nodes = etree.HTML(html_str).xpath('//style')
        style_text = style_nodes[0].text if style_nodes else ''

        ips = extract_ips(html_str, hidden_classes(style_text))
        print(ips)
        return ips


    if __name__ == '__main__':
        main()
    #方法三
    import re
    import requests
    
    import execjs
    import lxml.html
    
    
    class IP:
        """Scraper for the IP list behind the comratings ``exam`` cookie check.

        Flow (see :meth:`run`): fetch the session cookie, derive the extra
        cookies by running ``exam.js`` through execjs, download the IP page and
        strip the hidden decoy elements from it.
        """

        def __init__(self):
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
            }
            # One HTTP session is shared by both requests.
            self.s = requests.session()

        def first(self):
            """Request the entry page and return its ``session`` cookie value.

            :return: the cookie value, or ``None`` if the response set none
            """
            first_url = "http://datamining.comratings.com/exam"
            response1 = self.s.get(url=first_url, headers=self.headers)
            sessionid = response1.cookies.get_dict().get('session')
            return sessionid

        def make_cookie(self, sessionid):
            """Build the cookie parts by calling ``reload`` from ``exam.js``.

            ``exam.js`` is read from the current working directory.

            :param sessionid: value of the ``session`` cookie
            :return: the string returned by ``reload``, split on spaces
            """
            with open('exam.js', 'r', encoding='utf8') as fp:
                data = fp.read()
            data_temp = execjs.compile(data)
            cookie = data_temp.call('reload', sessionid)
            return cookie.split(' ')

        @staticmethod
        def _cookie_dict(cookie):
            """Turn ``['name=value;', ...]`` parts into a ``{name: value}`` dict."""
            parsed = {}
            for part in cookie:
                # Split on the first '=' only: base64 values end in '=' padding.
                name, sep, value = part.strip().rstrip(';').partition('=')
                if sep and name:
                    parsed[name] = value
            return parsed

        def second(self, cookie):
            """Download the IP page with the derived cookies attached.

            The response body is also saved to ``exam2.html``.

            :param cookie: cookie parts as returned by :meth:`make_cookie`
            :return: the ``requests`` response object
            """
            second_url = "http://datamining.comratings.com/exam3"
            # Send session/c1/c2 as individual cookies. The previous
            # ``{"Cookie": ...}`` dict created a single cookie literally named
            # "Cookie" whose value was the whole string.
            cookies = self._cookie_dict(cookie)
            print(cookies)
            response2 = self.s.get(url=second_url, headers=self.headers, cookies=cookies)
            with open('exam2.html', 'w', encoding='utf8') as fp:
                fp.write(response2.content.decode('utf8'))
            return response2

        @staticmethod
        def _join_fragments(fragments):
            """Concatenate text fragments into dotted IP strings.

            A new address starts once the current one holds three dots and no
            longer ends with a dot, i.e. its fourth part has been added.
            """
            ips = []
            current = ""
            for fragment in fragments:
                if current.count('.') == 3 and not current.endswith('.'):
                    ips.append(current)
                    current = ""
                current += fragment
            # Flush the last address after the loop. Comparing each fragment's
            # value with the last fragment, as before, fired early whenever an
            # earlier fragment had the same text.
            if current:
                ips.append(current)
            return ips

        def filter(self, html):
            """Remove the hidden decoy markup and print the recovered IPs.

            The cleaned markup is also written to ``finish.html``.

            :param html: response object of the IP page (its ``.text`` is used)
            :return: list of IP strings (also printed, followed by its length)
            """
            page = html.text

            # Classes the style sheet hides; spans using them are decoys.
            hidden_classes = re.findall(r'\.([A-Z]+)\{display:none\}', page)
            for css_class in hidden_classes:
                # Non-greedy so only the decoy span itself is removed, even if
                # another span follows on the same line.
                decoy = r'<span\sclass="' + css_class + r'">.*?</span>'
                page = re.sub(decoy, "", page)

            # Spans hidden inline, then everything from a <div ...> to the end
            # of its line.
            page = re.sub(r'<span\sstyle="display:none">.*?</span>', "", page)
            page = re.sub(r'<div\s.*', "", page)

            with open('finish.html', 'w', encoding='utf8') as fp:
                fp.write(page.replace('\t', '').replace('\n', ''))

            tree = lxml.html.fromstring(page.replace("\n", ""))
            # All text nodes below <body>; the first one is skipped as in the
            # original code (presumably the page heading - confirm).
            html_data = tree.xpath('//body//text()')
            ip = self._join_fragments(html_data[1:])
            print(ip)
            print(len(ip))
            return ip

        def run(self):
            """Execute the whole scrape and return the list of IP strings."""
            sessionid = self.first()
            cookie = self.make_cookie(sessionid)
            html = self.second(cookie)
            return self.filter(html)
    
    
    if __name__ == "__main__":
        # Script entry point: build the scraper and run the full pipeline.
        scraper = IP()
        scraper.run()
    --------------------------------------------------------------------------
    exam.js(方法三中通过 execjs 调用其 reload 函数的 JavaScript 文件)

    /**
     * Base64-encode the string `a`, using the low 8 bits of each character.
     * Output is padded with "=" to a multiple of four characters.
     */
    function f1(a) {
        var alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
        var total = a.length;
        var out = "";
        var pos, first, second, third;

        // Consume the input three characters at a time.
        for (pos = 0; pos < total; pos += 3) {
            first = a.charCodeAt(pos) & 0xff;
            out += alphabet.charAt(first >> 2);

            // Only one character left: emit its remaining bits and pad twice.
            if (pos + 1 == total) {
                out += alphabet.charAt((first & 0x3) << 4);
                out += "==";
                break;
            }

            second = a.charCodeAt(pos + 1);
            out += alphabet.charAt(((first & 0x3) << 4) | ((second & 0xf0) >> 4));

            // Two characters left: emit the tail of the second and pad once.
            if (pos + 2 == total) {
                out += alphabet.charAt((second & 0xf) << 2);
                out += "=";
                break;
            }

            third = a.charCodeAt(pos + 2);
            out += alphabet.charAt(((second & 0xf) << 2) | ((third & 0xc0) >> 6));
            out += alphabet.charAt(third & 0x3f);
        }
        return out;
    }

    /**
     * Build the cookie string for a session id:
     * "session=<id>; c1=<base64 of chars 1..3>; c2=<base64 of id>".
     */
    function reload(session) {
        var parts = [
            'session=' + session + ';',
            "c1=" + f1(session.substr(1, 3)) + ';',
            "c2=" + f1(session)
        ];
        return parts.join(' ');
    }
  • 相关阅读:
    redis缓存问题解决方案
    JVM 2-垃圾收集及内存分配策略
    JVM 1-内存管理
    mysql数据库基础
    事务学习
    使用redis分布式锁来解决集群项目的定时任务冲突问题
    Spring AOP学习
    Spring IOC
    java语言基础7--线程相关类
    多重背包的二进制优化——DP
  • 原文地址:https://www.cnblogs.com/liangliangzz/p/10175989.html
Copyright © 2011-2022 走看看