# Method 1
import re
from lxml import html
import requests
def myRequest(url):
    '''
    Wrap the request used to crawl the exam page.
    :param url: target URL
    :return: HTML text of the exam3 page
    '''
    response = requests.get(url)
    cookiejar = response.cookies
    if not cookiejar:
        # No session cookie yet: retry until the server hands one out
        return myRequest(url)
    cookiejar = cookies(cookiejar)
    response = requests.get('http://datamining.comratings.com/exam3', cookies=cookiejar)
    print(response.text)
    return response.text
def cookie_part(sessionid):
    '''
    Derive the new cookie value from the session id
    (a hand-rolled Base64 encoder ported from the page's JavaScript).
    :param sessionid: value of the session cookie (or a slice of it)
    :return: encoded string
    '''
    encoderchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
    i = 0
    b = ""
    len_ = len(sessionid)
    while i < len_:
        c = ord(sessionid[i]) & 0xff
        i += 1
        if i == len_:
            b += encoderchars[c >> 2]
            b += encoderchars[(c & 0x3) << 4]
            b += "=="
            break
        c2 = ord(sessionid[i])
        i += 1
        if i == len_:
            b += encoderchars[c >> 2]
            b += encoderchars[((c & 0x3) << 4) | ((c2 & 0xf0) >> 4)]
            b += encoderchars[(c2 & 0xf) << 2]
            b += "="
            break
        c3 = ord(sessionid[i])
        i += 1
        b += encoderchars[c >> 2]
        b += encoderchars[((c & 0x3) << 4) | ((c2 & 0xf0) >> 4)]
        b += encoderchars[((c2 & 0xf) << 2) | ((c3 & 0xc0) >> 6)]
        b += encoderchars[c3 & 0x3f]
    return b
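# Sanity check, a minimal sketch (assumes the session id is plain ASCII): the
# routine above is ordinary Base64, so it can be verified against the stdlib.
import base64
assert cookie_part('abc') == base64.b64encode(b'abc').decode() == 'YWJj'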
def cookies(cookiejar):
    '''
    Add the derived c1/c2 cookies next to the original session cookie.
    :return: the augmented cookie jar
    '''
    sessionid = cookiejar.get('session')
    cookiejar['c1'] = cookie_part(sessionid[1:4])  # chars 1-3, like substr(1, 3) in the JS
    cookiejar['c2'] = cookie_part(sessionid)
    return cookiejar
def get_ip(html_content):
    '''
    Pull the visible IP fragments out of the page.
    :param html_content: HTML text of the exam3 page
    :return: list of text fragments (digits and dots)
    '''
    html_content = str(html_content).replace('\n', '')
    # Parse the style rules with a regex
    ip_regex = re.compile(r'\.(\w+)\{display:none\}', re.I)
    style_inline = ip_regex.findall(html_content)
    # Piece together the xpath predicate that filters out those hidden classes
    style_pattern = ' and '.join('@class!="' + style + '"' for style in style_inline)
    content = html.fromstring(html_content)
    result = content.xpath('//body/text()|//span[@style="display:inline" or ' + style_pattern + ']/text()')
    result.pop(0)  # drop the leading header text
    print(result)
    return result
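# A minimal sketch of what the xpath union above selects, on hypothetical markup
# where the hidden class is "abcd": visible spans survive, hidden ones drop out,
# and //body/text() keeps the literal dots sitting between the spans.
_demo = html.fromstring(
    '<html><body>head<br><span class="abcd">9</span>'
    '<span style="display:inline">192</span>.'
    '<span style="display:none">7</span>'
    '<span style="display:inline">168</span></body></html>')
assert _demo.xpath('//body/text()|//span[@style="display:inline" or '
                   '@class!="abcd"]/text()') == ['head', '192', '.', '168']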
def ip_format(result):
    '''
    Normalize the scraped IP data.
    :param result: list of scraped IP text fragments
    :return:
    '''
    ip_num = []
    print('======== The 10 IPs on this page ========')
    for i in result:
        if i.isdigit():
            ip_num.append(i)
        elif '.' in i and i != '.':
            # Fragments like '168.0' carry two octets; keep only the digit parts
            ip_num.extend(j for j in str(i).split('.') if j.isdigit())
    # Four octets per IP
    for ip_part in range(0, len(ip_num), 4):
        print('.'.join(ip_num[ip_part:ip_part + 4]))
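# Example of the regrouping done above (hypothetical scrape output):
# ip_format(['192', '.', '168.0', '1']) prints 192.168.0.1, because the digit
# fragments are flattened and then re-joined four octets at a time.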
if __name__ == '__main__':
    html_content = myRequest('http://datamining.comratings.com/exam')
    result = get_ip(html_content)
    ip_format(result)
# Method 2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import requests
from lxml import etree
# Port of the page's JavaScript encoder
def f1(a):
    encoderchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
    len_str = len(a)
    i = 0
    b = ""
    while i < len_str:
        c = ord(a[i]) & 0xff
        i += 1
        if i == len_str:
            b += encoderchars[c >> 2]
            b += encoderchars[(c & 0x3) << 4]
            b += "=="
            break
        c2 = ord(a[i])
        i += 1
        if i == len_str:
            b += encoderchars[c >> 2]
            b += encoderchars[((c & 0x3) << 4) | ((c2 & 0xf0) >> 4)]
            b += encoderchars[(c2 & 0xf) << 2]
            b += "="
            break
        c3 = ord(a[i])
        b += encoderchars[c >> 2]
        b += encoderchars[((c & 0x3) << 4) | ((c2 & 0xf0) >> 4)]
        b += encoderchars[((c2 & 0xf) << 2) | ((c3 & 0xc0) >> 6)]
        b += encoderchars[c3 & 0x3f]
        i += 1
    return b
# headers
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0"}
# Instantiate a session object
sess = requests.session()
# Grab the session value from the cookies
get_cookies_url = "http://datamining.comratings.com/exam"
response = sess.get(get_cookies_url, headers=headers)
session_id = response.cookies.get_dict()['session']
# Add the cookies to the request headers
res_cookies = 'session={}; c1={}; c2={}; path=/'.format(session_id, f1(session_id[1:4]), f1(session_id))
headers['Cookie'] = res_cookies
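# Note (alternative sketch): requests can also assemble this header itself via
# its cookies= parameter, e.g.
#   sess.get(get_ip_url, headers=headers,
#            cookies={'session': session_id, 'c1': f1(session_id[1:4]),
#                     'c2': f1(session_id)})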
# Fetch the page that carries the IPs
get_ip_url = "http://datamining.comratings.com/exam3"
html_str = sess.get(get_ip_url, headers=headers).text
# Pull the <style> tag via xpath and extract the class names that are display:none
html = etree.HTML(html_str)
style = html.xpath('//style')[0].text
inlines = [i[1:5] for i in style.split('\n') if len(i) > 0 and i[-5:-1] == 'none']
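# e.g. a style line like ".ABCD{display:none}" yields the inlines entry "ABCD"
# (the [1:5] slice assumes the generated class names are exactly four characters)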
# print(inlines)
# Split the whole page on '<br>' and drop index 0, which holds the header
html_lines = [i.splitlines() for i in html_str.split('<br>')[1:]]
# print(html_lines)
# Group the IP fragments by line index
result = {}
regex = re.compile(r'\d+')
for index, line in enumerate(html_lines):
    value = []
    for ip_item in line:
        # After dropping every decoy line, collect all remaining digit runs
        if ('none' not in ip_item and inlines[0] not in ip_item
                and inlines[1] not in ip_item and '.' not in ip_item
                and len(ip_item) > 0):
            value.extend(regex.findall(ip_item))
    # Store the group: {index: [x, x, x, x], ...}
    result[index] = value
# Assemble the IPs
ips = []
for i in result.values():
    ip = i[0] + '.' + i[1] + '.' + i[2] + '.' + i[3]
    ips.append(ip)
print(ips)
# Method 3
import re
import requests
import execjs
import lxml.html
class IP:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        }
        self.s = requests.session()

    def first(self):
        first_url = "http://datamining.comratings.com/exam"
        response1 = self.s.get(url=first_url, headers=self.headers)
        sessionid = response1.cookies.get_dict().get('session')
        return sessionid
    def make_cookie(self, sessionid):
        with open('exam.js', 'r', encoding='utf8') as fp:
            data = fp.read()
        # print(type(data))
        data_temp = execjs.compile(data)
        cookie = data_temp.call('reload', sessionid)
        cookie = cookie.split(' ')
        return cookie
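    # For illustration: reload() in exam.js returns a single string of the form
    # 'session=<id>; c1=<b64>; c2=<b64>' (hypothetical values), so the list
    # returned above looks like ['session=<id>;', 'c1=<b64>;', 'c2=<b64>'].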
    def second(self, cookie):
        second_url = "http://datamining.comratings.com/exam3"
        # Turn ['session=xxx;', 'c1=yyy;', 'c2=zzz'] into a name -> value dict
        cookies = dict(item.rstrip(';').split('=', 1) for item in cookie)
        print(cookies)
        response2 = self.s.get(url=second_url, headers=self.headers, cookies=cookies)
        with open('exam2.html', 'w', encoding='utf8') as fp:
            fp.write(response2.content.decode('utf8'))
        return response2
    def filter(self, html):
        pattern = re.compile(r'\.([A-Z]+)\{display:none\}')
        class_none_list = pattern.findall(html.text)
        pattern_class_none1 = re.compile(r'<span\sclass="' + class_none_list[0] + r'">.*?</span>')
        first_filter = pattern_class_none1.sub("", html.text)
        pattern_class_none2 = re.compile(r'<span\sclass="' + class_none_list[1] + r'">.*?</span>')
        second_filter = pattern_class_none2.sub("", first_filter)
        pattern_span_none = re.compile(r'<span\sstyle="display:none">.*?</span>')
        third_filter = pattern_span_none.sub("", second_filter)
        pattern_div = re.compile(r'<div\s.*')
        fourth = pattern_div.sub("", third_filter)
        with open('finish.html', 'w', encoding='utf8') as fp:
            # fp.write(fourth.replace('\r', '').replace(' ', '').replace('\n', ''))
            fp.write(fourth.replace(' ', '').replace('\n', ''))
        html = lxml.html.fromstring(fourth.replace('\n', ''))
        # The current node and all its descendants would also work:
        # html_data = html.xpath('//body/descendant-or-self::text()')
        html_data = html.xpath('//body//text()')
        # print(html_data)
        ip = []
        ip_temp = ""
        for i in html_data[1:]:
            # Flush a finished IP once four octets have been buffered
            if ip_temp.count('.') == 3 and ip_temp[-1] != '.':
                ip.append(ip_temp)
                ip_temp = ""
            ip_temp += i
            if i == html_data[-1]:
                ip.append(ip_temp)
        print(ip)
        print(len(ip))
    def run(self):
        sessionid = self.first()
        cookie = self.make_cookie(sessionid)
        html = self.second(cookie)
        self.filter(html)


if __name__ == "__main__":
    ip = IP()
    ip.run()
--------------------------------------------------------------------------
The exam.js file loaded via execjs:
function f1(a) {
    var encoderchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
    var b, i, len;
    var c, c2, c3;
    len = a.length;
    i = 0;
    b = "";
    while (i < len) {
        c = a.charCodeAt(i++) & 0xff;
        if (i == len) {
            b += encoderchars.charAt(c >> 2);
            b += encoderchars.charAt((c & 0x3) << 4);
            b += "==";
            break
        }
        c2 = a.charCodeAt(i++);
        if (i == len) {
            b += encoderchars.charAt(c >> 2);
            b += encoderchars.charAt(((c & 0x3) << 4) | ((c2 & 0xf0) >> 4));
            b += encoderchars.charAt((c2 & 0xf) << 2);
            b += "=";
            break
        }
        c3 = a.charCodeAt(i++);
        b += encoderchars.charAt(c >> 2);
        b += encoderchars.charAt(((c & 0x3) << 4) | ((c2 & 0xf0) >> 4));
        b += encoderchars.charAt(((c2 & 0xf) << 2) | ((c3 & 0xc0) >> 6));
        b += encoderchars.charAt(c3 & 0x3f)
    }
    return b
}
function reload(session) {
    var c1, c2;
    c1 = "c1=" + f1(session.substr(1, 3)) + ';';
    c2 = "c2=" + f1(session);
    return 'session=' + session + ';' + ' ' + c1 + ' ' + c2
}
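
A quick cross-check of the port (a sketch: assumes exam.js sits in the working
directory and cookie_part from method 1 is in scope; 'abc123' is a made-up
session fragment):

import execjs
with open('exam.js', 'r', encoding='utf8') as fp:
    ctx = execjs.compile(fp.read())
sample = 'abc123'  # hypothetical session id
assert ctx.call('f1', sample) == cookie_part(sample)
print(ctx.call('reload', sample))  # session=abc123; c1=YmMx; c2=YWJjMTIz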