# Method 1
import re
from lxml import html
import requests
def myRequest(url):
    '''
    Wrap the request used to crawl the exam page.
    :param url: target URL
    :return: HTML text of the exam3 page
    '''
    response = requests.get(url)
    cookiejar = response.cookies
    if not cookiejar:
        # No session cookie yet: retry until the server hands one out
        return myRequest(url)
    cookiejar = cookies(cookiejar)
    response = requests.get('http://datamining.comratings.com/exam3', cookies=cookiejar)
    print(response.text)
    return response.text
def cookie_part(sessionid):
    '''
    Derive the new cookie value from the session id
    (a hand-rolled Base64 encoder ported from the page's JavaScript).
    :param sessionid: value of the session cookie (or a slice of it)
    :return: encoded string
    '''
    encoderchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
    i = 0
    b = ""
    len_ = len(sessionid)
    while i < len_:
        c = ord(sessionid[i]) & 0xff
        i += 1
        if i == len_:
            b += encoderchars[c >> 2]
            b += encoderchars[(c & 0x3) << 4]
            b += "=="
            break
        c2 = ord(sessionid[i])
        i += 1
        if i == len_:
            b += encoderchars[c >> 2]
            b += encoderchars[((c & 0x3) << 4) | ((c2 & 0xf0) >> 4)]
            b += encoderchars[(c2 & 0xf) << 2]
            b += "="
            break
        c3 = ord(sessionid[i])
        i += 1
        b += encoderchars[c >> 2]
        b += encoderchars[((c & 0x3) << 4) | ((c2 & 0xf0) >> 4)]
        b += encoderchars[((c2 & 0xf) << 2) | ((c3 & 0xc0) >> 6)]
        b += encoderchars[c3 & 0x3f]
    return b
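# Sanity check, a minimal sketch (assumes the session id is plain ASCII): the
# routine above is ordinary Base64, so it can be verified against the stdlib.
import base64
assert cookie_part('abc') == base64.b64encode(b'abc').decode() == 'YWJj'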
def cookies(cookiejar):
    '''
    Add the derived c1/c2 cookies next to the original session cookie.
    :return: the augmented cookie jar
    '''
    sessionid = cookiejar.get('session')
    cookiejar['c1'] = cookie_part(sessionid[1:4])  # chars 1-3, like substr(1, 3) in the JS
    cookiejar['c2'] = cookie_part(sessionid)
    return cookiejar
def get_ip(html_content):
    '''
    Pull the visible IP fragments out of the page.
    :param html_content: HTML text of the exam3 page
    :return: list of text fragments (digits and dots)
    '''
    html_content = str(html_content).replace('\n', '')
    # Parse the style rules with a regex
    ip_regex = re.compile(r'\.(\w+)\{display:none\}', re.I)
    style_inline = ip_regex.findall(html_content)
    # Piece together the xpath predicate that filters out those hidden classes
    style_pattern = ' and '.join('@class!="' + style + '"' for style in style_inline)
    content = html.fromstring(html_content)
    result = content.xpath('//body/text()|//span[@style="display:inline" or ' + style_pattern + ']/text()')
    result.pop(0)  # drop the leading header text
    print(result)
    return result
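# A minimal sketch of what the xpath union above selects, on hypothetical markup
# where the hidden class is "abcd": visible spans survive, hidden ones drop out,
# and //body/text() keeps the literal dots sitting between the spans.
_demo = html.fromstring(
    '<html><body>head<br><span class="abcd">9</span>'
    '<span style="display:inline">192</span>.'
    '<span style="display:none">7</span>'
    '<span style="display:inline">168</span></body></html>')
assert _demo.xpath('//body/text()|//span[@style="display:inline" or '
                   '@class!="abcd"]/text()') == ['head', '192', '.', '168']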
def ip_format(result):
    '''
    Normalize the scraped IP data.
    :param result: list of scraped IP text fragments
    :return:
    '''
    ip_num = []
    print('======== The 10 IPs on this page ========')
    for i in result:
        if i.isdigit():
            ip_num.append(i)
        elif '.' in i and i != '.':
            # Fragments like '168.0' carry two octets; keep only the digit parts
            ip_num.extend(j for j in str(i).split('.') if j.isdigit())
    # Four octets per IP
    for ip_part in range(0, len(ip_num), 4):
        print('.'.join(ip_num[ip_part:ip_part + 4]))
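# Example of the regrouping done above (hypothetical scrape output):
# ip_format(['192', '.', '168.0', '1']) prints 192.168.0.1, because the digit
# fragments are flattened and then re-joined four octets at a time.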
if __name__ == '__main__':
    html_content = myRequest('http://datamining.comratings.com/exam')
    result = get_ip(html_content)
    ip_format(result)
# Method 2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import requests
from lxml import etree
# Port of the page's JavaScript encoder
def f1(a):
    encoderchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
    len_str = len(a)
    i = 0
    b = ""
    while i < len_str:
        c = ord(a[i]) & 0xff
        i += 1
        if i == len_str:
            b += encoderchars[c >> 2]
            b += encoderchars[(c & 0x3) << 4]
            b += "=="
            break
        c2 = ord(a[i])
        i += 1
        if i == len_str:
            b += encoderchars[c >> 2]
            b += encoderchars[((c & 0x3) << 4) | ((c2 & 0xf0) >> 4)]
            b += encoderchars[(c2 & 0xf) << 2]
            b += "="
            break
        c3 = ord(a[i])
        b += encoderchars[c >> 2]
        b += encoderchars[((c & 0x3) << 4) | ((c2 & 0xf0) >> 4)]
        b += encoderchars[((c2 & 0xf) << 2) | ((c3 & 0xc0) >> 6)]
        b += encoderchars[c3 & 0x3f]
        i += 1
    return b
# headers
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0"}
# Instantiate a session object
sess = requests.session()
# Grab the session value from the cookies
get_cookies_url = "http://datamining.comratings.com/exam"
response = sess.get(get_cookies_url, headers=headers)
session_id = response.cookies.get_dict()['session']
# Add the cookies to the request headers
res_cookies = 'session={}; c1={}; c2={}; path=/'.format(session_id, f1(session_id[1:4]), f1(session_id))
headers['Cookie'] = res_cookies
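# Note (alternative sketch): requests can also assemble this header itself via
# its cookies= parameter, e.g.
#   sess.get(get_ip_url, headers=headers,
#            cookies={'session': session_id, 'c1': f1(session_id[1:4]),
#                     'c2': f1(session_id)})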
# Fetch the page that carries the IPs
get_ip_url = "http://datamining.comratings.com/exam3"
html_str = sess.get(get_ip_url, headers=headers).text
# Pull the <style> tag via xpath and extract the class names that are display:none
html = etree.HTML(html_str)
style = html.xpath('//style')[0].text
inlines = [i[1:5] for i in style.split('\n') if len(i) > 0 and i[-5:-1] == 'none']
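# e.g. a style line like ".ABCD{display:none}" yields the inlines entry "ABCD"
# (the [1:5] slice assumes the generated class names are exactly four characters)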
# print(inlines)
# Split the whole page on '<br>' and drop index 0, which holds the header
html_lines = [i.splitlines() for i in html_str.split('<br>')[1:]]
# print(html_lines)
# Group the IP fragments by line index
result = {}
regex = re.compile(r'\d+')
for index, line in enumerate(html_lines):
    value = []
    for ip_item in line:
        # After dropping every decoy line, collect all remaining digit runs
        if ('none' not in ip_item and inlines[0] not in ip_item
                and inlines[1] not in ip_item and '.' not in ip_item
                and len(ip_item) > 0):
            value.extend(regex.findall(ip_item))
    # Store the group: {index: [x, x, x, x], ...}
    result[index] = value
# Assemble the IPs
ips = []
for i in result.values():
    ip = i[0] + '.' + i[1] + '.' + i[2] + '.' + i[3]
    ips.append(ip)
print(ips)
# Method 3
import re
import requests
import execjs
import lxml.html
class IP:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        }
        self.s = requests.session()

    def first(self):
        first_url = "http://datamining.comratings.com/exam"
        response1 = self.s.get(url=first_url, headers=self.headers)
        sessionid = response1.cookies.get_dict().get('session')
        return sessionid
    def make_cookie(self, sessionid):
        with open('exam.js', 'r', encoding='utf8') as fp:
            data = fp.read()
        # print(type(data))
        data_temp = execjs.compile(data)
        cookie = data_temp.call('reload', sessionid)
        cookie = cookie.split(' ')
        return cookie
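    # For illustration: reload() in exam.js returns a single string of the form
    # 'session=<id>; c1=<b64>; c2=<b64>' (hypothetical values), so the list
    # returned above looks like ['session=<id>;', 'c1=<b64>;', 'c2=<b64>'].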
    def second(self, cookie):
        second_url = "http://datamining.comratings.com/exam3"
        # Turn ['session=xxx;', 'c1=yyy;', 'c2=zzz'] into a name -> value dict
        cookies = dict(item.rstrip(';').split('=', 1) for item in cookie)
        print(cookies)
        response2 = self.s.get(url=second_url, headers=self.headers, cookies=cookies)
        with open('exam2.html', 'w', encoding='utf8') as fp:
            fp.write(response2.content.decode('utf8'))
        return response2
    def filter(self, html):
        pattern = re.compile(r'\.([A-Z]+)\{display:none\}')
        class_none_list = pattern.findall(html.text)
        pattern_class_none1 = re.compile(r'<span\sclass="' + class_none_list[0] + r'">.*?</span>')
        first_filter = pattern_class_none1.sub("", html.text)
        pattern_class_none2 = re.compile(r'<span\sclass="' + class_none_list[1] + r'">.*?</span>')
        second_filter = pattern_class_none2.sub("", first_filter)
        pattern_span_none = re.compile(r'<span\sstyle="display:none">.*?</span>')
        third_filter = pattern_span_none.sub("", second_filter)
        pattern_div = re.compile(r'<div\s.*')
        fourth = pattern_div.sub("", third_filter)
        with open('finish.html', 'w', encoding='utf8') as fp:
            # fp.write(fourth.replace('\r', '').replace(' ', '').replace('\n', ''))
            fp.write(fourth.replace(' ', '').replace('\n', ''))
        html = lxml.html.fromstring(fourth.replace('\n', ''))
        # The current node and all its descendants would also work:
        # html_data = html.xpath('//body/descendant-or-self::text()')
        html_data = html.xpath('//body//text()')
        # print(html_data)
        ip = []
        ip_temp = ""
        for i in html_data[1:]:
            # Flush a finished IP once four octets have been buffered
            if ip_temp.count('.') == 3 and ip_temp[-1] != '.':
                ip.append(ip_temp)
                ip_temp = ""
            ip_temp += i
            if i == html_data[-1]:
                ip.append(ip_temp)
        print(ip)
        print(len(ip))
    def run(self):
        sessionid = self.first()
        cookie = self.make_cookie(sessionid)
        html = self.second(cookie)
        self.filter(html)


if __name__ == "__main__":
    ip = IP()
    ip.run()
--------------------------------------------------------------------------
The exam.js file loaded via execjs:
function f1(a) {
    var encoderchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
    var b, i, len;
    var c, c2, c3;
    len = a.length;
    i = 0;
    b = "";
    while (i < len) {
        c = a.charCodeAt(i++) & 0xff;
        if (i == len) {
            b += encoderchars.charAt(c >> 2);
            b += encoderchars.charAt((c & 0x3) << 4);
            b += "==";
            break
        }
        c2 = a.charCodeAt(i++);
        if (i == len) {
            b += encoderchars.charAt(c >> 2);
            b += encoderchars.charAt(((c & 0x3) << 4) | ((c2 & 0xf0) >> 4));
            b += encoderchars.charAt((c2 & 0xf) << 2);
            b += "=";
            break
        }
        c3 = a.charCodeAt(i++);
        b += encoderchars.charAt(c >> 2);
        b += encoderchars.charAt(((c & 0x3) << 4) | ((c2 & 0xf0) >> 4));
        b += encoderchars.charAt(((c2 & 0xf) << 2) | ((c3 & 0xc0) >> 6));
        b += encoderchars.charAt(c3 & 0x3f)
    }
    return b
}
function reload(session) {
    var c1, c2;
    c1 = "c1=" + f1(session.substr(1, 3)) + ';';
    c2 = "c2=" + f1(session);
    return 'session=' + session + ';' + ' ' + c1 + ' ' + c2
}
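
A quick cross-check of the port (a sketch: assumes exam.js sits in the working
directory and cookie_part from method 1 is in scope; 'abc123' is a made-up
session fragment):

import execjs
with open('exam.js', 'r', encoding='utf8') as fp:
    ctx = execjs.compile(fp.read())
sample = 'abc123'  # hypothetical session id
assert ctx.call('f1', sample) == cookie_part(sample)
print(ctx.call('reload', sample))  # session=abc123; c1=YmMx; c2=YWJjMTIz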