一、数字反爬
乱码的原因:页面的 style 中加载了自定义字体文件,该字体文件重新定义了字符与字形之间的对应关系(即所谓的“加密”),所以源码中的数字与页面上显示的数字不一致。大致思路:1、请求页面;2、获取页面中内嵌的加密字体库;3、解析字体库,得到字符与真实数字之间的映射关系;4、提取页面中被加密的字符,按映射关系逐一还原为真实数字。
"""Sum the numbers on the GlidedSky "crawler-font-puzzle-1" pages.

Each page embeds a base64 font in its CSS.  In that font the glyph *names*
("zero", "one", ...) correspond to the digit characters present in the HTML,
while the glyph's position in the glyph order (minus one, because id 0 is
".notdef") is the digit this script treats as the real value.
"""
import asyncio
import base64
import re
import time

import aiohttp
import requests
from fontTools.ttLib import TTFont
from lxml import etree

# NOTE(review): the Cookie is a session captured from a logged-in browser;
# presumably it has to be refreshed before the script can run - confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Mobile Safari/537.36",
    "Cookie": "footprints=eyJpdiI6IkhNbHRoM0lTdWdYWnJIcW9PZ1E0dkE9PSIsInZhbHVlIjoiWG5SZDVwWkpsekxNdERIRjZQeHgyY1JzWVFxWGpVMUFYMjV2NlNHVFBBaVRcL3F5akNYYU5nc0RNM2VzTUN2YWYiLCJtYWMiOiI2ODEyNTg5NjYxMDBkNzZjNzMwMWE0ZTkwM2FlZWU3MzVlNTNmMzE2ZGUzNzRiMzM2NmNlODg2NmNhOWMzOGRmIn0%3D; _ga=GA1.2.907203747.1604565923; _gid=GA1.2.1910542827.1604565923; __gads=ID=e84726596f3c4e4d-2245fc118dc4006a:T=1604565923:RT=1604565923:S=ALNI_MbZHQMLD_QOArjXJ8cXuhVtK_C-Zw; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1604565922,1604567153; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IlRBSTlUZDNNM0kyRE40ZlF2T2xjQWc9PSIsInZhbHVlIjoiK0dCRTBTRUN1dWNJcjZPclNHemwrYnhKQUlHRWhyWUJpZjJQb0JDZEliZXNpeFwvYjM1Y2VTdXl5c2xaaHlmWmkiLCJtYWMiOiIzMWIwNGQ2MWYyNDUwNWIwYzFmY2RjZmQ0NGYyOGNkYmRmMDBhMzg4YWVlNGRiOWE2MWNkMDZkZmEzNzg2NTk3In0%3D; glidedsky_session=eyJpdiI6InRuMm44ZlwvemRxdmJ1dEJpVXdpbSt3PT0iLCJ2YWx1ZSI6ImowQnVmeUx5NGRZMUxIcERJSHRSMW84bExib09rRzNhbDZFMERwYXYrRW9cL3JlTVljK0c5M05CSzJGN21YandZIiwibWFjIjoiOWUwZGRiZGI1OTBlMjZlMDY1MTAzNWVmOTI5Yjg5NWFhYmFmMTdjODdlYTg4ZDc2Nzg0ZWRiNjc1MDc3MWNkNyJ9; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=16045680130,",
}

# Glyph name used inside the font -> digit character found in the page source.
number_map = {
    ".notdef": -1,
    "zero": 0,
    "one": 1,
    "two": 2,
    "three": 3,
    "four": 4,
    "five": 5,
    "six": 6,
    "seven": 7,
    "eight": 8,
    "nine": 9,
}

# The closing parenthesis must be escaped: the unescaped form
# 'base64,(.*?)) format' is rejected by `re` ("unbalanced parenthesis").
FONT_PATTERN = re.compile(r'base64,(.*?)\) format')

# Upper bound on simultaneously running page requests.
MAX_CONCURRENCY = 10


def _build_digit_map(font):
    """Return ``{digit in page source: real digit}`` as one-character strings.

    The real digit is the glyph id minus one (glyph 0 is ".notdef").  A new
    dict is built instead of popping/inserting keys of a dict that is being
    iterated, which is what the previous implementation did.
    """
    return {
        str(number_map[name]): str(font.getGlyphID(name) - 1)
        for name in font.getGlyphOrder()
        if name in number_map
    }


def _sum_page(data):
    """Decode every number in the raw page bytes *data* and return their sum.

    Side effects: writes the embedded font to "字体文件.ttf" and its XML dump
    to "font.xml" (kept for manual inspection).

    Raises:
        IndexError: the page contains no embedded base64 font.
        KeyError: a number contains a character the font does not map.
    """
    html = data.decode('utf-8')
    shown_numbers = [
        text.strip()
        for text in etree.HTML(data).xpath('//div[@class="col-md-1"]/text()')
    ]
    fonts = FONT_PATTERN.findall(html)
    if not fonts:
        raise IndexError('no embedded base64 font found in page')

    with open("字体文件.ttf", mode="wb") as f:
        f.write(base64.b64decode(fonts[0]))

    font = TTFont('字体文件.ttf')
    try:
        font.saveXML("font.xml")
        digit_map = _build_digit_map(font)
    finally:
        # Release the file handle; the same path is rewritten for every page.
        font.close()

    # int() instead of eval(): no code execution, works for any number length
    # and for leading zeros.  Blank text nodes are skipped.
    return sum(
        int(''.join(digit_map[ch] for ch in shown))
        for shown in shown_numbers
        if shown
    )


async def get_page_number(url, session=None):
    """Fetch *url* and return the sum of the decoded numbers on that page.

    Args:
        url: full page URL.
        session: optional ``aiohttp.ClientSession`` to reuse; when omitted a
            temporary session is created for this single request.
    """
    if session is None:
        async with aiohttp.ClientSession() as own_session:
            return await get_page_number(url, own_session)

    async with session.get(url=url, headers=headers) as response:
        data = await response.read()
    page_total = _sum_page(data)
    print(page_total)
    return page_total


async def _crawl_pages(url_template, first_page, last_page):
    """Sum pages first_page..last_page (inclusive) using one shared session."""
    limiter = asyncio.Semaphore(MAX_CONCURRENCY)

    async with aiohttp.ClientSession() as session:

        async def fetch(page):
            async with limiter:
                return await get_page_number(url_template.format(page), session)

        totals = await asyncio.gather(
            *(fetch(page) for page in range(first_page, last_page + 1))
        )
    return sum(totals)


if __name__ == "__main__":
    url = "http://glidedsky.com/level/web/crawler-font-puzzle-1?page={}"
    start_time = time.time()
    # One event loop for the whole crawl; the pages are awaited together
    # instead of one run_until_complete() call per page.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        res = loop.run_until_complete(_crawl_pages(url, 1, 1000))
    finally:
        loop.close()
    end_time = time.time()
    print(res)
    print('花费时间:{}s'.format(int(end_time-start_time)))
"""Undo the font obfuscation of the numbers on a qidian.com book page.

The page renders its numbers as ``&#<code>;`` entities drawn with a
downloadable font.  The font's cmap maps each code point to a glyph name
("one", "period", ...); replacing every entity with the character that glyph
name stands for gives readable HTML, which is written to "反扒成功.html".
"""
import re

import requests
from fontTools.ttLib import TTFont

headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}
url = "https://book.qidian.com/info/1018027842"

# Matches the font download address, e.g.
#   ; src: url('https://qidian.gtimg.com/qd_anti_spider/IiizhcHC.eot?') format
# The literal parentheses of url(...) are escaped; unescaped they act as a
# regex group, so the pattern could not match and findall returned tuples.
FONT_URL_PATTERN = re.compile(r"; src: url\('(.*?)'\) format")

# Glyph name -> character it represents.
GLYPH_CHARS = {
    'period': '.',
    'four': 4,
    'three': 3,
    'six': 6,
    'zero': 0,
    'one': 1,
    'eight': 8,
    'seven': 7,
    'nine': 9,
    'five': 5,
    'two': 2,
}

# Seconds to wait for each HTTP request before giving up.
REQUEST_TIMEOUT = 30


def main():
    """Download the page and its font, decode the entities, save the HTML.

    Side effects: writes "woff文件.woff", "font_woff.xml" and "反扒成功.html"
    into the current directory.

    Raises:
        requests.HTTPError: a request returned an error status.
        IndexError: fewer than two font URLs were found in the page.
    """
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = 'utf-8'
    html_data = response.text

    font_urls = FONT_URL_PATTERN.findall(html_data)
    if len(font_urls) < 2:
        raise IndexError('expected at least two font URLs in the page, found {}'.format(len(font_urls)))
    # The second match is the one the original script downloads.
    font_url = font_urls[1]

    font_res = requests.get(url=font_url, headers=headers, timeout=REQUEST_TIMEOUT)
    font_res.raise_for_status()
    with open("woff文件.{}".format('woff'), mode="wb") as fp:
        fp.write(font_res.content)

    font = TTFont('woff文件.woff')
    try:
        font.saveXML("font_woff.xml")
        # Code point -> glyph name.
        font_cmap = font['cmap'].getBestCmap()
    finally:
        font.close()

    # Code point -> replacement text.  Built as a new dict; glyph names that
    # are not digits or the period are left untouched in the HTML.
    replacements = {
        code: str(GLYPH_CHARS[glyph_name])
        for code, glyph_name in font_cmap.items()
        if glyph_name in GLYPH_CHARS
    }
    for code, text in replacements.items():
        html_data = html_data.replace('&#' + str(code) + ';', text)

    with open("反扒成功.html", "w", encoding="utf-8") as fp:
        fp.write(html_data)


if __name__ == "__main__":
    main()
二、中文+数字的反爬
页面中的中文字符先取其 Unicode 码点(注意不是 ASCII 码),再到字体的 cmap 表中按码点查到字形名称,最后根据该字形在 GlyphOrder 中的序号(减一)得到它实际代表的数字。
"""Sum the numbers on the GlidedSky "crawler-font-puzzle-2" pages.

Here the numbers in the HTML are written with Chinese characters.  The font
embedded in each page decides which digit a character stands for:

    character -> Unicode code point -> <cmap> entry with that code
              -> glyph name -> position of that name in <GlyphOrder>
              -> digit = position - 1   (position 0 is ".notdef")

Shape of the font XML dump this relies on:

    <GlyphOrder>
      <GlyphID id="0" name=".notdef"/>
      <GlyphID id="1" name="uni6C83"/>
      <GlyphID id="2" name="uni5DDE"/>
      ...
    </GlyphOrder>
    <cmap>
      <cmap_format_4 platformID="0" platEncID="3" language="0">
        <map code="0x4e27" name="uni4E27"/>
        <map code="0x4e59" name="uni2F04"/>
        ...
      </cmap_format_4>
    </cmap>
"""
import asyncio
import base64
import re
import time

import aiohttp
import requests
from fontTools.ttLib import TTFont
from lxml import etree

# NOTE(review): the Cookie is a session captured from a logged-in browser;
# presumably it has to be refreshed before the script can run - confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Mobile Safari/537.36",
    "Cookie": "ga=GA1.2.848962462.1604591989; _gid=GA1.2.1152294063.1604591989; __gads=ID=ba9e34babfe80e7d-226f4cd795c4009d:T=1604591990:RT=1604591990:S=ALNI_MZ7HJa01m9xvgTiFBL7yZmgpblNzw; remember_web_59ba36addc2b2f9401580f014c7f58ea4e30989d=eyJpdiI6ImpKdEJkT0RDeUNUZnVUVnQ4bU1YcHc9PSIsInZhbHVlIjoiMkdjUWMyVG9yZVlGNE0rTlwvN3pIb0xSeWFLVndwbXphZURJdFFZSjAyUElROWd5OE9XRThOMGpYdW9Ha0VoOFFWUzJhT2VqeE1tNmJqYkFneE9FUE53UEFFcG1nNWxaNVQycjRSV0JheVM3Tll0czdJbE1SQVFXQSs3UUxNNUxnWTA3OTdiUDFWcnhxNVQ3bExQejNSMkpIeWJmQW01dWFKb2c1QXErbTFpOD0iLCJtYWMiOiJiNDFmODhlMTk2ZjQ4YTllZTNjMjI1M2I0YWFkYzE2OWI1Y2ZiY2RmNTg2NWI0YWIxZDE0OTg2MDdlNmI2Yzk0In0%3D; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1604591988,1604625641; XSRF-TOKEN=eyJpdiI6InRDZmRSMHpcL0EzXC9tcktPRWtBUTZVUT09IiwidmFsdWUiOiJTRkZ5OFE3QmVvQk5OemFcL29YRkFRR2p6cDN3NUN6MjFvOERoVUNpWmxyZFZSV2UyTzN0QkNyRlI4SUNIMnBZUSIsIm1hYyI6IjY2ZWIxYzZkZjhmNjQ5YTgyZDBjZmUwYTliYjQwNmU1MGE2OWMwNzM2MGEwZjU1NDUwYmFiMmIzZjBiMWJkYTUifQ%3D%3D; glidedsky_session=eyJpdiI6InFXZkoyM3l2dFZOcWRVamsxaGZZZlE9PSIsInZhbHVlIjoibGU5RjNzS3F3Z3dxTGJyVlVGdFwvMk5oT1wvaG14TGV4eFRES0IyWWU0cmhDVXVtOXlkc09OVVhPeldja2tEd2xJIiwibWFjIjoiMzMxODFlNzEwYmQzMDc0MGJkZGUyMTZlNTZkY2Y1OTg3NTZkZmU1MDc4MTRmODBmMzA2ZDJjZmFkOTdjZDRmNiJ9; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1604628461",
}

# Kept for parity with the puzzle-1 script; the decoding below does not use it.
number_map = {
    ".notdef": -1,
    "zero": 0,
    "one": 1,
    "two": 2,
    "three": 3,
    "four": 4,
    "five": 5,
    "six": 6,
    "seven": 7,
    "eight": 8,
    "nine": 9,
}

# The closing parenthesis must be escaped: the unescaped form
# 'base64,(.*?)) format' is rejected by `re` ("unbalanced parenthesis").
FONT_PATTERN = re.compile(r'base64,(.*?)\) format')

# Upper bound on simultaneously running page requests.
MAX_CONCURRENCY = 10


def _build_char_map(font):
    """Return ``{character: digit}`` for every character the font maps to 0-9.

    The cmap is walked directly (code point -> glyph name) rather than being
    inverted first, so two code points that share one glyph are both kept.
    """
    digit_of_glyph = {
        name: position - 1
        for position, name in enumerate(font.getGlyphOrder())
    }
    return {
        chr(code): digit_of_glyph[name]
        for code, name in font['cmap'].getBestCmap().items()
        if 0 <= digit_of_glyph.get(name, -1) <= 9
    }


def _sum_page(data):
    """Decode every number in the raw page bytes *data* and return their sum.

    Side effects: writes the page to "源码.html", the embedded font to
    "字体文件.ttf" and its XML dump to "字体文件.xml" (kept for inspection).

    Raises:
        IndexError: the page contains no embedded base64 font.
        KeyError: a number contains a character the font does not map.
    """
    html = data.decode('utf-8')
    with open('源码.html', 'w', encoding='utf-8') as fp:
        fp.write(html)

    shown_numbers = [
        text.strip()
        for text in etree.HTML(data).xpath('//div[@class="col-md-1"]/text()')
    ]
    fonts = FONT_PATTERN.findall(html)
    if not fonts:
        raise IndexError('no embedded base64 font found in page')

    with open("字体文件.ttf", mode="wb") as f:
        f.write(base64.b64decode(fonts[0]))

    font = TTFont('字体文件.ttf')
    try:
        font.saveXML("字体文件.xml")
        char_map = _build_char_map(font)
    finally:
        # Release the file handle; the same path is rewritten for every page.
        font.close()

    # int() instead of eval(): no code execution and no failure on numbers
    # with a leading zero.  Blank text nodes are skipped.
    return sum(
        int(''.join(str(char_map[ch]) for ch in shown))
        for shown in shown_numbers
        if shown
    )


async def get_page_number(url, session=None):
    """Fetch *url* and return the sum of the decoded numbers on that page.

    Args:
        url: full page URL.
        session: optional ``aiohttp.ClientSession`` to reuse; when omitted a
            temporary session is created for this single request.
    """
    if session is None:
        async with aiohttp.ClientSession() as own_session:
            return await get_page_number(url, own_session)

    async with session.get(url=url, headers=headers) as response:
        print(url)
        data = await response.read()
    return _sum_page(data)


async def _crawl_pages(url_template, first_page, last_page):
    """Sum pages first_page..last_page (inclusive) using one shared session."""
    limiter = asyncio.Semaphore(MAX_CONCURRENCY)

    async with aiohttp.ClientSession() as session:

        async def fetch(page):
            async with limiter:
                return await get_page_number(url_template.format(page), session)

        totals = await asyncio.gather(
            *(fetch(page) for page in range(first_page, last_page + 1))
        )
    return sum(totals)


if __name__ == '__main__':
    url = 'http://glidedsky.com/level/web/crawler-font-puzzle-2?page={}'
    start_time = time.time()
    # One event loop for the whole crawl; the pages are awaited together
    # instead of one run_until_complete() call per page.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        res = loop.run_until_complete(_crawl_pages(url, 1, 1000))
    finally:
        loop.close()
    end_time = time.time()
    print(res)
    print('花费时间:{}s'.format(int(end_time - start_time)))
下面是对上面第二部分(中文+数字)示例的多线程改写:每个线程负责连续的 100 页,并在自己的事件循环中依次抓取。不过效率仍然不高,如果你看了博客并且有更好的方法,欢迎私聊我。
"""Threaded variant of the GlidedSky "crawler-font-puzzle-2" solver.

The 1000 pages are split into blocks of 100; every block is handled by one
thread that runs its own asyncio event loop and fetches its pages in order.
A character on the page is decoded as: Unicode code point -> glyph name (font
cmap) -> position of the glyph in the glyph order, minus one.
"""
import asyncio
import base64
import io
import re
import time
from threading import Lock, Thread

import aiohttp
from fontTools.ttLib import TTFont
from lxml import etree

# NOTE(review): the Cookie is a session captured from a logged-in browser;
# presumably it has to be refreshed before the script can run - confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Mobile Safari/537.36",
    "Cookie": "ga=GA1.2.848962462.1604591989; _gid=GA1.2.1152294063.1604591989; __gads=ID=ba9e34babfe80e7d-226f4cd795c4009d:T=1604591990:RT=1604591990:S=ALNI_MZ7HJa01m9xvgTiFBL7yZmgpblNzw; remember_web_59ba36addc2b2f9401580f014c7f58ea4e30989d=eyJpdiI6ImpKdEJkT0RDeUNUZnVUVnQ4bU1YcHc9PSIsInZhbHVlIjoiMkdjUWMyVG9yZVlGNE0rTlwvN3pIb0xSeWFLVndwbXphZURJdFFZSjAyUElROWd5OE9XRThOMGpYdW9Ha0VoOFFWUzJhT2VqeE1tNmJqYkFneE9FUE53UEFFcG1nNWxaNVQycjRSV0JheVM3Tll0czdJbE1SQVFXQSs3UUxNNUxnWTA3OTdiUDFWcnhxNVQ3bExQejNSMkpIeWJmQW01dWFKb2c1QXErbTFpOD0iLCJtYWMiOiJiNDFmODhlMTk2ZjQ4YTllZTNjMjI1M2I0YWFkYzE2OWI1Y2ZiY2RmNTg2NWI0YWIxZDE0OTg2MDdlNmI2Yzk0In0%3D; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1604591988,1604625641; XSRF-TOKEN=eyJpdiI6InRDZmRSMHpcL0EzXC9tcktPRWtBUTZVUT09IiwidmFsdWUiOiJTRkZ5OFE3QmVvQk5OemFcL29YRkFRR2p6cDN3NUN6MjFvOERoVUNpWmxyZFZSV2UyTzN0QkNyRlI4SUNIMnBZUSIsIm1hYyI6IjY2ZWIxYzZkZjhmNjQ5YTgyZDBjZmUwYTliYjQwNmU1MGE2OWMwNzM2MGEwZjU1NDUwYmFiMmIzZjBiMWJkYTUifQ%3D%3D; glidedsky_session=eyJpdiI6InFXZkoyM3l2dFZOcWRVamsxaGZZZlE9PSIsInZhbHVlIjoibGU5RjNzS3F3Z3dxTGJyVlVGdFwvMk5oT1wvaG14TGV4eFRES0IyWWU0cmhDVXVtOXlkc09OVVhPeldja2tEd2xJIiwibWFjIjoiMzMxODFlNzEwYmQzMDc0MGJkZGUyMTZlNTZkY2Y1OTg3NTZkZmU1MDc4MTRmODBmMzA2ZDJjZmFkOTdjZDRmNiJ9; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1604628461",
}

# Kept for parity with the puzzle-1 script; the decoding below does not use it.
number_map = {
    ".notdef": -1,
    "zero": 0,
    "one": 1,
    "two": 2,
    "three": 3,
    "four": 4,
    "five": 5,
    "six": 6,
    "seven": 7,
    "eight": 8,
    "nine": 9,
}

# The closing parenthesis must be escaped: the unescaped form
# 'base64,(.*?)) format' is rejected by `re` ("unbalanced parenthesis").
FONT_PATTERN = re.compile(r'base64,(.*?)\) format')

PAGES_PER_THREAD = 100
# 10 blocks of 100 pages cover pages 1..1000.
THREAD_COUNT = 10

# Serialises the debug dumps, which all threads write to the same file names.
_DUMP_LOCK = Lock()


def _build_char_map(font):
    """Return ``{character: digit}`` for every character the font maps to 0-9.

    The cmap is walked directly (code point -> glyph name) rather than being
    inverted first, so two code points that share one glyph are both kept.
    """
    digit_of_glyph = {
        name: position - 1
        for position, name in enumerate(font.getGlyphOrder())
    }
    return {
        chr(code): digit_of_glyph[name]
        for code, name in font['cmap'].getBestCmap().items()
        if 0 <= digit_of_glyph.get(name, -1) <= 9
    }


def _sum_page(data):
    """Decode every number in the raw page bytes *data* and return their sum.

    The font is parsed from memory, so the result never depends on files that
    another thread may be rewriting.  The dumps "源码.html", "字体文件.ttf" and
    "字体文件.xml" are still produced, under a lock, for manual inspection.

    Raises:
        IndexError: the page contains no embedded base64 font.
        KeyError: a number contains a character the font does not map.
    """
    html = data.decode('utf-8')
    shown_numbers = [
        text.strip()
        for text in etree.HTML(data).xpath('//div[@class="col-md-1"]/text()')
    ]
    fonts = FONT_PATTERN.findall(html)
    if not fonts:
        raise IndexError('no embedded base64 font found in page')
    font_bytes = base64.b64decode(fonts[0])

    font = TTFont(io.BytesIO(font_bytes))
    try:
        with _DUMP_LOCK:
            with open('源码.html', 'w', encoding='utf-8') as fp:
                fp.write(html)
            with open("字体文件.ttf", mode="wb") as f:
                f.write(font_bytes)
            font.saveXML("字体文件.xml")
        char_map = _build_char_map(font)
    finally:
        font.close()

    # int() instead of eval(): no code execution and no failure on numbers
    # with a leading zero.  Blank text nodes are skipped.
    return sum(
        int(''.join(str(char_map[ch]) for ch in shown))
        for shown in shown_numbers
        if shown
    )


async def get_page_number(url, session=None):
    """Fetch *url* and return the sum of the decoded numbers on that page.

    Args:
        url: full page URL.
        session: optional ``aiohttp.ClientSession`` to reuse; when omitted a
            temporary session is created for this single request.
    """
    if session is None:
        async with aiohttp.ClientSession() as own_session:
            return await get_page_number(url, own_session)

    async with session.get(url=url, headers=headers) as response:
        print(url)
        data = await response.read()
    return _sum_page(data)


async def _sum_pages(url_template, first_page, last_page):
    """Fetch pages first_page..last_page (inclusive) in order, one session."""
    total = 0
    async with aiohttp.ClientSession() as session:
        for page in range(first_page, last_page + 1):
            total += await get_page_number(url_template.format(page), session)
    return total


def cal_sum(begin):
    """Return the sum for the *begin*-th block of 100 pages (1-based).

    Block 1 is pages 1-100, block 2 is pages 101-200, and so on.  Runs in the
    calling thread on a private event loop, which is closed afterwards.
    """
    first_page = PAGES_PER_THREAD * (begin - 1) + 1
    last_page = first_page + PAGES_PER_THREAD - 1
    url = 'http://glidedsky.com/level/web/crawler-font-puzzle-2?page={}'
    start_time = time.time()
    # Each thread needs its own loop; the default loop belongs to the main thread.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        res = loop.run_until_complete(_sum_pages(url, first_page, last_page))
    finally:
        loop.close()
    end_time = time.time()
    print(res)
    print('花费时间:{}s'.format(int(end_time - start_time)))
    return res


class MyThread(Thread):
    """Thread that keeps the return value of the function it runs."""

    def __init__(self, func, args):
        super(MyThread, self).__init__()
        self.func = func
        self.args = args
        # Stays None until run() finishes successfully.
        self.result = None

    def run(self):
        # `args` is passed through as one positional argument.
        self.result = self.func(self.args)

    def get_result(self):
        """Return the function's result, or None if it did not complete."""
        return self.result


if __name__ == '__main__':
    list_thred = [
        MyThread(cal_sum, args=i) for i in range(1, THREAD_COUNT + 1)
    ]
    for t in list_thred:
        t.start()
    for t in list_thred:
        t.join()

    res = 0
    for t in list_thred:
        partial = t.get_result()
        if partial is None:
            # A missing block would silently produce a wrong grand total.
            raise RuntimeError('a worker thread did not produce a result')
        res += partial
    print(res)