zoukankan      html  css  js  c++  java
  • 爬虫实战项目二、字体反爬

    一、数字反爬

    乱码的原因:页面的style中加载了自定义字体文件,该字体文件重新定义了字符与字形的对应关系(即所谓的"加密"方法)
    大致思路: 1、请求页面
          2、获取加密的字体库
          3、解析字体库,获取字体间的映射关系
          4、获取页面中被加密的字符,根据映射关系一一对应,还原出真实数据
    
    import requests
    import re
    import base64
    from fontTools.ttLib import TTFont
    from lxml import etree
    import asyncio
    import aiohttp
    import time
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Mobile Safari/537.36",
        "Cookie": "footprints=eyJpdiI6IkhNbHRoM0lTdWdYWnJIcW9PZ1E0dkE9PSIsInZhbHVlIjoiWG5SZDVwWkpsekxNdERIRjZQeHgyY1JzWVFxWGpVMUFYMjV2NlNHVFBBaVRcL3F5akNYYU5nc0RNM2VzTUN2YWYiLCJtYWMiOiI2ODEyNTg5NjYxMDBkNzZjNzMwMWE0ZTkwM2FlZWU3MzVlNTNmMzE2ZGUzNzRiMzM2NmNlODg2NmNhOWMzOGRmIn0%3D; _ga=GA1.2.907203747.1604565923; _gid=GA1.2.1910542827.1604565923; __gads=ID=e84726596f3c4e4d-2245fc118dc4006a:T=1604565923:RT=1604565923:S=ALNI_MbZHQMLD_QOArjXJ8cXuhVtK_C-Zw; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1604565922,1604567153; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IlRBSTlUZDNNM0kyRE40ZlF2T2xjQWc9PSIsInZhbHVlIjoiK0dCRTBTRUN1dWNJcjZPclNHemwrYnhKQUlHRWhyWUJpZjJQb0JDZEliZXNpeFwvYjM1Y2VTdXl5c2xaaHlmWmkiLCJtYWMiOiIzMWIwNGQ2MWYyNDUwNWIwYzFmY2RjZmQ0NGYyOGNkYmRmMDBhMzg4YWVlNGRiOWE2MWNkMDZkZmEzNzg2NTk3In0%3D; glidedsky_session=eyJpdiI6InRuMm44ZlwvemRxdmJ1dEJpVXdpbSt3PT0iLCJ2YWx1ZSI6ImowQnVmeUx5NGRZMUxIcERJSHRSMW84bExib09rRzNhbDZFMERwYXYrRW9cL3JlTVljK0c5M05CSzJGN21YandZIiwibWFjIjoiOWUwZGRiZGI1OTBlMjZlMDY1MTAzNWVmOTI5Yjg5NWFhYmFmMTdjODdlYTg4ZDc2Nzg0ZWRiNjc1MDc3MWNkNyJ9; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=16045680130,"
    }
    
    # Digit-word name -> integer value; the special name ".notdef" maps to -1.
    number_map = {".notdef": -1}
    number_map.update(
        (digit_name, value)
        for value, digit_name in enumerate(
            ("zero", "one", "two", "three", "four",
             "five", "six", "seven", "eight", "nine")
        )
    )
    
    async def get_page_number(url):
        """Fetch one puzzle page and return the sum of its de-obfuscated numbers.

        The page embeds a base64 font whose glyph order scrambles the digits:
        the digit character present in the HTML is identified by the glyph
        name ("zero" .. "nine", via ``number_map``), while the digit actually
        drawn is the glyph's id minus one.

        Args:
            url: Fully formatted page URL.

        Returns:
            int: sum of all decoded numbers in the ``div.col-md-1`` cells.

        Raises:
            ValueError: if the response contains no embedded base64 font
                (presumably an expired cookie / login page -- TODO confirm).
            KeyError: if the font has a glyph name missing from ``number_map``
                or a cell contains a character that is not a mapped digit.
        """
        async with aiohttp.ClientSession() as session:
            # session.get() is already an async context manager; no extra await.
            async with session.get(url=url, headers=headers) as response:
                data = await response.read()

        html = data.decode('utf-8')
        cells = etree.HTML(data).xpath('//div[@class="col-md-1"]/text()')
        # Obfuscated numbers as shown in the markup; blank text nodes are skipped.
        number_list = [text.strip() for text in cells if text.strip()]

        # The closing parenthesis of url(...) must be escaped: the unescaped
        # form is an unbalanced group and makes re raise re.error.
        font_match = re.search(r'base64,(.*?)\) format', html)
        if font_match is None:
            raise ValueError('no embedded base64 font found in {}'.format(url))
        font_bytes = base64.b64decode(font_match.group(1))

        # Keep a copy of the decoded font on disk for inspection.
        with open("字体文件.ttf", mode="wb") as f:
            f.write(font_bytes)

        # Load the font and dump it as XML so it can be read by a human.
        font = TTFont('字体文件.ttf')
        font.saveXML("font.xml")

        # shown digit (from the glyph name) -> real digit (glyph id - 1).
        # Built as a fresh dict: rewriting a dict's keys while iterating over
        # it is unreliable and can raise RuntimeError.
        digit_map = {
            str(number_map[glyph_name]): str(font.getGlyphID(glyph_name) - 1)
            for glyph_name in font.getGlyphOrder()
        }

        # int() instead of eval(): page content is untrusted, and eval()
        # rejects decimal literals with leading zeros such as "012".
        res = sum(
            int(''.join(digit_map[ch] for ch in shown))
            for shown in number_list
        )
        print(res)
        return res
    if __name__ == "__main__":
        url = "http://glidedsky.com/level/web/crawler-font-puzzle-1?page={}"

        async def crawl_all_pages():
            """Visit pages 1..1000 one after another and return the grand total."""
            total = 0
            for page in range(1, 1001):
                total += await get_page_number(url.format(page))
            return total

        start_time = time.time()
        # asyncio.run() creates the event loop, runs the coroutine and closes
        # the loop again; calling get_event_loop()/ensure_future() without a
        # running loop is deprecated and never closed the loop.
        res = asyncio.run(crawl_all_pages())
        end_time = time.time()
        print(res)
        print('花费时间:{}s'.format(int(end_time-start_time)))
    实例一
    import requests
    import re
    from fontTools.ttLib import TTFont
    
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
    }

    url = "https://book.qidian.com/info/1018027842"
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'

    html_data = response.text

    # The page CSS references the anti-scraping font like:
    #   src: url('https://qidian.gtimg.com/qd_anti_spider/IiizhcHC.eot?') format
    # The literal parentheses of url(...) must be escaped; unescaped they act
    # as a regex group, so the pattern never matches the real text and
    # findall() would return tuples instead of URL strings.
    # Index 1 selects the second declared source (the first is the .eot file).
    font_url = re.findall(r"; src: url\('(.*?)'\) format", html_data)[1]

    font_res = requests.get(url=font_url, headers=headers)
    with open("woff文件.woff", mode="wb") as fp:
        fp.write(font_res.content)
    font = TTFont('woff文件.woff')
    # XML dump of the font for manual inspection.
    font.saveXML("font_woff.xml")

    # Font mapping: code point -> glyph name.
    font_cmap = font['cmap'].getBestCmap()
    # Glyph name -> the character that glyph really displays.
    f = {
        'period': '.',
        'four': 4,
        'three': 3,
        'six': 6,
        'zero': 0,
        'one': 1,
        'eight': 8,
        'seven': 7,
        'nine': 9,
        'five': 5,
        'two': 2}
    # Re-target the mapping: code point -> real character.
    # A glyph name missing from `f` raises KeyError, as before.
    font_cmap = {code: f[glyph_name] for code, glyph_name in font_cmap.items()}

    # Replace every numeric HTML entity (&#<code>;) with its real character.
    for code, real_char in font_cmap.items():
        html_data = html_data.replace('&#' + str(code) + ';', str(real_char))
    with open("反扒成功.html", "w", encoding="utf-8") as fp:
        fp.write(html_data)
    实例二

     二、中文+数字的反爬

    页面中显示的是中文字符:先取字符的Unicode码点(并非ASCII码),在字体的cmap表中查到对应的字形名,再由字形名在GlyphOrder中的id得到真实数字
    import requests
    import re
    import base64
    from fontTools.ttLib import TTFont
    from lxml import etree
    import asyncio
    import aiohttp
    import time
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Mobile Safari/537.36",
        "Cookie": "ga=GA1.2.848962462.1604591989; _gid=GA1.2.1152294063.1604591989; __gads=ID=ba9e34babfe80e7d-226f4cd795c4009d:T=1604591990:RT=1604591990:S=ALNI_MZ7HJa01m9xvgTiFBL7yZmgpblNzw; remember_web_59ba36addc2b2f9401580f014c7f58ea4e30989d=eyJpdiI6ImpKdEJkT0RDeUNUZnVUVnQ4bU1YcHc9PSIsInZhbHVlIjoiMkdjUWMyVG9yZVlGNE0rTlwvN3pIb0xSeWFLVndwbXphZURJdFFZSjAyUElROWd5OE9XRThOMGpYdW9Ha0VoOFFWUzJhT2VqeE1tNmJqYkFneE9FUE53UEFFcG1nNWxaNVQycjRSV0JheVM3Tll0czdJbE1SQVFXQSs3UUxNNUxnWTA3OTdiUDFWcnhxNVQ3bExQejNSMkpIeWJmQW01dWFKb2c1QXErbTFpOD0iLCJtYWMiOiJiNDFmODhlMTk2ZjQ4YTllZTNjMjI1M2I0YWFkYzE2OWI1Y2ZiY2RmNTg2NWI0YWIxZDE0OTg2MDdlNmI2Yzk0In0%3D; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1604591988,1604625641; XSRF-TOKEN=eyJpdiI6InRDZmRSMHpcL0EzXC9tcktPRWtBUTZVUT09IiwidmFsdWUiOiJTRkZ5OFE3QmVvQk5OemFcL29YRkFRR2p6cDN3NUN6MjFvOERoVUNpWmxyZFZSV2UyTzN0QkNyRlI4SUNIMnBZUSIsIm1hYyI6IjY2ZWIxYzZkZjhmNjQ5YTgyZDBjZmUwYTliYjQwNmU1MGE2OWMwNzM2MGEwZjU1NDUwYmFiMmIzZjBiMWJkYTUifQ%3D%3D; glidedsky_session=eyJpdiI6InFXZkoyM3l2dFZOcWRVamsxaGZZZlE9PSIsInZhbHVlIjoibGU5RjNzS3F3Z3dxTGJyVlVGdFwvMk5oT1wvaG14TGV4eFRES0IyWWU0cmhDVXVtOXlkc09OVVhPeldja2tEd2xJIiwibWFjIjoiMzMxODFlNzEwYmQzMDc0MGJkZGUyMTZlNTZkY2Y1OTg3NTZkZmU1MDc4MTRmODBmMzA2ZDJjZmFkOTdjZDRmNiJ9; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1604628461"
    }
    
    # Digit-word name -> integer value; the special name ".notdef" maps to -1.
    number_map = {".notdef": -1}
    number_map.update(
        (digit_name, value)
        for value, digit_name in enumerate(
            ("zero", "one", "two", "three", "four",
             "five", "six", "seven", "eight", "nine")
        )
    )
    async def get_page_number(url):
        """Fetch one page of font puzzle 2 and return the sum of its real numbers.

        Each number cell contains obfuscated characters; the font embedded in
        the page decides which digit each character stands for.  Decoding
        chain: character -> Unicode code point -> glyph name (cmap table) ->
        glyph id (GlyphOrder) -> digit = glyph id - 1.

        Args:
            url: Fully formatted page URL.

        Returns:
            int: sum of all decoded numbers in the ``div.col-md-1`` cells.

        Raises:
            ValueError: if the response contains no embedded base64 font
                (presumably an expired cookie / login page -- TODO confirm).
            KeyError: if a digit glyph has no cmap entry, or a cell contains
                a character the font does not map to a digit.
        """
        async with aiohttp.ClientSession() as session:
            # session.get() is already an async context manager; no extra await.
            async with session.get(url=url, headers=headers) as response:
                print(url)
                data = await response.read()

        html = data.decode('utf-8')
        # Debug dump of the raw page source.
        with open('源码.html', 'w', encoding='utf-8') as fp:
            fp.write(html)

        cells = etree.HTML(data).xpath('//div[@class="col-md-1"]/text()')
        # Obfuscated numbers as shown in the markup; blank text nodes are skipped.
        number_list = [text.strip() for text in cells if text.strip()]

        # The closing parenthesis of url(...) must be escaped: the unescaped
        # form is an unbalanced group and makes re raise re.error.
        font_match = re.search(r'base64,(.*?)\) format', html)
        if font_match is None:
            raise ValueError('no embedded base64 font found in {}'.format(url))
        font_bytes = base64.b64decode(font_match.group(1))

        # Keep the decoded font and its XML dump on disk for inspection.
        with open("字体文件.ttf", mode="wb") as f:
            f.write(font_bytes)
        font = TTFont('字体文件.ttf')
        font.saveXML("字体文件.xml")

        # Shape of the dumped XML, for orientation:
        #   <GlyphOrder>
        #     <GlyphID id="0" name=".notdef"/>
        #     <GlyphID id="1" name="uni6C83"/>
        #     ...
        #   </GlyphOrder>
        #   <cmap>
        #     <map code="0x4e27" name="uni4E27"/>
        #     ...
        #   </cmap>
        # Inverted cmap: glyph name -> code point.
        glyph_to_codepoint = {
            glyph_name: codepoint
            for codepoint, glyph_name in font['cmap'].getBestCmap().items()
        }

        # Page character -> real digit, for every glyph whose (id - 1) is 0..9.
        res_dict = {}
        for glyph_name in font.getGlyphOrder():
            if glyph_name == '.notdef':
                continue
            digit = font.getGlyphID(glyph_name) - 1
            if 0 <= digit <= 9:
                # chr() turns the code point back into the character that
                # appears in the page text.
                res_dict[chr(glyph_to_codepoint[glyph_name])] = digit

        # int() instead of eval(): page content is untrusted, and eval()
        # rejects decimal literals with leading zeros such as "012".
        return sum(
            int(''.join(str(res_dict[ch]) for ch in shown))
            for shown in number_list
        )
    
    
    
    if __name__ == '__main__':
        url = 'http://glidedsky.com/level/web/crawler-font-puzzle-2?page={}'

        async def crawl_all_pages():
            """Visit pages 1..1000 one after another and return the grand total."""
            total = 0
            for page in range(1, 1001):
                total += await get_page_number(url.format(page))
            return total

        start_time = time.time()
        # asyncio.run() creates the event loop, runs the coroutine and closes
        # the loop again; calling get_event_loop()/ensure_future() without a
        # running loop is deprecated and never closed the loop.
        res = asyncio.run(crawl_all_pages())
        end_time = time.time()
        print(res)
        print('花费时间:{}s'.format(int(end_time - start_time)))
    实例一

    对实例一的改写,但是效率仍然不高,如果你看了博客,有更好的方法可以私聊我。

    import re
    import base64
    from fontTools.ttLib import TTFont
    from lxml import etree
    import asyncio
    import aiohttp
    import time
    from threading import Thread
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Mobile Safari/537.36",
        "Cookie": "ga=GA1.2.848962462.1604591989; _gid=GA1.2.1152294063.1604591989; __gads=ID=ba9e34babfe80e7d-226f4cd795c4009d:T=1604591990:RT=1604591990:S=ALNI_MZ7HJa01m9xvgTiFBL7yZmgpblNzw; remember_web_59ba36addc2b2f9401580f014c7f58ea4e30989d=eyJpdiI6ImpKdEJkT0RDeUNUZnVUVnQ4bU1YcHc9PSIsInZhbHVlIjoiMkdjUWMyVG9yZVlGNE0rTlwvN3pIb0xSeWFLVndwbXphZURJdFFZSjAyUElROWd5OE9XRThOMGpYdW9Ha0VoOFFWUzJhT2VqeE1tNmJqYkFneE9FUE53UEFFcG1nNWxaNVQycjRSV0JheVM3Tll0czdJbE1SQVFXQSs3UUxNNUxnWTA3OTdiUDFWcnhxNVQ3bExQejNSMkpIeWJmQW01dWFKb2c1QXErbTFpOD0iLCJtYWMiOiJiNDFmODhlMTk2ZjQ4YTllZTNjMjI1M2I0YWFkYzE2OWI1Y2ZiY2RmNTg2NWI0YWIxZDE0OTg2MDdlNmI2Yzk0In0%3D; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1604591988,1604625641; XSRF-TOKEN=eyJpdiI6InRDZmRSMHpcL0EzXC9tcktPRWtBUTZVUT09IiwidmFsdWUiOiJTRkZ5OFE3QmVvQk5OemFcL29YRkFRR2p6cDN3NUN6MjFvOERoVUNpWmxyZFZSV2UyTzN0QkNyRlI4SUNIMnBZUSIsIm1hYyI6IjY2ZWIxYzZkZjhmNjQ5YTgyZDBjZmUwYTliYjQwNmU1MGE2OWMwNzM2MGEwZjU1NDUwYmFiMmIzZjBiMWJkYTUifQ%3D%3D; glidedsky_session=eyJpdiI6InFXZkoyM3l2dFZOcWRVamsxaGZZZlE9PSIsInZhbHVlIjoibGU5RjNzS3F3Z3dxTGJyVlVGdFwvMk5oT1wvaG14TGV4eFRES0IyWWU0cmhDVXVtOXlkc09OVVhPeldja2tEd2xJIiwibWFjIjoiMzMxODFlNzEwYmQzMDc0MGJkZGUyMTZlNTZkY2Y1OTg3NTZkZmU1MDc4MTRmODBmMzA2ZDJjZmFkOTdjZDRmNiJ9; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1604628461"
    }
    
    # Digit-word name -> integer value; the special name ".notdef" maps to -1.
    number_map = {".notdef": -1}
    number_map.update(
        (digit_name, value)
        for value, digit_name in enumerate(
            ("zero", "one", "two", "three", "four",
             "five", "six", "seven", "eight", "nine")
        )
    )
    
    async def get_page_number(url):
        """Fetch one page of font puzzle 2 and return the sum of its real numbers.

        Decoding chain: page character -> Unicode code point -> glyph name
        (cmap table) -> glyph id (GlyphOrder) -> digit = glyph id - 1.

        This variant is called from several threads at once, so the font is
        parsed from memory.  The earlier debug dumps ('源码.html',
        '字体文件.ttf', '字体文件.xml') are intentionally not written: every
        thread would write and re-read the same files, which is a race.

        Args:
            url: Fully formatted page URL.

        Returns:
            int: sum of all decoded numbers in the ``div.col-md-1`` cells.

        Raises:
            ValueError: if the response contains no embedded base64 font
                (presumably an expired cookie / login page -- TODO confirm).
            KeyError: if a digit glyph has no cmap entry, or a cell contains
                a character the font does not map to a digit.
        """
        # Local import so the block does not depend on the file's import list.
        from io import BytesIO

        async with aiohttp.ClientSession() as session:
            # session.get() is already an async context manager; no extra await.
            async with session.get(url=url, headers=headers) as response:
                print(url)
                data = await response.read()

        html = data.decode('utf-8')
        cells = etree.HTML(data).xpath('//div[@class="col-md-1"]/text()')
        # Obfuscated numbers as shown in the markup; blank text nodes are skipped.
        number_list = [text.strip() for text in cells if text.strip()]

        # The closing parenthesis of url(...) must be escaped: the unescaped
        # form is an unbalanced group and makes re raise re.error.
        font_match = re.search(r'base64,(.*?)\) format', html)
        if font_match is None:
            raise ValueError('no embedded base64 font found in {}'.format(url))

        # TTFont accepts a file-like object, so no shared temp file is needed.
        font = TTFont(BytesIO(base64.b64decode(font_match.group(1))))

        # Inverted cmap: glyph name -> code point.
        glyph_to_codepoint = {
            glyph_name: codepoint
            for codepoint, glyph_name in font['cmap'].getBestCmap().items()
        }

        # Page character -> real digit, for every glyph whose (id - 1) is 0..9.
        res_dict = {}
        for glyph_name in font.getGlyphOrder():
            if glyph_name == '.notdef':
                continue
            digit = font.getGlyphID(glyph_name) - 1
            if 0 <= digit <= 9:
                # chr() turns the code point back into the character that
                # appears in the page text.
                res_dict[chr(glyph_to_codepoint[glyph_name])] = digit

        # int() instead of eval(): page content is untrusted, and eval()
        # rejects decimal literals with leading zeros such as "012".
        return sum(
            int(''.join(str(res_dict[ch]) for ch in shown))
            for shown in number_list
        )
    
    def cal_sum(begin):
        """Sum the decoded numbers of one batch of 100 consecutive pages.

        Intended to run in a worker thread: it creates its own event loop,
        fetches the pages of the batch one after another and closes the loop
        when done.

        Args:
            begin: 1-based batch index; batch ``k`` covers pages
                ``100*(k-1)+1`` through ``100*k``.

        Returns:
            int: sum of ``get_page_number`` over the batch.
        """
        first_page = 100 * (begin - 1) + 1

        url = 'http://glidedsky.com/level/web/crawler-font-puzzle-2?page={}'
        res = 0
        start_time = time.time()
        # Each thread needs its own loop; a loop cannot be shared across threads.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            for page in range(first_page, first_page + 100):
                # run_until_complete() wraps the coroutine itself, so no
                # ensure_future() is needed.
                res += loop.run_until_complete(get_page_number(url.format(page)))
        finally:
            # Release the loop's resources even when a page fails.
            loop.close()
        end_time = time.time()
        print(res)
        print('花费时间:{}s'.format(int(end_time - start_time)))
        return res
    
    """
    重新定义带返回值的线程类
    """
    class MyThread(Thread):
        """Thread that keeps the return value of the function it runs.

        Args:
            func: Callable invoked in the thread as ``func(args)``.
            args: The single argument passed to ``func`` (not a tuple).
        """

        def __init__(self, func, args):
            super().__init__()
            self.func = func
            self.args = args
            # Set up front so get_result() is well defined before the thread
            # has run (or if func raised), without a catch-all except.
            self.result = None

        def run(self):
            """Call ``func(args)`` and store its return value."""
            self.result = self.func(self.args)

        def get_result(self):
            """Return the stored result, or None if none has been produced."""
            return self.result
    
    
    if __name__ == '__main__':
        # cal_sum(k) handles pages 100*(k-1)+1 .. 100*k.  The sequential
        # version of this script crawls pages 1..1000, so ten workers are
        # needed; range(1, 10) started only nine and skipped pages 901-1000.
        list_thred = [MyThread(cal_sum, args=i) for i in range(1, 11)]
        for t in list_thred:
            t.start()
        for t in list_thred:
            t.join()

        res = 0
        for t in list_thred:
            partial = t.get_result()
            if partial is None:
                # The worker died before producing a value; fail with a clear
                # message instead of "int += NoneType".
                raise RuntimeError('worker for batch {} produced no result'.format(t.args))
            res += partial
        print(res)
    实例二,对实例一的改写
  • 相关阅读:
    超大文件排序
    透彻理解迪杰斯特拉算法
    Floyd-傻子也能看懂的弗洛伊德算法(转)
    轻松实现在浏览器上播放本地视频
    Caffeine缓存处理
    每日日报94
    每日日报93
    下载安装SQL server2008的步骤
    每日日报92
    每日日报91
  • 原文地址:https://www.cnblogs.com/854594834-YT/p/13934747.html
Copyright © 2011-2022 走看看