zoukankan      html  css  js  c++  java
  • 汽车之家反爬

    修改转换编码方式进行破解

    只是为练习字体反爬

    #!/usr/bin/env python  
    # encoding: utf-8  
    from requests_html import HTMLSession
    import re
    import os
    from fontTools.ttLib import TTFont
    
    class QiCheZhiJia():
        """Practice scraper that undoes the custom-font obfuscation on an Autohome forum thread.

        The page text contains characters that are only meaningful through a
        web font served with the page.  This class downloads that font, derives
        the obfuscated characters from the font's glyph names and replaces them
        in the post text with the characters listed in ``self.hanzi``.
        """

        # Matches the second ``url('...')`` entry of the font declaration, i.e.
        # the text between "'),url('" and the following "')".  The parentheses
        # must be escaped: unescaped they are regex group delimiters and the
        # pattern does not even compile.
        _FONT_URL_RE = re.compile(r"'\),url\('(.*?)'\)")

        # Glyph names of the form "uniXXXX" (four hex digits).
        _UNI_GLYPH_RE = re.compile(r"uni([0-9A-Fa-f]{4})")

        def __init__(self):
            """Set the target thread URL, the replacement characters and an HTTP session."""
            self.url="https://club.autohome.com.cn/bbs/thread/bb8c36ced93ce182/74203500-1.html"
            # Replacement characters; entry i is paired with the i-th glyph
            # (after the first one) of the downloaded font.
            # NOTE(review): this assumes the font's glyph order matches this
            # list -- confirm against the font actually served.
            self.hanzi=['不','了','呢','更','是','四','小','七','三','多','得','一','着','下','十','少','长','二','六','远','左','地','短','九','五','上','坏','很','右','低','高','矮','八','近','大','好','的','和']
            self.session=HTMLSession()
            # "&#x...." entity prefix -> replacement character.
            self.f_dict={}
            # Full glyph order of the loaded font (including the first glyph).
            self.uniWordList=[]
            # Obfuscated characters, aligned index-by-index with self.hanzi.
            self.utf8WordList=[]

        @staticmethod
        def _glyph_to_char(glyph_name):
            """Return the character named by a ``uniXXXX`` glyph name.

            Names that do not have that exact form are returned unchanged so the
            resulting list stays aligned with the glyph order.
            """
            matched = QiCheZhiJia._UNI_GLYPH_RE.fullmatch(glyph_name)
            if matched is None:
                return glyph_name
            return chr(int(matched.group(1), 16))

        def create_font(self,font_url):
            """Load the font at *font_url* and build the de-obfuscation tables.

            The font file is cached under ``./fonts`` (keyed by the last path
            segment of the URL) and only downloaded when it is not there yet.
            Fills ``self.f_dict``, ``self.uniWordList`` and ``self.utf8WordList``
            and prints the latter.

            Args:
                font_url: absolute URL of the font file.
            """
            font_dir = "./fonts"
            font_file = font_url.split('/')[-1]
            font_path = os.path.join(font_dir, font_file)

            os.makedirs(font_dir, exist_ok=True)

            if not os.path.exists(font_path):
                # Not cached yet: download the font.
                print('不在字体库中, 下载:', font_file)
                response = self.session.get(font_url)
                # Fail here rather than caching an error page as a font file.
                response.raise_for_status()
                with open(font_path, 'wb') as f:
                    f.write(response.content)

            font = TTFont(font_path)
            glyph_order = font.getGlyphOrder()
            # The first glyph is skipped; the remaining ones are paired with
            # self.hanzi by position.  zip() stops at the shorter sequence, so
            # a font with more glyphs than replacement characters no longer
            # raises IndexError.
            mapped_glyphs = glyph_order[1:]
            for gly, hanzi in zip(mapped_glyphs, self.hanzi):
                self.f_dict.setdefault(gly.lower().replace('uni','&#x'), hanzi)

            self.uniWordList = glyph_order
            # Turn each "uniXXXX" glyph name into the actual character U+XXXX so
            # it can be searched for in the decoded page text.
            self.utf8WordList = [self._glyph_to_char(name) for name in mapped_glyphs]
            print(self.utf8WordList)

        def run(self):
            """Fetch the thread, load its font and print the de-obfuscated post text."""
            req=self.session.get(self.url)
            font_url=self.parse(req.text)
            self.create_font(font_url)
            info=req.html.xpath("//div[@class='tz-paragraph' and string-length(text())>1]//text()")
            print(info)
            elem="".join(info)
            # Replace every custom-font character with its standard character.
            for obfuscated, hanzi in zip(self.utf8WordList, self.hanzi):
                elem = elem.replace(obfuscated, hanzi)
            print(elem)

        def parse(self,source):
            """Extract the font URL from the page source.

            Args:
                source: HTML text of the page.

            Returns:
                The first captured URL, prefixed with ``http:``.

            Raises:
                ValueError: if no font declaration is found in *source*.
            """
            matched = self._FONT_URL_RE.search(source)
            if matched is None:
                raise ValueError("font URL not found in page source")
            return "http:" + matched.group(1)
    # Script entry point: build the scraper and run it once.
    if __name__ == '__main__':
        scraper = QiCheZhiJia()
        scraper.run()
    
  • 相关阅读:
    POJ 1887 Testing the CATCHER
    HDU 3374 String Problem
    HDU 2609 How many
    POJ 1509 Glass Beads
    POJ 1458 Common Subsequence
    POJ 1159 Palindrome
    POJ 1056 IMMEDIATE DECODABILITY
    POJ 3080 Blue Jeans
    POJ 1200 Crazy Search
    软件体系结构的艺术阅读笔记1
  • 原文地址:https://www.cnblogs.com/c-x-a/p/9288841.html
Copyright © 2011-2022 走看看