zoukankan      html  css  js  c++  java
  • 百度翻译爬虫-Web版(自动生成sign)

     1 # 面向对象
     2 # 百度翻译 -- 网页版(自动获取token,sign)
     3 import requests
     4 import js2py
     5 import json
     6 import re
     7 
     8 
     class WebFanyi:
         """Scraper for the web version of Baidu Translate.

         Fetches the page token and gtk, derives the request ``sign`` by running
         ``webtrans.js`` through js2py, detects the source language and prints
         the translation of ``query_str``.
         """

         def __init__(self, query_str):
             """Create an HTTP session with a browser User-Agent and store the query.

             Args:
                 query_str: Text to translate.
             """
             self.session = requests.session()
             headers = {
                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
             }
             self.session.headers = headers
             self.baidu_url = "https://www.baidu.com/"
             self.root_url = "https://fanyi.baidu.com/"
             self.lang_url = "https://fanyi.baidu.com/langdetect"
             self.trans_url = "https://fanyi.baidu.com/v2transapi"
             self.query_str = query_str

         def get_token_gtk(self):
             """Return ``(token, gtk)`` scraped from the Baidu Translate home page.

             Both values are needed later: gtk to compute the sign, token as a
             field of the translation request.

             Raises:
                 IndexError: If either value cannot be found in the page.
             """
             # NOTE(review): the page is requested twice and the first response
             # is discarded; presumably the first request only sets the cookies
             # needed for the second response to carry a valid token - confirm.
             self.session.get(self.root_url)
             resp = self.session.get(self.root_url)
             html_str = resp.content.decode()
             token = re.findall(r"token: '(.*?)'", html_str)[0]
             gtk = re.findall(r"window.gtk = '(.*?)'", html_str)[0]
             return token, gtk

         @staticmethod
         def _inject_gtk(js_source, gtk):
             """Return *js_source* with each literal ``window[l]`` replaced by the quoted gtk.

             A plain string replacement is used on purpose: as a regular
             expression, ``window[l]`` is ``window`` followed by the character
             class ``[l]``, which matches ``windowl`` and never the literal text.
             """
             return js_source.replace("window[l]", '"' + gtk + '"')

         def generate_sign(self, gtk):
             """Compute the request sign for ``self.query_str``.

             Reads ``webtrans.js`` from the current working directory, injects
             the gtk value into it, and calls its ``e`` function via js2py.

             Args:
                 gtk: The gtk string returned by ``get_token_gtk``.

             Returns:
                 The sign produced by the JavaScript ``e`` function.
             """
             with open('webtrans.js', encoding='utf8') as f:
                 js_data = f.read()
             # Set up the JavaScript evaluation context and load the patched script.
             context = js2py.EvalJs()
             context.execute(self._inject_gtk(js_data, gtk))
             sign = context.e(self.query_str)
             return sign

         def lang_detect(self):
             """Return ``(source, target)`` language codes, e.g. ``("zh", "en")``.

             The source language comes from the langdetect endpoint; the target
             is "en" when the source is "zh", otherwise "zh".
             """
             lang_resp = self.session.post(self.lang_url, data={"query": self.query_str})
             # Example response: {"error":0,"msg":"success","lan":"zh"}
             lang_json_str = lang_resp.content.decode()
             lan = json.loads(lang_json_str)['lan']
             to = "en" if lan == "zh" else "zh"
             return lan, to

         def parse_url(self, post_data):
             """Post the translation request and print ``"<query>: <translation>"``.

             Args:
                 post_data: Form fields for the v2transapi endpoint.
             """
             trans_resp = self.session.post(self.trans_url, data=post_data)
             trans_json_str = trans_resp.content.decode()
             trans_json = json.loads(trans_json_str)
             result = trans_json["trans_result"]["data"][0]["dst"]
             print("{}: {}".format(self.query_str, result))

         def run(self):
             """Run the whole flow: cookies, token/gtk, sign, language, translation."""
             # 1. Pick up the Baidu home page cookies (without them the API
             #    always fails with error 998).
             self.session.get(self.baidu_url)
             # 2. Get the Baidu Translate token and gtk (gtk is used to build the sign).
             token, gtk = self.get_token_gtk()
             # 3. Generate the sign.
             sign = self.generate_sign(gtk)
             # 4. Detect the translation direction, e.g. zh --> en.
             lan, to = self.lang_detect()
             # 5. Send the request and print the result.
             post_data = {
                 "from": lan,
                 "to": to,
                 "query": self.query_str,
                 "transtype": "realtime",
                 "simple_means_flag": 3,
                 "sign": sign,
                 "token": token
             }
             self.parse_url(post_data)

    if __name__ == '__main__':
        # Script entry point: translate the sample word and print the result.
        WebFanyi('lover').run()

    上述代码中用于生成sign的 webtrans.js 文件具体代码如下(可以自己抓包,在js中打断点获取):

     1 // webtrans.js
     2 
     // Mixes the 32-bit value r according to the op string o, which is read in
     // groups of three characters: [combine op][shift direction][shift amount].
     function n(r, o) {
         var acc = r;
         for (var i = 0; i + 2 < o.length; i += 3) {
             var combine = o.charAt(i);
             var direction = o.charAt(i + 1);
             var digit = o.charAt(i + 2);
             // Shift amount: "a".."f" mean 10..15, anything else is parsed as a number.
             var amount = digit >= "a" ? digit.charCodeAt(0) - 87 : Number(digit);
             // "+" selects an unsigned right shift, anything else a left shift.
             var shifted = "+" === direction ? acc >>> amount : acc << amount;
             // "+" adds (truncated to 32 bits), anything else XORs.
             acc = "+" === combine ? (acc + shifted) & 4294967295 : acc ^ shifted;
         }
         return acc;
     }
    // Computes the Baidu Translate "sign" string for the query text r.
    // Relies on the sibling function n() and on the gtk value, which the
    // caller injects by textually replacing the `window[l]` placeholder below.
    function e(r) {
        // A UTF-16 surrogate pair, i.e. one character outside the BMP.
        // The \u escapes are required: without them the classes match ordinary
        // ASCII characters and plain text takes the wrong branch.
        var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
        if (null === o) {
            var t = r.length;
            // Longer than 30 code units: keep the first 10, the middle 10 and the last 10.
            t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10));
        } else {
            // Build a character list in which each surrogate pair is a single
            // element, so that truncation never cuts a pair in half.
            // split("") already returns an array, so it is pushed directly
            // (the listing wrapped it in a helper `a` that is not defined here).
            for (var parts = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = parts.length, f = []; h > C; C++) {
                "" !== parts[C] && f.push.apply(f, parts[C].split(""));
                C !== h - 1 && f.push(o[C]);
            }
            var g = f.length;
            g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""));
        }
        // In the page script, l is the string "gtk"
        // (String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107)),
        // so window[l] is window.gtk. Keep the text `window[l]` exactly as is:
        // the caller replaces it with the quoted gtk before running this file.
        var i = null;
        var u = null !== i ? i : (i = window[l] || "") || "";
        // gtk has the form "m.s"; both halves are used as integers (0 if missing).
        // The loop encodes r as UTF-8 bytes into S.
        for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
            var A = r.charCodeAt(v);
            128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
            S[c++] = A >> 18 | 240,
            S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
            S[c++] = A >> 6 & 63 | 128),
            S[c++] = 63 & A | 128)
        }
        // Op strings for n(); the listing builds the same text with String.fromCharCode.
        var F = "+-a^+6";
        var D = "+-3^+b+-f";
        // Fold every byte into the running value, starting from m.
        var p = m;
        for (var b = 0; b < S.length; b++) {
            p += S[b];
            p = n(p, F);
        }
        p = n(p, D);
        p ^= s;
        // Reinterpret a negative 32-bit result as unsigned.
        if (0 > p) {
            p = (2147483647 & p) + 2147483648;
        }
        p %= 1e6;
        return p.toString() + "." + (p ^ m);
    }

    实际上,除了使用 js2py 在 Python 中执行 JS 代码之外,还可以使用另一个库 execjs,不过需要先通过 pip install PyExecJS 安装 PyExecJS 模块。具体实现代码如下(其中 gtk 和 query_str 需要事先按上文的方式取得):

    import execjs

    # gtk and query_str are expected to be defined already by the surrounding code.
    with open("webtrans.js", encoding="utf8") as f:
        js_data = f.read()
    # Literal text replacement: as a regular expression, "window[l]" would be
    # "window" followed by the character class [l] and would never match the
    # placeholder itself.
    js_data = js_data.replace("window[l]", '"' + gtk + '"')
    # Call the function e defined in webtrans.js with query_str as its argument.
    sign = execjs.compile(js_data).call("e", query_str)
    print(sign)
    <人追求理想之时,便是坠入孤独之际.> By 史泰龙
  • 相关阅读:
    java中<> 的用法
    Java 中 compareTo方法问题
    class AClass<E extends Comparable>与class AClass<E extends Comaprable<E>>有什么区别?
    zookeeper常用命令
    Storm概念讲解和工作原理介绍
    Storm集群安装部署步骤【详细版】
    Error contacting service. It is probably not running.
    Exception in thread "main" expected '<document start>', but found BlockMappingStart in 'reader', line 23, column 2: nimbus.host: "master"
    zookeeper 启动失败 BindException: Address already in use 或者Error contacting service. It is probably not running
    scss的安装使用
  • 原文地址:https://www.cnblogs.com/jason-Gan/p/10567018.html
Copyright © 2011-2022 走看看