教程仅供技术研究学习使用,若有侵权,联系本人删除
以 https://www.toutiao.com/c/user/59672551416/#mid=1566273643580418 为例
1: 破解as、cp
使用万能的 alt+F
将js代码改写为python代码
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: zhibo.wang
# E-mail: gm.zhibo.wang@gmail.com
# Date  : 20/07/06 11:36:11
# Desc  :
"""
Reference JS, taken from
https://s3.pstatp.com/toutiao/resource/ntoutiao_web/static/js/common/lib_6b19209.js

    i = window.byted_acrawler && window.byted_acrawler.sign ? window.byted_acrawler.sign(o) : ""
"""

import hashlib
import math
import re
import time


def get_as_cp(timestamp=None):
    """Build the ``as`` / ``cp`` request parameters.

    Python port of the JS generation rule found in
    https://s3.pstatp.com/toutiao/resource/ntoutiao_web/page/profile/index_ae91792.js

        function(i) {
            var e = {};
            e.getHoney = function() {
                var i = Math.floor((new Date).getTime() / 1e3),
                    e = i.toString(16).toUpperCase(),
                    t = md5(i).toString().toUpperCase();
                if (8 != e.length) return {
                    as: "479BB4B7254C150",
                    cp: "7E0AC8874BB0985"
                };
                for (var o = t.slice(0, 5), n = t.slice(-5), a = "", s = 0; 5 > s; s++) a += o[s] + e[s];
                for (var r = "", c = 0; 5 > c; c++) r += e[c + 3] + n[c];
                return {
                    as: "A1" + a + e.slice(-3),
                    cp: e.slice(0, 3) + r + "E1"
                }
            }, i.ascp = e
        }

    :param timestamp: Unix time in seconds to derive the values from. When
        omitted (the default) the current time is used, which is the original
        behaviour. Fractional seconds are floored.
    :return: tuple ``(AS, CP)`` of two 15-character strings.
    """
    seconds = int(math.floor(time.time() if timestamp is None else timestamp))
    # '%X' yields upper-case hex digits only, so there is no '0x' prefix to strip.
    hex_ts = '%X' % seconds
    digest = hashlib.md5(str(seconds).encode('utf-8')).hexdigest().upper()

    # The JS falls back to fixed values whenever the hex timestamp is not
    # exactly 8 digits long.
    if len(hex_ts) != 8:
        return '479BB4B7254C150', '7E0AC8874BB0985'

    head = digest[:5]
    tail = digest[-5:]
    # "as" interleaves the first five digest chars with hex_ts[0:5];
    # "cp" interleaves hex_ts[3:8] with the last five digest chars.
    as_middle = ''.join(head[k] + hex_ts[k] for k in range(5))
    cp_middle = ''.join(hex_ts[k + 3] + tail[k] for k in range(5))

    AS = 'A1' + as_middle + hex_ts[-3:]
    CP = hex_ts[:3] + cp_middle + 'E1'
    return AS, CP
max_behot_time 参数:第一页传 0;后续各页传上一次请求返回数据中的 max_behot_time 值
_signature参数 也是最难处理的
全局搜索并打断点,找到生成该参数的文件:
https://s3.pstatp.com/toutiao/resource/ntoutiao_web/static/js/common/lib_6b19209.js i = window.byted_acrawler && window.byted_acrawler.sign ? window.byted_acrawler.sign(o) : "";
跟着断点一直走发现最终生成的文件是 VM621 也就是下面这张截图
将此文件内容全部拷贝 写入 sign.js中
// sign.js - prints the value returned by window.byted_acrawler.sign() for a URL.
//
// Usage: node sign.js <url> <cookies> <user-agent>
//   <url>        URL passed to byted_acrawler.sign()
//   <cookies>    cookie header string ("k1=v1; k2=v2; ...")
//   <user-agent> value exposed as navigator.userAgent
const jsdom = require("jsdom");
const { JSDOM } = jsdom;

if (process.argv.length < 5) {
  console.error("usage: node sign.js <url> <cookies> <user-agent>");
  process.exit(1);
}
const [, , baseurl, cookies, ua] = process.argv;

// NOTE(review): the document is created without a `url` option; confirm that
// document.cookie assignments are honored by jsdom in that configuration.
const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);

// The pasted script reads browser globals through `window`, so alias it to
// the Node global object.
global.window = global;

const document = dom.window.document;

// Browser-like objects copied onto the global object below.
const params = {
  location: {
    hash: "#mid=5954781019",
    host: "www.toutiao.com",
    hostname: "www.toutiao.com",
    href: "https://www.toutiao.com/c/user/59672551416/#mid=1566273643580418",
    origin: "https://www.toutiao.com",
    pathname: "/c/user/59672551416/#mid=1566273643580418",
    port: "",
    protocol: "https:",
    search: "",
  },
  navigator: {
    appCodeName: "Mozilla",
    appName: "Netscape",
    appVersion: "5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    cookieEnabled: true,
    deviceMemory: 8,
    doNotTrack: null,
    hardwareConcurrency: 4,
    language: "zh-CN",
    languages: ["zh-CN", "zh"],
    maxTouchPoints: 0,
    onLine: true,
    platform: "Win32",
    product: "Gecko",
    productSub: "20030107",
    userAgent: ua,
    vendor: "Google Inc.",
    vendorSub: "",
  },
  screen: {
    availHeight: 1040,
    availLeft: 0,
    availTop: 0,
    availWidth: 1920,
    colorDepth: 24,
    height: 1080,
    pixelDepth: 24,
    // The original literal had a bare `1920,` here, which is a syntax error.
    width: 1920,
  },
};
Object.assign(window, params);

/**
 * Write one cookie to the jsdom document.
 *
 * @param {string} name - Cookie name.
 * @param {string} value - Raw cookie value; it is passed through escape().
 * @param {number} [seconds] - Lifetime in seconds; 0 or omitted adds no
 *   `expires` attribute.
 */
function setCookie(name, value, seconds) {
  const ttlSeconds = Number(seconds) || 0;
  let expires = "";
  if (ttlSeconds !== 0) {
    const date = new Date();
    date.setTime(date.getTime() + ttlSeconds * 1000);
    expires = "; expires=" + date.toUTCString();
  }
  // escape() is kept so the stored cookie value is encoded exactly as before.
  document.cookie = name + "=" + escape(value) + expires + "; path=/";
}

// Example cookies argument:
// "csrftoken=a6f078a275e9f39b0addfb9df37fd890; tt_webid=6856639657595241992; s_v_web_id=verify_kde4odjw_isOUm41W_VbRS_4WS0_BrtZ_Ch1KLo5pkNV5;tasessionId=ownu4mas91596435834562; ttcid=1de8f696daab43dc8eb818a02408bd6930; tt_scid=P.PuhA.5OslBUeRVIYUAYFS--vw9l9LTWpc4-b4r7prsBwQ2X6extVf1PCjkhCNWc102"
for (const rawPair of cookies.split(";")) {
  const pair = rawPair.trim();
  // Split on the first "=" only, so values that contain "=" stay intact.
  const separator = pair.indexOf("=");
  if (separator <= 0) {
    continue; // empty segment or no cookie name
  }
  setCookie(pair.slice(0, separator), pair.slice(separator + 1), 1800);
}
window.document = document;

// Paste the copied script content here.

if (!window.byted_acrawler) {
  throw new Error(
    "window.byted_acrawler is not defined; paste the copied script into sign.js"
  );
}
window.byted_acrawler.init({
  aid: 24,
  dfp: true,
});

// Example:
// window.byted_acrawler.sign({url:"https://www.toutiao.com/api/pc/media_hot/?media_id=1566273643580418&user_id=59672551416"});
const signature = window.byted_acrawler.sign({ url: baseurl });
console.log(signature);
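请先安装 jsdom:npm i -g jsdom(全局安装时需将全局 node_modules 目录加入 NODE_PATH,否则 require("jsdom") 找不到模块;也可以改为在 sign.js 所在目录执行 npm i jsdom)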
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import os
import time
import math
import hashlib
import subprocess


def getHoney(timestamp=None):
    """Build the ``as`` / ``cp`` query parameters.

    :param timestamp: Unix time in seconds to derive the values from. When
        omitted (the default) the current time is used. Fractional seconds
        are floored.
    :return: dict with the keys ``'as'`` and ``'cp'``.
    """
    # int() keeps str(seconds) free of a trailing ".0" when floor() returns a float.
    seconds = int(math.floor(time.time() if timestamp is None else timestamp))
    hex_ts = '%X' % seconds
    digest = hashlib.md5(str(seconds).encode('utf-8')).hexdigest().upper()

    # Fixed fallback values when the hex timestamp is not exactly 8 digits.
    if len(hex_ts) != 8:
        return {
            'as': "479BB4B7254C150",
            'cp': "7E0AC8874BB0985"
        }

    head = digest[:5]
    tail = digest[-5:]
    as_middle = ''.join(head[k] + hex_ts[k] for k in range(5))
    cp_middle = ''.join(hex_ts[k + 3] + tail[k] for k in range(5))
    return {
        'as': "A1" + as_middle + hex_ts[-3:],
        'cp': hex_ts[:3] + cp_middle + "E1"
    }


def get_signature(url, cookies, ua):
    """Run ``node sign.js`` and return the ``&_signature=...`` query suffix.

    The arguments are passed as an argv list, not through a shell: the URL
    contains ``&``, the cookie string contains ``;`` and spaces, and the
    user agent contains spaces and parentheses, all of which a shell would
    interpret.

    :param url: URL to sign.
    :param cookies: cookie header string forwarded to sign.js.
    :param ua: user-agent string forwarded to sign.js.
    :return: ``"&_signature="`` followed by the script output with
        surrounding whitespace removed.
    :raises subprocess.CalledProcessError: if node exits with a non-zero status.
    :raises OSError: if the ``node`` executable cannot be started.
    """
    output = subprocess.check_output(
        ['node', 'sign.js', url, cookies, ua],
        universal_newlines=True,
    )
    # console.log() appends a newline, which must not end up in the URL.
    return "&_signature=" + output.strip()


if __name__ == '__main__':
    # Imported here so the helper functions above stay importable without the
    # third-party dependency.
    import requests

    headers = {
        'Referer':'https://www.toutiao.com/',
        'authority': 'www.toutiao.com',
        'method': 'GET',
        'path': '/c/user/59672551416/',
        'scheme': 'https',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': 'csrftoken=a6f078a275e9f39b0addfb9df37fd890; s_v_web_id=verify_kde4odjw_isOUm41W_VbRS_4WS0_BrtZ_Ch1KLo5pkNV5; ttcid=1de8f696daab43dc8eb818a02408bd6930; SLARDAR_WEB_ID=c7f55d5c-4dba-493d-a126-ce8e36b472bf; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6856656984092460551; tt_webid=6856656984092460551; __tasessionId=61hz1rirw1596442527425; tt_scid=UD3a5jP-6nL7yUaAawB2lLtCdtv430T-TJyynultVAGY6J4cY6KXTiH1QRWAYhb9e1f5',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    base_url = 'https://www.toutiao.com/toutiao'
    param = '/c/user/article/?page_type=1&user_id=59672551416&max_behot_time=0&count=20&as={as}&cp={cp}'.format(**getHoney())
    base_url += param
    signature = get_signature(
        base_url,
        headers["cookie"],
        headers["user-agent"]
    )
    # The 'path' header is kept in sync with the signed request path.
    path = param + signature
    headers['path'] = path
    url = base_url + signature
    print(url)
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    print(response.text)
python test.py