  • 今日头条反反爬思路总结


      今日头条 (www.toutiao.com)


    • 首先使用浏览器访问 www.toutiao.com 进入头条主页, 在向下滚动窗口查看更多内容时, 发现新内容是动态加载的, 利用快捷键 F12 打开浏览器开发者工具监控 Elements 面板下的 DOM 树, 发现动态生成的标签, 进一步验证得知头条主页内容采用异步请求动态加载
    • 接下来分析网络请求
      • 快捷键 Ctrl + F5 重新加载页面资源, 切换到 Network 面板下 XHR 选项卡查看所有的 XHR 类型的请求
      • 初始的所有 XHR 类型请求
      • 向下滚动窗口, 直到加载新的内容停止滚动, 此时出现新的 XHR 类型请求
      • 发现两个很相似的请求
      • 点击查看其详细信息
        • Headers 中包含了 Request Headers(请求头), Query String Parameters(查询字符串参数)
        • ?min_behot_time=156955... 如下
        • ?max_behot_time=0.... 如下
        • 对比后发现这两个请求只有部分请求参数是变动的
          • min_behot_time  /  max_behot_time 
          • as
          • cp
        • 接下来的分析着重考虑这三个参数的生成机制
        • Preview (预览响应内容) 查看响应数据和其结构
        • ?min_behot_time=0.... 如下
        • 发现 min_behot_time=... 这个请求获得的响应中包含 ?max_behot_time=... 这个请求需要的max_behot_time 请求参数(next: {max_behot_time: 1569556156})
        • 分析 min_behot_time=... 这个请求可能为获取到初始的动态加载内容
        • ?min_behot_time=0.... 如下
        • ?max_behot_time=0.... 如下
        • 利用正则表达式全局搜索参数 as 和 cp
        • 发现一个名为index.d337d64118bf9b864485.js的文件中存在匹配项
        • 继而发现自定制的加密算法
        • 接下来采用 Debug 调试 JS 代码, 了解上述参数的具体生成机制

        • 首先找到 index.d337d64118bf9b864485.js 文件


        • 打开文件, 接着找到加密函数 a() 添加断点 (了解其机制)
        • 接下来要找到 ?min_behot_time=... 和 ?max_behot_time=... 这两个异步请求是如何发起的
        • Initiator 标记请求是由哪个对象或进程发起的 (请求源)
          • 1.跳转到 Sources 面板:

          • 2.查看格式化后的代码, 发现发起请求的外层函数 l(t) , 添加断点进行调试(了解其机制)

          • 3.清空所有 XHR 类型的请求:

          • 清空后:

          • 4.快捷键 Ctrl + F5 重新加载页面, 在第一个断点处暂停

          • 5. 这时查看所有的 XHR 类型的请求
          • 证明了上面的分析结果: "分析 min_behot_time=... 这个请求可能为获取到初始的动态加载内容"
          • 6. 现在开始 Debug

          • 获取请求路径
          • 获取动态参数 as 和 cp

          • 变量 i 赋值时 (0, o.default)(t) 发生跳转, 相当于执行了 k(t)
          • 调用 k(t) 后一系列连锁调用

          • 分析上面的代码发现其非常类似 MD5 信息摘要算法

          • 可查询md5.js进行比对
          • 获得结果值
          • 利用Python 标准库 hashlib 验证结果是否与以上分析相吻合
          • 结果相同, 证明上述分析正确
          • 接下来开始拼接查询参数
          • 开始构建异步请求
          • 设置请求头
          • 发起请求后回到最初断点处
          • 完成上述步骤后, 在 Network 面板下的 XHR 选项卡中可以看到 ?min_behot_time=0 请求已完成, 其中查询字符串参数完全符合上述步骤中所生成的动态参数
      • 到此分析过程结束


    # 导入相关模块
    import time
    import math
    import datetime
    import json
    import hashlib
    from urllib.parse import urlencode
    import execjs
    import requests
    import xlsxwriter
    from pymongo import MongoClient
    # Basic configuration
    # Open the MongoDB connection.
    client = MongoClient("localhost", 27017)
    # Select the database used by this script.
    db = client["Toutiaopro"]
    # Site root; requested once below so the session collects its cookies.
    url = "https://www.toutiao.com/"
    # Request headers. Adjacent string literals are concatenated by the parser
    # with nothing in between, so every fragment except the last ends with an
    # explicit space to keep the User-Agent tokens separated.
    # NOTE(review): the trailing "Edg/" has no version number in the original;
    # it is kept as-is here -- confirm the intended Edge version.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/79.0.3907.0 Safari/537.36 Edg/"
        ),
    }
    # One session object shared by every request so cookies persist.
    session = requests.Session()
    # Visit the home page first to obtain cookies.
    session.get(url=url, headers=headers)
    # Feed API URL template. Placeholders, in order: the behot_time query
    # fragment (e.g. "min_behot_time=0"), the "as" value and the "cp" value.
    start_url = "https://www.toutiao.com/api/pc/feed/?{}&category=__all__&utm_source=toutiao&widen=1&tadrequire=true&as={}&cp={}"
    # 定义加密函数(方式一)
    def get_md5(_str):
        """Return the lowercase hexadecimal MD5 digest of ``_str``.

        Args:
            _str: The value to hash. Text is encoded as UTF-8 first; ``bytes``
                and ``bytearray`` are hashed unchanged.

        Returns:
            The 32-character hexadecimal digest.
        """
        if isinstance(_str, (bytes, bytearray)):
            payload = bytes(_str)
        else:
            payload = str(_str).encode("utf-8")
        # The input has to be fed to the hash object; hashing nothing always
        # yields the digest of the empty string.
        return hashlib.md5(payload).hexdigest()
    def get_params(timestamp=None):
        """Build the dynamic ``as`` and ``cp`` query parameters (pure Python).

        The values are derived from a Unix timestamp in seconds: its
        upper-case hexadecimal form is interleaved with the first and last
        five characters of the upper-case MD5 hex digest of its decimal form.

        Args:
            timestamp: Unix time in seconds (int or numeric string). Defaults
                to the current time, which is what the original zero-argument
                call did.

        Returns:
            A dict with the keys ``"as"`` and ``"cp"``. When the hexadecimal
            timestamp is not exactly 8 digits long, a fixed fallback pair is
            returned instead.
        """
        if timestamp is None:
            timestamp = math.floor(time.time())
        t = str(int(timestamp))
        # Upper-case hex digits, as the JavaScript version of this routine
        # does with toString(16).toUpperCase().
        e = format(int(t), "X")
        # Hash inline so the result does not depend on any other helper.
        i = hashlib.md5(t.encode("utf-8")).hexdigest().upper()
        if len(e) != 8:
            return {
                "as": "479BB4B7254C150",
                "cp": "7E0AC8874BB0985",
            }
        head = i[:5]
        tail = i[-5:]
        # "as": digest-head characters alternating with hex digits 0..4.
        s = "".join(head[r] + e[r] for r in range(5))
        # "cp": hex digits 3..7 alternating with digest-tail characters.
        l = "".join(e[u + 3] + tail[u] for u in range(5))
        return {
            "as": "A1" + s + e[-3:],
            "cp": e[:3] + l + "E1",
        }
    # 定义加密函数(方式二)

    # 基于以上分析修改 js 文件如下 jsCode.js

    /**
     * Return the lowercase hexadecimal MD5 digest of `n`.
     *
     * `n` may be a string or a number; it is converted to a UTF-8 byte string
     * by `v` before hashing. All helpers are private to this function.
     */
    function m (n) {
        // Add two 32-bit integers with wrap-around, working on 16-bit halves
        // so the intermediate sums stay exact.
        function s(t, e) {
            var i = (65535 & t) + (65535 & e),
                n = (t >> 16) + (e >> 16) + (i >> 16);
            return n << 16 | 65535 & i
        }
        // Rotate the 32-bit value t left by e bits.
        function o(t, e) {
            return t << e | t >>> 32 - e
        }
        // Shared step of every round: add, rotate left by a, add i.
        function r(t, e, i, n, a, r) {
            return s(o(s(s(e, t), s(n, r)), a), i)
        }
        // Round 1 step.
        function l(t, e, i, n, a, s, o) {
            return r(e & i | ~e & n, t, e, a, s, o)
        }
        // Round 2 step.
        function u(t, e, i, n, a, s, o) {
            return r(e & n | i & ~n, t, e, a, s, o)
        }
        // Round 3 step.
        function c(t, e, i, n, a, s, o) {
            return r(e ^ i ^ n, t, e, a, s, o)
        }
        // Round 4 step.
        function d(t, e, i, n, a, s, o) {
            return r(i ^ (e | ~n), t, e, a, s, o)
        }
        // Core transform: t is an array of 32-bit words, e the message length
        // in bits. Appends the padding bit and the length, then processes the
        // words 16 at a time. Returns the four state words.
        function h(t, e) {
            t[e >> 5] |= 128 << e % 32,
            t[(e + 64 >>> 9 << 4) + 14] = e;
            var i, n, a, o, r, h = 1732584193,
                _ = -271733879,
                m = -1732584194,
                p = 271733878;
            for (i = 0; i < t.length; i += 16) {
                // Remember the state at the start of the block; it is added
                // back in after the 64 steps.
                n = h,
                a = _,
                o = m,
                r = p,
                h = l(h, _, m, p, t[i], 7, -680876936),
                p = l(p, h, _, m, t[i + 1], 12, -389564586),
                m = l(m, p, h, _, t[i + 2], 17, 606105819),
                _ = l(_, m, p, h, t[i + 3], 22, -1044525330),
                h = l(h, _, m, p, t[i + 4], 7, -176418897),
                p = l(p, h, _, m, t[i + 5], 12, 1200080426),
                m = l(m, p, h, _, t[i + 6], 17, -1473231341),
                _ = l(_, m, p, h, t[i + 7], 22, -45705983),
                h = l(h, _, m, p, t[i + 8], 7, 1770035416),
                p = l(p, h, _, m, t[i + 9], 12, -1958414417),
                m = l(m, p, h, _, t[i + 10], 17, -42063),
                _ = l(_, m, p, h, t[i + 11], 22, -1990404162),
                h = l(h, _, m, p, t[i + 12], 7, 1804603682),
                p = l(p, h, _, m, t[i + 13], 12, -40341101),
                m = l(m, p, h, _, t[i + 14], 17, -1502002290),
                _ = l(_, m, p, h, t[i + 15], 22, 1236535329),
                h = u(h, _, m, p, t[i + 1], 5, -165796510),
                p = u(p, h, _, m, t[i + 6], 9, -1069501632),
                m = u(m, p, h, _, t[i + 11], 14, 643717713),
                _ = u(_, m, p, h, t[i], 20, -373897302),
                h = u(h, _, m, p, t[i + 5], 5, -701558691),
                p = u(p, h, _, m, t[i + 10], 9, 38016083),
                m = u(m, p, h, _, t[i + 15], 14, -660478335),
                _ = u(_, m, p, h, t[i + 4], 20, -405537848),
                h = u(h, _, m, p, t[i + 9], 5, 568446438),
                p = u(p, h, _, m, t[i + 14], 9, -1019803690),
                m = u(m, p, h, _, t[i + 3], 14, -187363961),
                _ = u(_, m, p, h, t[i + 8], 20, 1163531501),
                h = u(h, _, m, p, t[i + 13], 5, -1444681467),
                p = u(p, h, _, m, t[i + 2], 9, -51403784),
                m = u(m, p, h, _, t[i + 7], 14, 1735328473),
                _ = u(_, m, p, h, t[i + 12], 20, -1926607734),
                h = c(h, _, m, p, t[i + 5], 4, -378558),
                p = c(p, h, _, m, t[i + 8], 11, -2022574463),
                m = c(m, p, h, _, t[i + 11], 16, 1839030562),
                _ = c(_, m, p, h, t[i + 14], 23, -35309556),
                h = c(h, _, m, p, t[i + 1], 4, -1530992060),
                p = c(p, h, _, m, t[i + 4], 11, 1272893353),
                m = c(m, p, h, _, t[i + 7], 16, -155497632),
                _ = c(_, m, p, h, t[i + 10], 23, -1094730640),
                h = c(h, _, m, p, t[i + 13], 4, 681279174),
                p = c(p, h, _, m, t[i], 11, -358537222),
                m = c(m, p, h, _, t[i + 3], 16, -722521979),
                _ = c(_, m, p, h, t[i + 6], 23, 76029189),
                h = c(h, _, m, p, t[i + 9], 4, -640364487),
                p = c(p, h, _, m, t[i + 12], 11, -421815835),
                m = c(m, p, h, _, t[i + 15], 16, 530742520),
                _ = c(_, m, p, h, t[i + 2], 23, -995338651),
                h = d(h, _, m, p, t[i], 6, -198630844),
                p = d(p, h, _, m, t[i + 7], 10, 1126891415),
                m = d(m, p, h, _, t[i + 14], 15, -1416354905),
                _ = d(_, m, p, h, t[i + 5], 21, -57434055),
                h = d(h, _, m, p, t[i + 12], 6, 1700485571),
                p = d(p, h, _, m, t[i + 3], 10, -1894986606),
                m = d(m, p, h, _, t[i + 10], 15, -1051523),
                _ = d(_, m, p, h, t[i + 1], 21, -2054922799),
                h = d(h, _, m, p, t[i + 8], 6, 1873313359),
                p = d(p, h, _, m, t[i + 15], 10, -30611744),
                m = d(m, p, h, _, t[i + 6], 15, -1560198380),
                _ = d(_, m, p, h, t[i + 13], 21, 1309151649),
                h = d(h, _, m, p, t[i + 4], 6, -145523070),
                p = d(p, h, _, m, t[i + 11], 10, -1120210379),
                m = d(m, p, h, _, t[i + 2], 15, 718787259),
                _ = d(_, m, p, h, t[i + 9], 21, -343485551),
                h = s(h, n),
                _ = s(_, a),
                m = s(m, o),
                p = s(p, r);
            }
            return [h, _, m, p]
        }
        // Convert an array of 32-bit words to a string of byte-valued
        // characters, lowest byte of each word first.
        function _(t) {
            var e, i = "";
            for (e = 0; e < 32 * t.length; e += 8) {
                i += String.fromCharCode(t[e >> 5] >>> e % 32 & 255);
            }
            return i
        }
        // Convert a string of byte-valued characters to an array of 32-bit
        // words, lowest byte of each word first.
        function m(t) {
            var e, i = [];
            // Pre-size the word array and zero it before OR-ing bytes in.
            for (i[(t.length >> 2) - 1] = void 0, e = 0; e < i.length; e += 1) {
                i[e] = 0;
            }
            for (e = 0; e < 8 * t.length; e += 8) {
                i[e >> 5] |= (255 & t.charCodeAt(e / 8)) << e % 32;
            }
            return i
        }
        // Raw (binary string) digest of the byte string t.
        function p(t) {
            return _(h(m(t), 8 * t.length))
        }
        // Raw keyed digest of data e under key t: the key words are XOR-ed
        // with 0x36363636 / 0x5c5c5c5c to form the inner and outer pads.
        function f(t, e) {
            var i, n, a = m(t),
                s = [],
                o = [];
            for (s[15] = o[15] = void 0, a.length > 16 && (a = h(a, 8 * t.length)), i = 0; i < 16; i += 1) {
                s[i] = 909522486 ^ a[i],
                o[i] = 1549556828 ^ a[i];
            }
            return n = h(s.concat(m(e)), 512 + 8 * e.length),
            _(h(o.concat(n), 640))
        }
        // Encode a string of byte-valued characters as lowercase hex.
        function g(t) {
            var e, i, n = "0123456789abcdef",
                a = "";
            for (i = 0; i < t.length; i += 1) {
                e = t.charCodeAt(i),
                a += n.charAt(e >>> 4 & 15) + n.charAt(15 & e);
            }
            return a
        }
        // Encode t as a UTF-8 byte string (also stringifies numbers).
        function v(t) {
            return unescape(encodeURIComponent(t))
        }
        // Raw digest of t.
        function w(t) {
            return p(v(t))
        }
        // Hex digest of t.
        function y(t) {
            return g(w(t))
        }
        // Raw keyed digest of e under key t.
        function b(t, e) {
            return f(v(t), v(e))
        }
        // Hex keyed digest of e under key t.
        function x(t, e) {
            return g(b(t, e))
        }
        // Dispatcher: e is an optional key, i selects raw instead of hex output.
        function k(t, e, i) {
            return e ? i ? b(e, t) : x(e, t) : i ? w(t) : y(t)
        }
        // No key, hex output: the plain hex digest of n.
        return k(n, 0,0);
    }
     /**
      * Build the dynamic `as` and `cp` query parameters from a timestamp.
      *
      * @param s Unix time in seconds, as a number or decimal string.
      * @returns An object with the properties `as` and `cp`. When the
      *          hexadecimal form of the timestamp is not exactly 8 digits
      *          long, a fixed fallback pair is returned.
      */
     function o(s) {
         // Explicit radix so the input is always read as decimal.
         var e = parseInt(s, 10),
             t = e.toString(16).toUpperCase(),
             i = m(e).toString().toUpperCase();
         if (8 != t.length) {
             return {
                 as: "479BB4B7254C150",
                 cp: "7E0AC8874BB0985"
             };
         }
         var head = i.slice(0, 5),
             tail = i.slice(-5),
             r = "",
             l = "",
             c;
         // "as": digest-head characters alternating with hex digits 0..4.
         for (c = 0; c < 5; c++) {
             r += head[c] + t[c];
         }
         // "cp": hex digits 3..7 alternating with digest-tail characters.
         for (c = 0; c < 5; c++) {
             l += t[c + 3] + tail[c];
         }
         return {
             as: "A1" + r + t.slice(-3),
             cp: t.slice(0, 3) + l + "E1"
         };
     }

    # 定义加密函数

    def get_params():
        """Compute the request parameters by running ``o`` from ``./jsCode.js``.

        The current Unix time (whole seconds, as a string) is passed to the
        JavaScript function ``o``; whatever that call returns is handed back
        to the caller unchanged.
        """
        current_second = str(math.floor(time.time()))
        with open("./jsCode.js", 'r', encoding="utf-8") as script_file:
            script_source = script_file.read()
        js_context = execjs.compile(script_source)
        return js_context.call("o", current_second)
    # 定义函数生成 xls 文件
    def data2xls(data_list):
        """Write the collected items to an ``.xlsx`` file named after today.

        The workbook gets one sheet with a bold header row followed by one row
        per item. Each column is filled from the item key of the same name;
        a missing key leaves the cell empty.

        Args:
            data_list: Iterable of mappings (anything with ``.get``), one per
                item.
        """
        # (item key / header text, column width) in sheet order.
        columns = (
            ("chinese_tag", 20),
            ("media_avatar_url", 65),
            ("title", 70),
            ("abstract", 255),
            ("tag", 25),
            ("source_url", 30),
            ("source", 20),
            ("media_url", 75),
        )
        # The workbook is only written to disk when it is closed; the context
        # manager guarantees that happens, even if a write raises.
        with xlsxwriter.Workbook('{}.xlsx'.format(datetime.date.today())) as workbook:
            cell_format = workbook.add_format({
                'border': 1,
                'text_wrap': 1
            })
            merge_format = workbook.add_format({
                'bold': True,
                'border': 1,
                'text_wrap': 1
            })
            worksheet = workbook.add_worksheet("首页新闻")
            for col, (key, width) in enumerate(columns):
                worksheet.write(0, col, key, merge_format)
                worksheet.set_column(col, col, width)
            # Data rows start below the header row.
            for row, data in enumerate(data_list, start=1):
                for col, (key, _width) in enumerate(columns):
                    worksheet.write(row, col, data.get(key), cell_format)
    # 定义 main 函数
    def main(timeparam):
        """Fetch one page of the feed API and return its items and cursor.

        Args:
            timeparam: The behot_time query fragment to request, e.g.
                ``"min_behot_time=0"``.

        Returns:
            A dict with ``"data"`` (the response's ``data`` field) and
            ``"next_timestamp"`` (the response's ``next`` field).

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If the response JSON lacks ``data`` or ``next``.
        """
        params_date = get_params()
        new_url = start_url.format(timeparam, params_date["as"], params_date["cp"])
        response = session.get(url=new_url, headers=headers)
        # Surface HTTP errors directly rather than failing later while trying
        # to decode an error page as JSON.
        response.raise_for_status()
        result = response.json()
        return {
            "data": result["data"],
            "next_timestamp": result["next"],
        }
    # 执行 main 函数
    if __name__ == "__main__":
        # Items gathered from every page.
        data_list = []
        # The first request asks for the initial page of the feed.
        timeparam = "min_behot_time=0"
        for _ in range(10):
            result = main(timeparam)
            # Keep the page's items instead of discarding them.
            # Assumes "data" is a list of item dicts -- TODO confirm.
            data_list.extend(result["data"] or [])
            # The "next" mapping from the response becomes the query fragment
            # of the following request.
            timeparam = urlencode(result["next_timestamp"])
        # Export everything that was collected.
        data2xls(data_list)


