zoukankan      html  css  js  c++  java
  • 爬取今日头条文章

     教程仅供技术研究学习使用,若有侵权,联系本人删除

    以 https://www.toutiao.com/c/user/59672551416/#mid=1566273643580418  为例

    1: 破解as、cp

    使用万能的  alt+F  

    将js代码改写为python代码

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    # Author: zhibo.wang
    # E-mail: gm.zhibo.wang@gmail.com
    # Date  : 20/07/06 11:36:11
    # Desc  :
    
    """
    
    https://s3.pstatp.com/toutiao/resource/ntoutiao_web/static/js/common/lib_6b19209.js
    i = window.byted_acrawler && window.byted_acrawler.sign ? window.byted_acrawler.sign(o) : ""
    
    """
    
    import hashlib
    import math
    import re
    import time
    
    
    def get_as_cp(timestamp=None):
        """Build the ``as`` / ``cp`` query parameters for Toutiao's web API.

        Python port of ``ascp.getHoney`` from
        https://s3.pstatp.com/toutiao/resource/ntoutiao_web/page/profile/index_ae91792.js ::

            function(i) {
            var e = {};
            e.getHoney = function() {
                var i = Math.floor((new Date).getTime() / 1e3),
                e = i.toString(16).toUpperCase(),
                t = md5(i).toString().toUpperCase();
                if (8 != e.length) return {
                    as: "479BB4B7254C150",
                    cp: "7E0AC8874BB0985"
                };
                for (var o = t.slice(0, 5), n = t.slice( - 5), a = "", s = 0; 5 > s; s++) a += o[s] + e[s];
                for (var r = "",
                c = 0; 5 > c; c++) r += e[c + 3] + n[c];
                return {
                    as: "A1" + a + e.slice( - 3),
                    cp: e.slice(0, 3) + r + "E1"
                }
            },
            i.ascp = e
            }

        Args:
            timestamp: Unix time in seconds used to derive the values. When
                omitted, the current time (``time.time()``) is used. Any
                fractional part is discarded, mirroring ``Math.floor``.

        Returns:
            tuple: ``(AS, CP)`` strings. If the upper-case hex form of the
            timestamp is not exactly 8 characters long, the fixed fallback
            pair from the JS source is returned.
        """
        if timestamp is None:
            timestamp = time.time()
        t = int(math.floor(timestamp))
        e = '%X' % t  # upper-case hex without the "0x" prefix
        i = hashlib.md5(str(t).encode('utf-8')).hexdigest().upper()

        if len(e) != 8:
            return '479BB4B7254C150', '7E0AC8874BB0985'

        head = i[:5]
        tail = i[-5:]
        # "as" interleaves the first five md5 characters with e[0:5];
        # "cp" interleaves e[3:8] with the last five md5 characters.
        AS = 'A1' + ''.join(h + x for h, x in zip(head, e)) + e[-3:]
        CP = e[:3] + ''.join(x + d for x, d in zip(e[3:], tail)) + 'E1'
        return AS, CP

    max_behot_time 参数:请求第一页时传 0;请求后续页时,传上一次响应数据中返回的 max_behot_time 值

     

    _signature参数 也是最难处理的

    全局搜索,打断点找到 生成的文件

     

     

     

    https://s3.pstatp.com/toutiao/resource/ntoutiao_web/static/js/common/lib_6b19209.js
    i = window.byted_acrawler && window.byted_acrawler.sign ? window.byted_acrawler.sign(o) : "";

     

    跟着断点一直走发现最终生成的文件是 VM621 也就是下面这张截图

     

    将此文件内容全部拷贝,写入 sign.js 中下方注释标明的位置(sign.js 的其余内容如下)

    // Build a minimal browser-like environment so the copied signing script
    // can run under Node.js.
    const jsdom = require("jsdom");
    const { JSDOM } = jsdom;
    const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
    // Later code accesses `window.byted_acrawler`, so alias `window` to
    // Node's global object.
    window = global;
    
    // Command line: node sign.js <url> <cookies> <user-agent>
    baseurl = process.argv[2]
    cookies = process.argv[3]
    ua = process.argv[4]
    
    var document = dom.window.document;
    // Browser globals copied onto `window` below.
    // NOTE(review): `hash` and `pathname` do not agree with `href` (different
    // mid, and pathname contains the fragment) - left as published; confirm
    // whether the signing script reads them.
    var params = {
        location:{
            hash: "#mid=5954781019",
            host: "www.toutiao.com",
            hostname: "www.toutiao.com",
            href: "https://www.toutiao.com/c/user/59672551416/#mid=1566273643580418",
            origin: "https://www.toutiao.com",
            pathname: "/c/user/59672551416/#mid=1566273643580418",
            port: "",
            protocol: "https:",
            search: "",
        },
        navigator:{
            appCodeName: "Mozilla",
            appName: "Netscape",
            appVersion: "5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
            cookieEnabled: true,
            deviceMemory: 8,
            doNotTrack: null,
            hardwareConcurrency: 4,
            language: "zh-CN",
            languages: ["zh-CN", "zh"],
            maxTouchPoints: 0,
            onLine: true,
            platform: "Win32",
            product: "Gecko",
            productSub: "20030107",
            // Taken from the command line.
            userAgent:  ua,
            vendor: "Google Inc.",
            vendorSub: "",
        },
        "screen":{
            availHeight: 1040,
            availLeft: 0,
            availTop: 0,
            availWidth: 1920,
            colorDepth: 24,
            height: 1080,
            pixelDepth: 24,
            // The key was missing here (a bare `1920,` is a syntax error).
            width: 1920,
        }
    };
    Object.assign(window,params);
    
    
    /**
     * Write one cookie to `document.cookie` with path "/".
     *
     * @param {string} name     Cookie name.
     * @param {string} value    Cookie value; passed through `escape()`.
     * @param {number} [seconds] Lifetime in seconds. When falsy or loosely
     *                           equal to 0, no `expires` attribute is added.
     */
    function setCookie(name, value, seconds) {
        var lifetime = seconds || 0;
        var expiresAttr = "";
        if (lifetime != 0) {
            var expiry = new Date();
            expiry.setTime(expiry.getTime() + lifetime * 1000);
            expiresAttr = "; expires=" + expiry.toGMTString();
        }
        document.cookie = `${name}=${escape(value)}${expiresAttr}; path=/`;
    }
    
    // Example of the cookie string expected as the second CLI argument:
    //cookies = "csrftoken=a6f078a275e9f39b0addfb9df37fd890; tt_webid=6856639657595241992; s_v_web_id=verify_kde4odjw_isOUm41W_VbRS_4WS0_BrtZ_Ch1KLo5pkNV5;tasessionId=ownu4mas91596435834562; ttcid=1de8f696daab43dc8eb818a02408bd6930; tt_scid=P.PuhA.5OslBUeRVIYUAYFS--vw9l9LTWpc4-b4r7prsBwQ2X6extVf1PCjkhCNWc102"
    // Copy every "name=value" pair into the jsdom document with a 30-minute
    // lifetime.
    for (const cookie of cookies.split(";")) {
        const pair = cookie.trim();
        // Split at the FIRST "=" only, so values that contain "=" stay intact.
        const eq = pair.indexOf("=");
        if (eq <= 0) {
            // Skip empty segments (e.g. a trailing ";") and nameless entries.
            continue;
        }
        setCookie(pair.slice(0, eq), pair.slice(eq + 1), 1800);
    }
    window.document = document;
    
    
    // Paste the copied script content here.
    
    
    // Fail with a clear message instead of an opaque TypeError further down.
    if (!window.byted_acrawler) {
        throw new Error("window.byted_acrawler is not defined: paste the copied script above this check");
    }
    window.byted_acrawler.init({
        aid: 24,
        dfp: true,
      })
    
    
    
    
    // Example call with a fixed URL:
    //sign = window.byted_acrawler.sign({url:"https://www.toutiao.com/api/pc/media_hot/?media_id=1566273643580418&user_id=59672551416"});
    // Sign the URL given on the command line and print the result to stdout.
    sign = window.byted_acrawler.sign({url:baseurl});
    console.log(sign);

    请在 sign.js 所在目录安装 jsdom:npm i jsdom(若使用全局安装 npm i -g jsdom,需要设置 NODE_PATH,否则 require("jsdom") 找不到模块)

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    
    import hashlib
    import math
    import os
    import subprocess
    import time

    import requests
    
    def getHoney(timestamp=None):
        """Build the ``as`` / ``cp`` request parameters from a Unix timestamp.

        Args:
            timestamp: Unix time in seconds used to derive the values. When
                omitted, the current time (``time.time()``) is used. Any
                fractional part is discarded.

        Returns:
            dict: ``{'as': ..., 'cp': ...}``. If the upper-case hex form of
            the timestamp is not exactly 8 characters long, a fixed fallback
            pair is returned.
        """
        if timestamp is None:
            timestamp = time.time()
        # int() keeps a trailing ".0" out of the md5 input on interpreters
        # where math.floor returns a float (Python 2).
        seconds = int(math.floor(timestamp))
        e = '%X' % seconds
        t = hashlib.md5(str(seconds).encode('utf-8')).hexdigest().upper()
        if len(e) != 8:
            return {
                'as': "479BB4B7254C150",
                'cp': "7E0AC8874BB0985"
            }
        o = t[:5]
        n = t[-5:]
        # "as" interleaves the first five md5 characters with e[0:5];
        # "cp" interleaves e[3:8] with the last five md5 characters.
        a = ''.join(m + h for m, h in zip(o, e))
        r = ''.join(h + m for h, m in zip(e[3:], n))
        return {
            'as': "A1" + a + e[-3:],
            'cp': e[:3] + r + "E1"
        }
    
    def get_signature(url, cookies, ua):
        """Compute the ``_signature`` query fragment by running ``sign.js`` in Node.

        The arguments are passed to Node as an argv list with no shell involved.
        The URL contains ``&``, the cookie string contains ``;`` and spaces, and
        the user agent contains spaces and parentheses; a shell would split or
        reinterpret all of these if they were interpolated into a command string.

        Args:
            url: URL to sign (``process.argv[2]`` in sign.js).
            cookies: Cookie header string (``process.argv[3]``).
            ua: User-Agent string (``process.argv[4]``).

        Returns:
            str: ``"&_signature=<value>"``, where ``<value>`` is the script's
            stdout with surrounding whitespace removed (``console.log``
            appends a newline).

        Raises:
            OSError: If the ``node`` executable cannot be started.
            subprocess.CalledProcessError: If the script exits with a non-zero
                status.
        """
        output = subprocess.check_output(['node', 'sign.js', url, cookies, ua])
        return "&_signature=" + output.decode('utf-8').strip()
    
    if __name__ == '__main__':
        # Request headers, including the cookie and user agent that are also
        # handed to the signing script.
        headers = {
            'Referer':'https://www.toutiao.com/',
            'authority': 'www.toutiao.com',
            'method': 'GET',
            'path': '/c/user/59672551416/',
            'scheme': 'https',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            # NOTE(review): "br" responses are only decoded by requests when a
            # brotli package is installed - confirm in the target environment.
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'cookie': 'csrftoken=a6f078a275e9f39b0addfb9df37fd890; s_v_web_id=verify_kde4odjw_isOUm41W_VbRS_4WS0_BrtZ_Ch1KLo5pkNV5; ttcid=1de8f696daab43dc8eb818a02408bd6930; SLARDAR_WEB_ID=c7f55d5c-4dba-493d-a126-ce8e36b472bf; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6856656984092460551; tt_webid=6856656984092460551; __tasessionId=61hz1rirw1596442527425; tt_scid=UD3a5jP-6nL7yUaAawB2lLtCdtv430T-TJyynultVAGY6J4cY6KXTiH1QRWAYhb9e1f5',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
        }
        base_url = 'https://www.toutiao.com/toutiao'
        # First page: max_behot_time=0; as/cp are filled in from getHoney().
        param = '/c/user/article/?page_type=1&user_id=59672551416&max_behot_time=0&count=20&as={as}&cp={cp}'.format(**getHoney())
        base_url += param
        # Strip stray whitespace (e.g. a trailing newline printed by the Node
        # script) so it cannot end up in the URL or the 'path' header.
        signature = get_signature(
            base_url,
            headers["cookie"],
            headers["user-agent"]
        ).strip()
        path = param + signature
        headers['path'] = path
        url = base_url + signature
        print(url)
        # requests has no default timeout; without one a stalled connection
        # blocks forever.
        response = requests.get(url=url, headers=headers, timeout=10)
        print(response.text)

    python test.py

     

  • 相关阅读:
    NX二次开发-UFUN设置显示状态抑制显示UF_DISP_set_display
    NX二次开发-使用NXOPEN C++向导模板做二次开发
    ANTV/G6 怎么按条件自定义节点颜色(Graphin)
    js数组去重及数组对象去重
    vue组件老胡机抽奖(转载)
    Kafka第二节
    Kafka第一节
    更改idea的database数据库连接的ddl格式
    不推荐别的了,IDEA 自带的数据库工具就很牛逼!
    MongoDB 按照时间段查询某个物理机的CPU使用率,按照时间倒序排序,取出最新的5条数据
  • 原文地址:https://www.cnblogs.com/dockers/p/13427344.html
Copyright © 2011-2022 走看看