zoukankan      html  css  js  c++  java
  • 批量导出某个简书用户的所有文章列表和文章超链接

    简书改版后,根据文章标题搜索文章的功能就不见了。

    虽然简书提供了批量下载文章的功能,但是下载到本地的文章都是markdown格式的,不包含文章的链接,这不满足我的需求。

    既然我是程序员,没有这个功能我就自己实现一个。

    打开简书首页,发现默认只显示8篇文章;用鼠标滑动到屏幕底部后,会触发一个懒加载事件,到后台读取更多的文章列表。由此可见,服务器端对文章列表采用了分页实现。

    打开Chrome开发者工具,观察网络请求,请求url中99b8712e8850是我简书用户id,page=2,3,4这些是分页代码。

    每页的文章内容以html格式包含在响应结构里:

    我关心的只是文章标题和文章链接,如上图高亮字段所示。

    最开始我写了一个nodejs应用,代码如下:

    var request = require('request');
    var jsdom = require("jsdom");
    var JSDOM = jsdom.JSDOM;
    const PREFIX = "https://www.jianshu.com";
    const PAGE = "https://www.jianshu.com/u/99b8712e8850?order_by=shared_at&page=";
    const MAX = 2;
    
    // Article title -> absolute article URL, filled in by getArticles() as responses arrive.
    var mArticleResult = new Map();
    /* a given article: https://www.jianshu.com/p/963cd23fb092
      value got from API: /p/5c1d0319dc42
    */
    // Set by getArticles() when a response contains a title that is already recorded.
    var lastPageReached = false;

    // One promise per requested page.
    var aHandlers = [];

    // use limited for loop to ease testing
    // Every request is issued here before any response callback can run, so
    // lastPageReached cannot change inside this loop and is not consulted.
    for (var pageNumber = 1; pageNumber <= MAX; pageNumber++) {
      // console.log("current page: " + PAGE + pageNumber);
      var pageOptions = {
            url: PAGE + pageNumber,
            method: "GET",
            headers: {
                "Accept": "text/html"
            }
      };
      aHandlers.push(getArticles(pageOptions, pageNumber));
    }

    console.log("promise handler size: " + aHandlers.length);

    // Print the collected articles once every page request has settled.
    Promise.all(aHandlers).then(function(){
      var articleIndex = 0;
      mArticleResult.forEach(function(value, key){
        console.log("Article[" + articleIndex + "]: " + key + " = " + value);
        articleIndex++;
      });
      console.log("done");
    }).catch(function(error){
      // Do not leave the chain without a rejection handler.
      console.log("error: " + error);
    });
    
    /**
     * Requests one page of the article list and records every article link
     * found on it in the shared mArticleResult map (title -> absolute URL).
     *
     * Only anchors that are direct children of a DIV which is itself a direct
     * child of an LI are considered, and only when their href contains "/p".
     * When a title is already present in the map, lastPageReached is set.
     *
     * @param {Object} pageOptions - options object handed to the request module.
     * @param {number} pageNumber - page index, used as the resolution value.
     * @returns {Promise} never rejects on request failure; resolves with the
     *   error object when the request failed, otherwise with pageNumber.
     */
    function getArticles(pageOptions, pageNumber) {
      return new Promise(function(resolve,reject){
          var requestC = request.defaults({jar: true});

          requestC(pageOptions,function(error,response,body){
            if( error){
              console.log("error: " + error);
              resolve(error);
              // Nothing to parse when the request itself failed.
              return;
            }
            // An HTTP error status (e.g. 429 when throttled) carries no article list.
            if( response && response.statusCode >= 400){
              console.log("error: HTTP " + response.statusCode + " for page " + pageNumber);
              resolve(pageNumber);
              return;
            }
            var document = new JSDOM(body).window.document;
            var content = document.getElementsByTagName("li");

            for( var i =0; i < content.length; i++){
              var children = content[i].childNodes;
              for( var j = 0; j < children.length; j++){
                  var eachChild = children[j];
                  if( eachChild.nodeName != "DIV")
                    continue;
                  var grandChild = eachChild.childNodes;
                  for( var k = 0; k < grandChild.length; k++){
                    var grand = grandChild[k];
                    if( grand.nodeName != "A")
                      continue;
                    var fragment = grand.getAttribute("href");
                    // getAttribute yields null for an anchor without href.
                    if( fragment == null || fragment.indexOf("/p") < 0)
                      continue;
                    console.log("title: " + grand.text);
                    var wholeURL = PREFIX + fragment;
                    console.log("url: " + wholeURL);
                    if( mArticleResult.has(grand.text)){
                      lastPageReached = true;
                      console.log("article size: " + mArticleResult.size);
                    }
                    mArticleResult.set(grand.text, wholeURL);
                  }
              }
            }// end of outer loop
            // Single resolution point: promise reactions only run after this
            // callback returns, so resolving earlier would make no difference.
            resolve(pageNumber);
          });
         });
    }
    

    原理就是使用nodejs的request module,向简书网站同时发起多个请求,每个请求读取一页的简书文章。

    后来发现这种方法在并发请求数大于10个的时候就无法工作,简书网站会拒绝该类请求,返回HTTP 429状态码。

    所以最后我采用了最简单的同步请求实现,使用npm上的第三方模块sync-request,在循环里逐页发起请求。

    var request = require("sync-request");
    var jsdom = require("jsdom");
    var JSDOM = jsdom.JSDOM;
    var textEncoding = require('text-encoding'); 
    var textDecoder = textEncoding.TextDecoder;
    
    const PREFIX = "https://www.jianshu.com";
    const PAGE = "https://www.jianshu.com/u/99b8712e8850?order_by=shared_at&page=";
    const MAX = 100;
    
    // Absolute article URL -> article title, in the order the articles were found.
    var mArticleResult = new Map();
    // Set by handleResponseHTML() once a page yields a URL that is already recorded.
    var lastPageReached = false;
    var pageNumber;
    /* a given article: https://www.jianshu.com/p/963cd23fb092
      value got from API: /p/5c1d0319dc42
    */

    try {
        // use limited for loop to ease testing
        for (var i = 0; i < MAX; i++) {
            if( lastPageReached)
              break;
            pageNumber = i + 1;
            var url = PAGE + pageNumber;
            console.log("current page: " + url);
            var response = request('GET', url);
            // An error status (e.g. 429 when throttled) carries no article list;
            // stop instead of parsing the error page and requesting further pages.
            if (response.statusCode >= 300) {
                throw new Error("HTTP " + response.statusCode + " for " + url);
            }
            var html = new textDecoder("utf-8").decode(response.body);
            handleResponseHTML(html);
        }
    }
    catch (e) {
        // Best effort: report the failure, then still export what was collected.
        console.error("stopped at page " + pageNumber + ": " + e);
    }

    var articleIndex = 0;
    var resultHTML = "<html>";

    const fs = require('fs');

    /*
    <HTML>
    <p>
    <a href="https://www.baidu.com">eee</a>
    </p>

    <p><a>22</a></p>
    <p><a>33</a></p>
    </HTML>
    */

    // Escapes text for safe use in HTML content and double-quoted attributes.
    function escapeHTML(text) {
        return String(text)
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;");
    }

    // One numbered <p><a href="URL">N. title</a></p> line per article.
    var index = 1;
    for (var [key, value] of mArticleResult) {
        var article = "<p><a href=\"" + escapeHTML(key) + "\">" +
        index++ + ". " + escapeHTML(value) + "</a></p>" + "\n";
        resultHTML = resultHTML + article;
        console.log("Article[" + articleIndex++ + "]: " + value + " = " + key);
    }

    resultHTML = resultHTML + "</html>";

    var pwd = process.cwd() + "/jianshu.html";

    // Overwrite rather than append, so re-running does not stack a second
    // <html> document onto the previous output.
    fs.writeFileSync(pwd, resultHTML);

    console.log("done");
    
    
    
    /**
     * Parses one page of article-list HTML and records each article link in
     * the shared mArticleResult map (absolute URL -> title).
     *
     * Only anchors that are direct children of a DIV which is itself a direct
     * child of an LI are considered, and only when their href contains "/p".
     * On the first URL that is already in the map, lastPageReached is set and
     * the function returns without processing the rest of the page.
     *
     * @param {string} html - HTML text of one list page.
     */
    function handleResponseHTML(html) {
        var document = new JSDOM(html).window.document;
        var content = document.getElementsByTagName("li");

        for (var i = 0; i < content.length; i++) {
            var children = content[i].childNodes;
            for (var j = 0; j < children.length; j++) {
                var eachChild = children[j];
                if (eachChild.nodeName != "DIV")
                    continue;
                var grandChild = eachChild.childNodes;
                for (var k = 0; k < grandChild.length; k++) {
                    var grand = grandChild[k];
                    if (grand.nodeName != "A")
                        continue;
                    var fragment = grand.getAttribute("href");
                    // getAttribute yields null for an anchor without href;
                    // skip it instead of throwing on .indexOf.
                    if (fragment == null || fragment.indexOf("/p") < 0)
                        continue;
                    // console.log("title: " + grand.text);
                    var wholeURL = PREFIX + fragment;
                    // console.log("url: " + wholeURL);
                    if (mArticleResult.has(wholeURL)) {
                        // A repeated URL is taken as the sign that paging has wrapped.
                        lastPageReached = true;
                        console.log("article size: " + mArticleResult.size);
                        return;
                    }
                    mArticleResult.set(wholeURL, grand.text);
                }
            }
        }
    }
    

    这个nodejs应用执行后,会在本地生成一个html文件,包含每篇文章的标题和超链接。

    要获取更多Jerry的原创文章,请关注公众号"汪子熙":

  • 相关阅读:
    Swift扩展(Extension)
    Swift构造器(Initializer)与析构器(Deinitializer)
    Swift下标
    Swift方法
    Swift属性
    Swift类与结构体
    Swift闭包(Closure)
    python调用c++/c 共享库,开发板上编译的一些坑!
    python调用c++类方法(2)
    ubuntu 18.04 gcc g++降级4.8版
  • 原文地址:https://www.cnblogs.com/sap-jerry/p/10841325.html
Copyright © 2011-2022 走看看