zoukankan      html  css  js  c++  java
  • nodejs 下载网页及相关资源文件

    功能其实很简单:通过 phantomjs.exe 采集 url 加载的资源,再通过子进程的方式启动 nodejs 下载所有的资源;对于 css 资源,则匹配 css 内容,下载其中 url 引用的资源

    当然功能还是很简单的,在响应式设计和异步加载的情况下,还是有很多资源没有能够下载,需要根据实际情况处理下

     首先当然是下载 nodejs 和 phantomjs

    下面是 phantomjs.exe 执行的 down.js

    // PhantomJS script: opens the URL given on the command line, records the URL
    // of every resource the page receives, then spawns `node downHtml.js` with
    // the collected list so the resources can be saved to disk.
    // Written in ES5 on purpose: it runs inside PhantomJS, not Node.
    var page = require('webpage').create();
    var system = require('system');
    var spawn = require('child_process').spawn;

    if (system.args.length === 1) {
        console.log('Usage: down.js <some URL>');
        phantom.exit(1);
    } else {
        var urls = [];
        page.address = system.args[1];
        page.onResourceReceived = function (res) {
            // Record each resource once, when its response starts.
            if (res.stage === 'start') {
                urls.push(res.url);
            }
        };
        page.open(page.address, function (status) {
            if (status !== 'success') {
                console.log('FAIL to load the address');
                phantom.exit(1);
                return;
            }
            console.log('down resource ' + urls.length + ' urls.');
            // The list is passed as one comma-joined argument; downHtml.js splits
            // it on ',' again, so a URL that itself contains a comma arrives in
            // pieces on the other side.
            var child = spawn('node', ['--harmony', 'downHtml.js', urls.join(',')]);
            child.stdout.on('data', function (data) {
                console.log(data);
            });
            child.stderr.on('data', function (data) {
                console.log(data);
            });
            child.on('exit', function (code) {
                // Propagate the downloader's exit status instead of always
                // reporting success.
                phantom.exit(code || 0);
            });
        });
    }

    下面是对应的node运行的 downHtml.js

    "use strict";
    var fs = require('fs');
    var http = require('http');
    var path = require('path');
    var r_url = require('url');
    
    // Directories already known to exist; lets repeated calls skip the
    // filesystem check.
    var dirCache = {};

    /**
     * Ensure that the directory `pathStr` exists, creating any missing parent
     * directories first, then invoke `callback` with no arguments.
     *
     * The callback runs synchronously when the directory is already cached and
     * asynchronously otherwise. A directory that cannot be created is logged to
     * stderr and not cached; the callback is still invoked so the caller's flow
     * continues (its subsequent write will then fail on its own).
     *
     * @param {string} pathStr - Directory path to create.
     * @param {Function} callback - Called once the attempt has finished.
     */
    function makedir(pathStr, callback) {
        if (dirCache[pathStr] === 1) {
            callback();
            return;
        }
        fs.stat(pathStr, function (statErr) {
            if (!statErr) {
                dirCache[pathStr] = 1;
                callback();
                return;
            }
            var parent = path.dirname(pathStr);
            if (parent === pathStr) {
                // Reached the top of the tree without finding an existing
                // directory; recursing further would never terminate.
                console.error('mkdir fail: ' + pathStr + ' (' + statErr.message + ')');
                callback();
                return;
            }
            makedir(parent, function () {
                fs.mkdir(pathStr, function (mkdirErr) {
                    // EEXIST means a concurrent makedir call created it first.
                    if (mkdirErr && mkdirErr.code !== 'EEXIST') {
                        console.error('mkdir fail: ' + pathStr + ' (' + mkdirErr.message + ')');
                    } else {
                        dirCache[pathStr] = 1;
                    }
                    callback();
                });
            });
        });
    }
    
    // Matches every `url(...)` reference that follows a ':' or ',' in a
    // stylesheet, with or without quotes around the URL.
    var reg = /[:,]\s*url\(\s*(['"]?).*?\1\s*\)/g;
    // Splits one such reference: [1] opening quote (may be empty), [2] the URL
    // itself, [3] closing quote.
    var reg2 = /\(\s*(['"]?)(.*?)(\1)\s*\)/;
    // Absolute asset URLs that have already been requested, so that each one is
    // downloaded only once.
    var isDownMap = {};
    /**
     * Fetch one asset referenced from a stylesheet and save it under
     * <CWD>/<hostname>/<pathname>, creating directories as needed.
     * Failures are logged to stderr so that one bad asset does not abort the
     * whole run.
     *
     * @param {string} assetUrl - Absolute URL of the asset.
     */
    function downCssAsset(assetUrl) {
        var uo = r_url.parse(assetUrl);
        // Only http(s) can be fetched; this skips data: URIs and the like.
        if (uo.protocol !== 'http:' && uo.protocol !== 'https:') {
            return;
        }
        // http.get rejects https: URLs, so pick the client by protocol.
        var client = uo.protocol === 'https:' ? require('https') : http;
        var filepath = CWD + '/' + uo.hostname + uo.pathname;
        makedir(path.dirname(filepath), function () {
            client.get(assetUrl, function (res) {
                var out = fs.createWriteStream(filepath);
                out.on('error', function (err) {
                    console.error('save fail: ' + filepath + ' (' + err.message + ')');
                });
                res.pipe(out);
            }).on('error', function (err) {
                console.error('down fail: ' + assetUrl + ' (' + err.message + ')');
            });
        });
    }

    /**
     * Download the stylesheet at `URL`, find the `url(...)` references in it
     * and download each referenced asset once (tracked in `isDownMap`).
     * Relative references are resolved against the stylesheet's own URL.
     *
     * @param {string} URL - Absolute URL of the CSS file.
     */
    var downImgFromCss = function (URL) {
        var cssProtocol = r_url.parse(URL).protocol;
        if (cssProtocol !== 'http:' && cssProtocol !== 'https:') {
            return;
        }
        var client = cssProtocol === 'https:' ? require('https') : http;
        client.get(URL, function (res) {
            var body = "";
            res.setEncoding('utf8');
            res.on('data', function (chunk) {
                body += chunk;
            });
            res.on('end', function () {
                // String#match returns null (not an empty array) when the
                // stylesheet contains no url(...) references.
                var refs = body.match(reg) || [];
                refs.forEach(function (ref) {
                    var m = ref.match(reg2);
                    if (!m || !m[2]) {
                        return;
                    }
                    var imgUrl = r_url.resolve(URL, m[2]);
                    if (isDownMap[imgUrl]) {
                        return;
                    }
                    isDownMap[imgUrl] = 1;
                    downCssAsset(imgUrl);
                });
            });
        }).on('error', function (err) {
            console.error('down fail: ' + URL + ' (' + err.message + ')');
        });
    };
    
    // Entry point: the single command-line argument is a comma-separated list
    // of resource URLs. Each one is saved under <CWD>/<hostname>/<pathname>.
    var URLS = process.argv[2] ? process.argv[2].split(',') : [];
    var CWD = process.cwd();
    if (URLS.length === 0) {
        console.error('Usage: node downHtml.js <url>[,<url>...]');
        process.exitCode = 1;
    }
    // Download every resource.
    URLS.forEach(function (URL) {
        var uo = r_url.parse(URL);
        // Skip anything that is not fetchable over http(s), e.g. data: URIs or
        // fragments produced by splitting a URL that contained a comma.
        if (uo.protocol !== 'http:' && uo.protocol !== 'https:') {
            return;
        }
        // http.get rejects https: URLs, so pick the client by protocol.
        var client = uo.protocol === 'https:' ? require('https') : http;
        var pathname = uo.pathname || '/';
        var filepath = CWD + '/' + uo.hostname + pathname;
        // A path ending in '/' names a directory; store its document as index.html.
        if (pathname.charAt(pathname.length - 1) === '/') {
            filepath += 'index.html';
        }
        makedir(path.dirname(filepath), function () {
            client.get(URL, function (res) {
                var contentType = res.headers["content-type"];
                if (URL.indexOf('.css') !== -1 || (contentType && contentType.indexOf('text/css') !== -1)) {
                    console.log('down images form css file:' + URL + '.');
                    downImgFromCss(URL);
                }
                var out = fs.createWriteStream(filepath);
                out.on('error', function (err) {
                    console.error('save fail: ' + filepath + ' (' + err.message + ')');
                });
                res.pipe(out);
            }).on('error', function (err) {
                console.error('down fail: ' + URL + ' (' + err.message + ')');
            });
        });
    });

    down.js downHtml.js 放在同一个文件夹下 通过下列 cmd 运行

    D:\phantomjs-2.0.0-windows\bin\phantomjs.exe down.js http://www.youku.com/

  • 相关阅读:
    PHP include寻找文件规则
    go实现聊天系统(三)
    go实现聊天系统(二)
    题解 UVA10298 【Power Strings】
    单源最短路SPFA
    css面试题
    【笔记】随笔记录
    【form】表单提交方式
    【CSS】常用css
    【Appcan】常用随笔
  • 原文地址:https://www.cnblogs.com/legu/p/4473462.html
Copyright © 2011-2022 走看看