zoukankan      html  css  js  c++  java
  • Node.js 学习:利用 Node.js 抓取网页信息

    1:引用模块"http" (执行命令node app.js "http://www.baidu.com")

    //app.js
    var http = require('http');
    var url = require('url');

    /**
     * Fetch a page over HTTP and hand its complete body to a callback.
     *
     * @param {string} u - Absolute http:// URL to request.
     * @param {function(string)} cb - Called once with the response body after the response has ended.
     */
    function spider(u, cb) {
        var req = http.get(url.parse(u), function (res) {
            var d = '';

            // Decode in the stream so a multi-byte character split across two
            // chunks is not corrupted by per-chunk string conversion.
            res.setEncoding('utf8');

            res.on('data', function (chunk) {
                d += chunk;
            });

            res.on('end', function () {
                console.log('spider_end && do cb');
                cb(d);
            });
        });

        // Without a listener, a failed request emits an unhandled 'error'
        // event and terminates the process.
        req.on('error', function (err) {
            console.error('spider_error: ' + err.message);
        });
    }

    var u = "";
    if (require.main === module) {
        // Fall back to the empty string when no URL argument was passed.
        u = process.argv[2] || "";
    }

    if (u) {
        spider(u, function (data) {
            // data holds the content of the fetched page.
            console.log(data);
        });
    } else {
        console.error('usage: node app.js "http://www.baidu.com"');
    }

    2:引用模块nodegrass: (执行命令node app.js "http://www.baidu.com")

    //app.js
    var ng = require('nodegrass');

    // Page fetched when no URL is supplied on the command line.
    var url = "http://www.cnblogs.com/xiaochao12345/archive/2014/10/23/4044950.html";

    if (require.main === module) {
        console.log(process.argv);
        // Keep the default when argv[2] is absent instead of overwriting it with undefined.
        url = process.argv[2] || url;
    }

    ng.get(url, function (data) {
        // data holds the content of the fetched page.
        console.log(data);
    }, 'utf8');

    3:引用模块superagent: (执行命令node app.js "http://www.baidu.com")

    //app.js
    var superagent = require("superagent");

    // Page fetched when no URL is supplied on the command line.
    var url = "http://www.cnblogs.com/xiaochao12345/archive/2014/10/23/4044950.html";

    if (require.main === module) {
        console.log(process.argv);
        // Keep the default when argv[2] is absent instead of overwriting it with undefined.
        url = process.argv[2] || url;
    }

    superagent.get(url).end(function (err, res) {
        // Report failures instead of announcing success unconditionally.
        if (err) {
            console.error('fetch failed: ' + err.message);
            return;
        }
        console.log('fetch successful');
        console.log(res);
    });

    4:使用curl模块:(执行命令node app.js "http://www.baidu.com")

    //app.js
    var curl = require("curl");

    // Use the command-line argument as the target when run directly;
    // otherwise the target stays an empty string.
    var u = (require.main === module) ? process.argv[2] : "";

    // Dump everything the curl module passes to its callback.
    curl.get(u, function dumpCallbackArguments(data) {
        console.log(arguments);
    });

    ______________________________________________________________________________________________________________

    Node.js 中处理 DOM 节点的方式(接口与 jQuery 基本一致):

    1:引用cheerio

    var cheerio = require("cheerio");

    // Single-quoted so the double quotes around the id attribute do not end the string.
    var html = '<html><body><div id="div1">text</div></body></html>';

    // Parse the markup; $ is then used with jQuery-style selectors below.
    var $ = cheerio.load(html);

    // Whole document serialised back to HTML.
    console.log( "html" );
    console.log( $.html() );

    // Inner HTML of #div1.
    console.log( "#div1————〉html" );
    console.log( $("#div1").html() );

    // Text content of #div1.
    console.log( "#div1----〉text" );
    console.log( $("#div1").text() );

    2:引用jquery

    var $ = require("jquery");

    // Single-quoted so the double quotes around the id attribute do not end the string.
    var $dom = $('<html><body><div id="div1">text</div></body></html>');

    // Print the text of #div1 found inside the parsed fragment.
    console.log( $dom.find("#div1").text() );

    3:引用jsdom

    var jsdom = require('jsdom');
    var curl = require("curl");

    // Page fetched when no URL is supplied on the command line.
    var u = "https://github.com";

    if (require.main === module) {
        // Keep the default above when argv[2] is absent instead of overwriting it with undefined.
        u = process.argv[2] || u;
    }

    curl.get(u, function (arg0, html) {
        // Nothing to parse if no response object came back.
        // NOTE(review): arg0 is presumably the request error - confirm against the curl module's docs.
        if (!html) {
            console.error('request failed:', arg0);
            return;
        }

        // jsdom effectively opens a page, and scripts can then run inside that page.
        var document = jsdom.jsdom(html.body);

        // Debug aid: list the property names available on the response object.
        for (var a in html) {
            console.log(a);
        }

        // NOTE(review): jsdom.jsdom() / createWindow() look like an old jsdom API - confirm the installed version.
        var window = document.createWindow();
        var script = document.createElement('script');

        // Alternative source: the public CDN copy at http://code.jquery.com/jquery-1.4.2.js
        script.src = "http://127.0.0.1:81/js/jquery.min.js";
        script.onload = function () {
            // jQuery is available on the window once the script has loaded.
            console.log(1);
            console.log(window.jQuery("body").text());
        };

        // Appending the script element is what starts loading it.
        document.head.appendChild(script);
    });

    _________________________________________________________________________________________________________________

    保存文件可以直接使用 Node.js 内置的 fs 模块:

    var fs = require("fs");

    // Invoked when the append finishes: rethrow any failure, otherwise report completion.
    function onAppended(err) {
        if (err) {
            throw err;
        }
        console.log('done');
    }

    // Append the sample text to 'file-name'.
    fs.appendFile('file-name', "text_text_text_text", onAppended);
  • 相关阅读:
    Linux删除tunnel的方法
    rpm 强制更新安装
    普通用户使用kubectl
    网络通信过程中mac地址和ip地址的必要性
    Quartz.net开源作业调度框架使用详解
    C# 开源组件--Word操作组件DocX
    用c#创建支持多语言的WinForm应用程序
    使用JavaScript获取日期加随机数生成单号
    C# winform treeView checkbox全选反选
    DevExpress控件的GridControl控件小结
  • 原文地址:https://www.cnblogs.com/diligenceday/p/4047816.html
Copyright © 2011-2022 走看看