zoukankan      html  css  js  c++  java
  • Node.js 访问网站并操作 XPath

    var xpath = require('xpath'); //引用xpath包
    var dom = require('xmldom-silent').DOMParser;//引用xmldom包
    var request=require('request');
    var fs=require('fs');
    var urlencode = require('urlencode');//引用url解码和编码包
    // Request headers: present the client as a mobile (iPhone) browser via the User-Agent string.
    var headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A403 Safari/8536.25'
    };

    // Fetch the search page and hand its markup to findXpath().
    request(
        {
            url: "https://www.google.co.jp/search?hl=ja&newwindow=1&site=&source=hp&q=hotel&oq=",
            headers: headers
        },
        function (error, response, body) {
            // On a transport failure there is no body to parse: report it and stop
            // instead of feeding undefined to the parser and printing "ok".
            if (error) {
                console.error("request failed:", error);
                return;
            }
            findXpath(body);
            //fileWrite(body);  // uncomment to also dump the fetched page to disk
            console.log("ok");
        }
    );
    
    /**
     * Parse page markup and print, for every ad found, its display URL (cite),
     * headline (h3), decoded landing-page URL and descriptive text.
     *
     * The four XPath result lists are walked in lockstep, driven by the cite list;
     * a list that turns out shorter contributes an empty string instead of throwing.
     *
     * @param {string} xml - markup of the fetched page; an empty/missing value is
     *                       reported on stderr and nothing is parsed.
     */
    function findXpath(xml){
        // Value of nodes[index], or '' when that list has no entry at this position.
        function textAt(nodes, index) {
            var node = nodes[index];
            return node ? node.nodeValue : '';
        }

        // Target carried by the trailing "adurl=http..." parameter of an ad href.
        // Falls back to the href itself when no such parameter is present.
        // (\S, not the literal class [S], so the URL characters actually match.)
        function extractAdUrl(href) {
            var found = /adurl=(http\S*)$/.exec(href);
            return found ? found[1] : href;
        }

        if (!xml) {
            console.error("findXpath: nothing to parse");
            return;
        }

        var doc = new dom().parseFromString(xml);
        var XPATH_CITE = "//div[@id='mbEnd']//ol/li//cite/text()|//div[@id='tads']//ol/li//cite/text()|//div[@id='tadsb']//div[@class='ads-ad']//h3/text()";
        var XPATH_H3 = "//div[@class='ads-ad']//h3//text()";
        var XPATH_ADURL = "//div[@class='ads-ad']/h3/a/@href|//div[@id='tadsb']/ol/li/h3/a/@href";
        var XPATH_INFO = "//div[@id='mbEnd']//ol/li//div[@class='ac ads-creative']//text()|//div[@id='taw']//ol/li//div[contains(@class,'ads-creative')]//text()|//div[@class='ads-ad']//div[@class='ads-creative']//text()";
        var citeNodes = xpath.select(XPATH_CITE, doc);
        var h3Nodes = xpath.select(XPATH_H3, doc);
        var adInfoNodes = xpath.select(XPATH_INFO, doc);
        var adUrlNodes = xpath.select(XPATH_ADURL, doc);

        console.log("---------------------Node--------------Info-----------------------");

        for (var i = 0; i < citeNodes.length; i++) {
            var citeTxt = textAt(citeNodes, i);
            var h3Txt = textAt(h3Nodes, i);
            var adUrlTxt = extractAdUrl(textAt(adUrlNodes, i));
            var adInfoTxt = textAt(adInfoNodes, i);
            // The captured target is percent-encoded; decode it for display.
            var adUrl = urlencode.decode(adUrlTxt);
            console.log(citeTxt);
            console.log(h3Txt);
            console.log(adUrl);
            console.log(adInfoTxt);
        }
    }
    /**
     * Save the given content to the file '233.html' (asynchronously).
     * Logs a confirmation once the write completes; a write error is rethrown
     * from the completion callback.
     *
     * @param {string|Buffer} body - content handed to fs.writeFile.
     */
    function fileWrite(body)
    {
        var target = '233.html';
        var onWritten = function (err) {
            if (!err) {
                console.log('Saved successfully'); // file has been saved
                return;
            }
            throw err;
        };
        fs.writeFile(target, body, onWritten);
    }
    

      

  • 相关阅读:
    C#笔记(Hex转JPG)
    rpm 和 yum 软件管理
    名称空间和作用域
    网络技术管理和进程管理
    RAID磁盘阵列
    CentOS7系统启动流程:
    磁盘lvm管理
    面向对象 异常处理
    自定义函数和调用函数 return返回值
    Python常用模块
  • 原文地址:https://www.cnblogs.com/c-x-a/p/5482187.html
Copyright © 2011-2022 走看看