zoukankan      html  css  js  c++  java
  • java+phantomjs实现动态网页抓取

    1.PhantomJS下载地址:http://phantomjs.org/download.html

    2.java代码

    public     void   getHtml(String url)
    {
    	 HTML="";
        String jsPath = "C:\phantomjs\examples\myjs.js";
        String exePath = "C:\phantomjs\bin\phantomjs.exe";
        System.out.println(jsPath);
        System.out.println(exePath);
        Runtime rt = Runtime.getRuntime();
    	Process p;
    	try {
    		p = rt.exec(exePath + " " + jsPath + " " + url);
    
    	InputStream is = p.getInputStream();
    	BufferedReader br = new BufferedReader(new InputStreamReader(is));
    	StringBuffer sbf = new StringBuffer();
    	String tmp = "";
    	while ((tmp = br.readLine()) != null)
    	{
    		sbf.append(tmp);
    	}
    	HTML=sbf.toString();
     
      is.close();
      br.close();
      sbf=null;
      is=null;
      br=null;
    	} catch (IOException e) {
    	 
    		e.printStackTrace();
    	}
     
    }
    

     3.js代码(保存为 C:\phantomjs\examples\myjs.js,即上面 Java 代码中 jsPath 指向的文件)

       

    // PhantomJS script: opens the URL passed as the first command-line argument
    // and prints the page's HTML to standard output.
    //
    // Usage: phantomjs myjs.js <url>
    //
    // PhantomJS does not support ES2015+, so `var` and function expressions are
    // used here on purpose.
    var page = require('webpage').create();
    var system = require('system');

    // Any resource whose URL contains one of these substrings is not fetched.
    // Used to skip slow third-party requests (e.g. Baidu ads) and speed up loading.
    var blockedUrlParts = ['baidu.com'];

    // Time given to in-page scripts to finish rendering after the load event,
    // before the HTML snapshot is taken.
    var RENDER_WAIT_MS = 6000;

    page.settings.loadImages = false;       // skip images to speed up loading
    page.settings.resourceTimeout = 10000;  // give up on a resource after 10 seconds

    // Only relevant when taking screenshots; harmless otherwise.
    page.viewportSize = {
      width: 1280,
      height: 800
    };

    // Abort every request that matches an entry of blockedUrlParts.
    page.onResourceRequested = function (requestData, request) {
      for (var i = 0; i < blockedUrlParts.length; i += 1) {
        if (requestData.url.indexOf(blockedUrlParts[i]) !== -1) {
          request.abort();
          return;
        }
      }
    };

    if (system.args.length < 2) {
      console.log('Usage: phantomjs myjs.js <url>');
      phantom.exit(1);
    } else {
      page.open(system.args[1], function (status) {
        if (status !== 'success') {
          console.log('FAIL to load the address');
          phantom.exit(1);
          return;
        }

        // Exit only from inside the timer. An unconditional phantom.exit() after
        // this branch would terminate the process before the timer fires, and
        // the snapshot has to be taken AFTER the wait so that dynamically
        // generated content is included.
        setTimeout(function () {
          console.log(page.content);
          phantom.exit();
        }, RENDER_WAIT_MS);
      });
    }

     

  • 相关阅读:
    golang API开发过程的中的自动重启(基于gin框架)
    单位时间的调度问题 —— 贪心
    Qt获取文件夹下文件
    C++ using
    QGridLayout动态添加控件
    数据库查询优化-20条必备sql优化技巧
    Django:类视图的装饰器
    使用同一个模态框进行新增和修改
    Django:使用celery处理异步任务
    jenkins:调用jenkinsAPI
  • 原文地址:https://www.cnblogs.com/xiaoliao/p/10075714.html
Copyright © 2011-2022 走看看