加入 jsoup 和 htmlunit 的依赖
<dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.2</version> </dependency> <dependency> <groupId>net.sourceforge.htmlunit</groupId> <artifactId>htmlunit</artifactId> <version>2.25</version> </dependency>
代码:
package com.jm.bigdata.util;
import java.io.IOException;
import java.util.logging.Level;
import org.apache.htrace.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
public class ReptileTools {
/**
* 使用htmlunit模拟Chrome并获取全部网页信息
* @param phoneNumber
* @return
*/
public static String searchMobile2(String cookie,String DownloadUrl) {
String title="";
Document doc = null;
try {
//构造一个webClient 模拟Chrome 浏览器
WebClient webClient = new WebClient(BrowserVersion.CHROME);
//屏蔽日志信息
LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");
java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
//支持JavaScript
webClient.getOptions().setJavaScriptEnabled(true);
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setActiveXNative(false);
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
webClient.getOptions().setTimeout(5000);
HtmlPage rootPage = webClient.getPage(DownloadUrl);
//设置一个运行JavaScript的时间
webClient.waitForBackgroundJavaScript(5000);
String html = rootPage.asXml();
doc = Jsoup.parse(html);
System.out.println(doc);
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
return null;
}
return title;
}
}
这样我们就可以得到一个包含运行 JavaScript 之后的完整源网页了