Jsoup.parse 用于解析 HTML 字符串,例如 Jsoup.parse("<html><head><title>Firstparse</title></head>");Jsoup.connect 用于抓取并解析指定 URL 的网页,例如 Jsoup.connect("http://www.baidu.com").get()。
可以用httpclient获取网页,再用Jsoup.parse解析页面
// Read the whole HTTP response body as a string, then hand it to jsoup to build the DOM.
final String html = getMethod.getResponseBodyAsString();
Document doc = Jsoup.parse(html);
法一:httpclient+Jsoup
1 String dataUrl = "http://hi.mop.com/?"; 2 HttpClient httpClient = new HttpClient(); 3 String cookies = "_ml=371386500452711504675;"; 4 GetMethod getMethod = new GetMethod(dataUrl); 5 // 每次访问需授权的网址时需带上前面的 cookie 作为通行证 6 getMethod.setRequestHeader("cookie", cookies); 7 getMethod.setRequestHeader("Referer", "http://passport.mop.com/"); 8 getMethod.setRequestHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"); 9 int statusCodes = httpClient.executeMethod(getMethod); 10 System.out.println(statusCodes); 11 System.out.println("模拟登录成功"); 12 String text = getMethod.getResponseBodyAsString();
法二:Jsoup Connection + Jsoup解析
// Open a jsoup connection to the target page; `url` is supplied by the surrounding code.
Connection conn = Jsoup.connect(url);
// Set the form field that carries the search keyword.
conn.data("txtBill", key);
// Send the request as POST (call get() instead for a GET request) with a 100000 ms timeout.
Document doc = conn.timeout(100000).post();
// Select the elements to process from the returned document.
// NOTE(review): `results` is not declared in this snippet; presumably an `Elements` declared elsewhere.
results = doc.select(TagName);
for (Element result : results) {
    // Handle each matched element here.
}
package Step1; import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; import javax.lang.model.element.Element; import org.apache.commons.httpclient.Cookie; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.NameValuePair; import org.apache.commons.httpclient.cookie.CookiePolicy; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.methods.PostMethod; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Elements; /** * * @ClassName: HttpLogin * @Description: java通过httpclient结合Jsoup * @author zeze * @date 2015年11月10日 下午5:07:33 * */ public class HttpLogin { public static void main(String[] args) { //Url String dataUrl = "http://hi.mop.com/?"; HttpClient httpClient = new HttpClient(); try { // 获得登陆后的 Cookie String cookies = "_ml=371386500452711504675;"; GetMethod getMethod = new GetMethod(dataUrl); // 每次访问需授权的网址时需带上前面的 cookie 作为通行证 getMethod.setRequestHeader("cookie", cookies); getMethod.setRequestHeader("Referer", "http://passport.mop.com/"); getMethod.setRequestHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"); int statusCodes = httpClient.executeMethod(getMethod); System.out.println("状态码:"+statusCodes); //如果返回的头信息没有指定长度或长度大于1M则抛出异常需把 getResponseBodyAsString()换成 getResponseBodyAsStream() //String text = getMethod.getResponseBodyAsString(); InputStream inputStream = getMethod.getResponseBodyAsStream(); BufferedReader br = new BufferedReader(new InputStreamReader(inputStream)); StringBuffer stringBuffer = new StringBuffer(); String str= ""; while((str = br.readLine()) != null){ stringBuffer.append(str ); } //Element result=text; Document doc = Jsoup.parse(stringBuffer.toString()); Elements name=doc.select("[class=tc c068 fs14 yahei mt5 username]"); System.out.println(name.text()); } catch (Exception e) { e.printStackTrace(); } } }