一、HttpClient介绍
虽然 JDK 的 java.net 包已经提供了访问 HTTP 协议的基本功能,但对于大多数应用程序来说,它不够灵活,功能也不够丰富。HttpClient 最初是 Apache Jakarta Commons 下的子项目(现属于 Apache HttpComponents 项目),用来提供高效的、最新的、功能丰富的支持 HTTP 协议的客户端编程工具包,并且它支持 HTTP 协议最新的版本和建议。
二、使用范例
4.3版本
1 ,通过get方式,请求网页内容。我们首先创建httpclient对象,然后通过httpclient来执行http get方法,httpresponse获得服务端响应的所有内容,httpentity为获取的网页消息体。
// Fetches http://localhost/ with GET and prints the status line and the body line by line.
CloseableHttpClient httpclient = HttpClients.createDefault();
try {
    // Execute the request with the GET method.
    HttpGet httpGet = new HttpGet("http://localhost/");
    // The response object carries everything the server sent back.
    CloseableHttpResponse responseGet = httpclient.execute(httpGet);
    try {
        System.out.println(responseGet.getStatusLine());
        // The entity is the message body only (HTTP headers are not part of it).
        HttpEntity entity = responseGet.getEntity();
        if (entity != null) {
            // Charset declared by the response; getCharset() is null when the
            // Content-Type header names no charset, and InputStreamReader rejects
            // a null Charset, so fall back to UTF-8 in that case.
            ContentType contentType = ContentType.getOrDefault(entity);
            Charset charset = contentType.getCharset();
            if (charset == null) {
                charset = Consts.UTF_8;
            }
            InputStream is = entity.getContent();
            try {
                // Wrap the stream in a buffered reader so the body can be read line by line.
                BufferedReader br = new BufferedReader(new InputStreamReader(is, charset));
                String line;
                while ((line = br.readLine()) != null) {
                    System.out.println(line);
                }
            } finally {
                // Close the content stream even when reading fails.
                is.close();
            }
        }
    } finally {
        responseGet.close();
    }
} finally {
    httpclient.close();
}
2 ,通过post方式提交表单。浏览器可将登录后的会话信息存储到本地,登录之后的每次请求都会自动向服务器发送cookie信息,幸运的是httpclient也可以自动处理cookie信息。
CloseableHttpClient httpclient = HttpClients.createDefault(); // 以post方法发起登录请求 String urlString = "http://localhost/llogin.do"; HttpPost httpPost = new HttpPost(urlString); List<NameValuePair> nvps = new ArrayList<NameValuePair>(); nvps.add(new BasicNameValuePair("username", "admin")); nvps.add(new BasicNameValuePair("password", "admin")); // 添加post参数 httpPost.setEntity(new UrlEncodedFormEntity(nvps)); CloseableHttpResponse response = httpclient.execute(httpPost); try { // 状态302的话,重定向,则无法获取响应消息体 System.out.println(response.getStatusLine()); // 获得服务器响应的消息体(不包括http head) HttpEntity entity = response.getEntity(); if (entity != null) { // 获得响应字符集编码 ContentType contentType = ContentType.getOrDefault(entity); Charset charset = contentType.getCharset(); InputStream is = entity.getContent(); // 将inputstream转化为reader,并使用缓冲读取,还可按行读取内容 BufferedReader br = new BufferedReader( new InputStreamReader(is, charset)); String line = null; while ((line = br.readLine()) != null) { System.out.println(line); } is.close(); } } finally { response.close(); }
3 ,重定向。httpclient默认可自动处理重定向请求,但是post方式需另外设置。
// Submits a login form with POST, lets the client follow redirects, and prints the
// final location together with the response body.
String urlString = "http://localhost/llogin.do";
// LaxRedirectStrategy is installed so that redirects answering a POST are followed too.
LaxRedirectStrategy redirectStrategy = new LaxRedirectStrategy();
CloseableHttpClient httpclient = HttpClients.custom()
        .setRedirectStrategy(redirectStrategy)
        .build();
HttpClientContext context = HttpClientContext.create();
try {
    // Send the login request with the POST method.
    HttpPost httpPost = new HttpPost(urlString);
    List<NameValuePair> nvps = new ArrayList<NameValuePair>();
    nvps.add(new BasicNameValuePair("username", "admin"));
    nvps.add(new BasicNameValuePair("password", "admin"));
    // Attach the form fields as the POST body.
    httpPost.setEntity(new UrlEncodedFormEntity(nvps));
    CloseableHttpResponse response = httpclient.execute(httpPost, context);
    try {
        System.out.println(response.getStatusLine());
        // The entity is the message body only (HTTP headers are not part of it).
        HttpEntity entity = response.getEntity();
        // Print the address that was finally reached.
        HttpHost targetHost = context.getTargetHost();
        System.out.println(targetHost);
        List<URI> redirecLocations = context.getRedirectLocations();
        URI location = URIUtils.resolve(httpPost.getURI(), targetHost, redirecLocations);
        System.out.println("Final HTTP location: " + location.toASCIIString());
        if (entity != null) {
            // getCharset() is null when the Content-Type header names no charset;
            // InputStreamReader rejects a null Charset, so fall back to UTF-8.
            ContentType contentType = ContentType.getOrDefault(entity);
            Charset charset = contentType.getCharset();
            if (charset == null) {
                charset = Consts.UTF_8;
            }
            InputStream is = entity.getContent();
            try {
                // Wrap the stream in a buffered reader so the body can be read line by line.
                BufferedReader br = new BufferedReader(new InputStreamReader(is, charset));
                String line;
                while ((line = br.readLine()) != null) {
                    System.out.println(line);
                }
            } finally {
                // Close the content stream even when reading fails.
                is.close();
            }
        }
    } finally {
        response.close();
    }
} finally {
    httpclient.close();
}
三、 网页抓取(web蜘蛛)实例 抓取腾讯新闻头条
import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.SocketTimeoutException; import java.util.ArrayList; import java.util.Collection; import java.util.zip.GZIPInputStream; import javax.net.ssl.SSLHandshakeException; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.ParseException; import org.apache.http.client.methods.HttpGet; import org.apache.http.conn.params.ConnRoutePNames; import org.apache.http.entity.ContentType; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.params.HttpParams; public class httpTest { private static DefaultHttpClient httpClient; static { httpClient = new DefaultHttpClient(); } public String getHtml(String url) { HttpHost proxyHost = new HttpHost("211.142.236.137", 8080);// 代理 String html = getHtml(url, proxyHost); int count = 0; while ("".equals(html) || html == null) { proxyHost = new HttpHost("211.142.236.137", 80);// 更换代理 html = getHtml(url, proxyHost); count++; if (count > 3) { System.out.println("抓取失败"); return null; } } return html; } public String getHtml(String url, HttpHost proxyHost) { String html = ""; HttpGet get = new HttpGet(url); get.addHeader("Accept", "text/html, application/xhtml+xml, */*"); get.addHeader("Accept-Language", "zh-CN,en-US;q=0.5"); get.addHeader("User-Agent", "Mozilla/27.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"); get.addHeader("Accept-Encoding", "gzip, deflate, sdch"); get.addHeader("Connection", "Keep-alive"); // HttpParams setParameter = get.getParams().setParameter( // ConnRoutePNames.DEFAULT_PROXY, proxyHost);// 设置代理 HttpResponse httpResponse; HttpEntity httpEntity; try { httpResponse = httpClient.execute(get); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (200 != statusCode) return html; httpEntity = 
httpResponse.getEntity(); if (httpEntity != null) html = readHtmlContentFromEntity(httpEntity); } catch (Exception e) { e.printStackTrace(); } finally { if (get != null) get.releaseConnection(); } return html; } private String readHtmlContentFromEntity(HttpEntity httpEntity) throws ParseException, IOException { String html = ""; Header header = httpEntity.getContentEncoding(); InputStream in = httpEntity.getContent(); if (header != null && "gzip".equals(header.getValue())) { html = unZip(in, ContentType.getOrDefault(httpEntity).getCharset() .toString()); } else { html = readInStreamToString(in, ContentType .getOrDefault(httpEntity).getCharset().toString()); } if (in != null) { in.close(); } return html; } private String unZip(InputStream in, String charSet) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); GZIPInputStream gis = null; try { gis = new GZIPInputStream(in); byte[] _byte = new byte[1024]; int len = 0; while ((len = gis.read(_byte)) != -1) { baos.write(_byte, 0, len); } String unzipString = new String(baos.toByteArray(), charSet); return unzipString; } finally { if (gis != null) { gis.close(); } if (baos != null) { baos.close(); } } } private String readInStreamToString(InputStream in, String charSet) throws IOException { StringBuilder str = new StringBuilder(); String line; BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(in, charSet)); while ((line = bufferedReader.readLine()) != null) { str.append(line); str.append(" "); } if (bufferedReader != null) { bufferedReader.close(); } return str.toString(); } }
主函数
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Entry point that downloads http://www.qq.com through {@code httpTest} and prints the
 * text and href of every anchor found in list items under {@code div[class=ft fl]}.
 */
public class GetNews {

    public static void main(String[] argu) {
        String page = new httpTest().getHtml("http://www.qq.com");
        // Nothing to parse when the download failed or came back empty.
        if (page == null || page.equals("")) {
            return;
        }

        Document parsed = Jsoup.parse(page);
        Elements anchors = parsed.select("div[class=ft fl]").select("li").select("a");
        for (Element anchor : anchors) {
            String title = anchor.text();
            String target = anchor.attr("href");
            System.out.println(title + " 链接" + target);
        }
    }
}
实例见: http://www.cnblogs.com/updateofsimon/category/550506.html .
四、 文件上传
文件上传可以使用两种方式实现,一种是PostMethod方式,一种是HttpPost方式。两者的处理大同小异:PostMethod是使用FilePart将文件流包装起来,HttpPost是使用FileBody将文件流包装起来。在传递文件流给服务端的时候,都可以同时传递其他的参数。
客户端处理
HttpPost方式
这种方式与PostMethod方式类似,只不过文件由FilePart变成了FileBody,PostMethod方式中的Part数组在这里对应HttpEntity。此处的HttpPost是org.apache.http.client.methods包下的类。
1 public void upload(String localFile){
2 CloseableHttpClient httpClient = null;
3 CloseableHttpResponse response = null;
4 try {
5 httpClient = HttpClients.createDefault();
6
7 // 把一个普通参数和文件上传给下面这个地址 是一个servlet
8 HttpPost httpPost = new HttpPost(URL_STR);
9
10 // 把文件转换成流对象FileBody
11 FileBody bin = new FileBody(new File(localFile));
12
13 StringBody userName = new StringBody("Scott", ContentType.create(
14 "text/plain", Consts.UTF_8));
15 StringBody password = new StringBody("123456", ContentType.create(
16 "text/plain", Consts.UTF_8));
17
18 HttpEntity reqEntity = MultipartEntityBuilder.create()
19 // 相当于<input type="file" name="file"/>
20 .addPart("file", bin)
21
22 // 相当于<input type="text" name="userName" value=userName>
23 .addPart("userName", userName)
24 .addPart("pass", password)
25 .build();
26
27 httpPost.setEntity(reqEntity);
28
29 // 发起请求 并返回请求的响应
30 response = httpClient.execute(httpPost);
31
32 System.out.println("The response value of token:" + response.getFirstHeader("token"));
33
34 // 获取响应对象
35 HttpEntity resEntity = response.getEntity();
36 if (resEntity != null) {
37 // 打印响应长度
38 System.out.println("Response content length: " + resEntity.getContentLength());
39 // 打印响应内容
40 System.out.println(EntityUtils.toString(resEntity, Charset.forName("UTF-8")));
41 }
42
43 // 销毁
44 EntityUtils.consume(resEntity);
45 }catch (Exception e){
46 e.printStackTrace();
47 }finally {
48 try {
49 if(response != null){
50 response.close();
51 }
52 } catch (IOException e) {
53 e.printStackTrace();
54 }
55
56 try {
57 if(httpClient != null){
58 httpClient.close();
59 }
60 } catch (IOException e) {
61 e.printStackTrace();
62 }
63 }
64 }
服务端处理
无论客户端是哪种上传方式,服务端的处理都是一样的。在通过HttpServletRequest获得参数之后,把得到的Item进行分类,分为普通的表单和File表单。
通过ServletFileUpload 可以设置上传文件的大小及编码格式等。
总之,服务端的处理是把得到的参数当做HTML表单进行处理的。
1 public void processUpload(HttpServletRequest request, HttpServletResponse response){
2 File uploadFile = new File(uploadPath);
3 if (!uploadFile.exists()) {
4 uploadFile.mkdirs();
5 }
6
7 System.out.println("Come on, baby .......");
8
9 request.setCharacterEncoding("utf-8");
10 response.setCharacterEncoding("utf-8");
11
12 //检测是不是存在上传文件
13 boolean isMultipart = ServletFileUpload.isMultipartContent(request);
14
15 if(isMultipart){
16 DiskFileItemFactory factory = new DiskFileItemFactory();
17
18 //指定在内存中缓存数据大小,单位为byte,这里设为1Mb
19 factory.setSizeThreshold(1024*1024);
20
21 //设置一旦文件大小超过getSizeThreshold()的值时数据存放在硬盘的目录
22 factory.setRepository(new File("D:\temp"));
23
24 // Create a new file upload handler
25 ServletFileUpload upload = new ServletFileUpload(factory);
26
27 // 指定单个上传文件的最大尺寸,单位:字节,这里设为50Mb
28 upload.setFileSizeMax(50 * 1024 * 1024);
29
30 //指定一次上传多个文件的总尺寸,单位:字节,这里设为50Mb
31 upload.setSizeMax(50 * 1024 * 1024);
32 upload.setHeaderEncoding("UTF-8");
33
34 List<FileItem> items = null;
35
36 try {
37 // 解析request请求
38 items = upload.parseRequest(request);
39 } catch (FileUploadException e) {
40 e.printStackTrace();
41 }
42
43 if(items!=null){
44 //解析表单项目
45 Iterator<FileItem> iter = items.iterator();
46 while (iter.hasNext()) {
47 FileItem item = iter.next();
48
49 //如果是普通表单属性
50 if (item.isFormField()) {
51 //相当于input的name属性 <input type="text" name="content">
52 String name = item.getFieldName();
53
54 //input的value属性
55 String value = item.getString();
56
57 System.out.println("属性:" + name + " 属性值:" + value);
58 }
59 //如果是上传文件
60 else {
61 //属性名
62 String fieldName = item.getFieldName();
63
64 //上传文件路径
65 String fileName = item.getName();
66 fileName = fileName.substring(fileName.lastIndexOf("/") + 1);// 获得上传文件的文件名
67
68 try {
69 item.write(new File(uploadPath, fileName));
70 } catch (Exception e) {
71 e.printStackTrace();
72 }
73 }
74 }
75 }
76 }
77
78 response.addHeader("token", "hello");
79 }
服务端在处理之后,可以在Header中设置返回给客户端的简单信息。如果返回客户端是一个流的话,流的大小必须提前设置!
// Declare the body size before the stream is written to the response.
// NOTE(review): the cast narrows file.length() (a long) to int, so it overflows for
// files larger than Integer.MAX_VALUE bytes — confirm served files stay below that.
response.setContentLength((int) file.length());
