Java 访问网络资源,由底层到封装依次为:socket ==> java.net.HttpURLConnection ==> HttpClient
这次先阐述 java.net.HttpURLConnection 的方式,好处是不用导包,JDK 原生自带。
HtmlUtil 包含尝试重连(3次) ,编码识别,保存文件到磁盘
package com.cph.crawler.core.utils; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.net.URLEncoder; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * 类说明:html有关的操作 <br> * 2012-9-22下午08:22:20创建<br> * * @author cphmvp */ public final class HtmlUtil { public final static Log LOG = LogFactory.getLog(HtmlUtil.class); static String defaultEncoding = "utf-8"; static HttpURLConnection httpURLConnection = null; static URL urlModel = null; // 链接超时时间 static int connectTimeout = 100000; // 读取响应超时时间 static int readTimeout = 100000; /** * 下载图片<br> * * @param url * 图片的下载地址<br> * @param savePath * 保存路径<br> * @throws IOException */ @SuppressWarnings("resource") public static void downloadAndSavePictureToDisk(String url, String savePath) throws IOException { urlModel = new URL(url); httpURLConnection = (HttpURLConnection) urlModel.openConnection(); httpURLConnection.setConnectTimeout(connectTimeout); httpURLConnection.setReadTimeout(readTimeout); httpURLConnection.setDoOutput(true); InputStream is = httpURLConnection.getInputStream(); BufferedReader rd = new BufferedReader(new InputStreamReader(is)); FileOutputStream fw = null; File f = new File(savePath.substring(0, savePath.lastIndexOf("/"))); if (!f.exists()) { f.mkdirs(); } File eixtsFile = new File(savePath); if (eixtsFile.exists()) { return; } fw = new FileOutputStream(savePath, true); int num = -1; while ((num = is.read()) != (-1))// 是否读完所有数据 { fw.write(num);// 将数据写往文件 } rd.close(); is.close(); if (httpURLConnection != null) { httpURLConnection.disconnect(); } } /** * 讲url后面的参数进行编码 * * @param url * @return * @throws UnsupportedEncodingException */ 
private static String encodParamters(String url) throws UnsupportedEncodingException { String returnStr = new String(url); String regex = "=([^&]+)"; Pattern p = Pattern.compile(regex); Matcher m = p.matcher(url); while (m.find()) { String replaceStr = m.group(1); returnStr = returnStr.replaceFirst(replaceStr, URLEncoder.encode(replaceStr, "utf-8")); } return returnStr; } /** * 获取会话的JSESSIONID * * @param url * @return */ public static String getSession(String url) { String sessionId = ""; try { urlModel = new URL(url); httpURLConnection = (HttpURLConnection) urlModel.openConnection(); httpURLConnection.setConnectTimeout(connectTimeout); httpURLConnection.setReadTimeout(readTimeout); String cookieVal = null; String key = null; for (int i = 1; (key = httpURLConnection.getHeaderFieldKey(i)) != null; i++) { if (key.equalsIgnoreCase("set-cookie")) { cookieVal = httpURLConnection.getHeaderField(i); cookieVal = cookieVal.substring(0, cookieVal.indexOf(";")); sessionId = sessionId + cookieVal + ";"; } } } catch (MalformedURLException e) { LOG.error(e); } catch (IOException e) { LOG.error(e); } return sessionId; } /** * 下载页面</br> * * @param page * </br> * @return 页面源码 * @throws IOException * @throws UnsupportedEncodingException */ public static StringBuffer downloadHtml(String url,String encoding) { StringBuffer sb = new StringBuffer(); BufferedReader in = null; int tryNum = 0; while (true) { try { if (tryNum > 1) { String ecodingUrl = encodParamters(url); urlModel = new URL(ecodingUrl); } else { urlModel = new URL(url); } httpURLConnection = (HttpURLConnection) urlModel .openConnection(); httpURLConnection.setConnectTimeout(connectTimeout); httpURLConnection.setReadTimeout(readTimeout); httpURLConnection .setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)"); String redirectUrl = httpURLConnection.getURL().toString(); if (!redirectUrl.equals(url)) { LOG.info(url + "重定向后为" + redirectUrl); } String charSetHeader = 
httpURLConnection .getHeaderField("Content-Type"); String charSet = null; if (charSetHeader != null) { Pattern p = Pattern.compile("charset=["']?(.*?)['"]"); Matcher m = p.matcher(charSetHeader); if (m.find()) charSet = m.group(1).trim(); if (null == charSet) { charSet = encoding; } } charSet = (charSet == null ? encoding : charSet); in = new BufferedReader(new InputStreamReader( httpURLConnection.getInputStream(), charSet)); String inputLine; while ((inputLine = in.readLine()) != null) { sb.append(inputLine + " "); inputLine = null; } if (in != null) try { in.close(); } catch (IOException e) { LOG.error(e); } if (httpURLConnection != null) httpURLConnection.disconnect(); break; } catch (Exception e) { if (tryNum++ == 3) { LOG.error("download page error [ " + urlModel + " ] "); return null; } LOG.warn(tryNum + "次下载失败", e); } } return sb; } /** * 下载页面</br> * * @param page * </br> * @return 页面源码 * @throws IOException * @throws UnsupportedEncodingException */ public static StringBuffer downloadHtml(String url) { StringBuffer sb = new StringBuffer(); BufferedReader in = null; int tryNum = 0; while (true) { try { if (tryNum > 1) { String ecodingUrl = encodParamters(url); urlModel = new URL(ecodingUrl); } else { urlModel = new URL(url); } httpURLConnection = (HttpURLConnection) urlModel .openConnection(); httpURLConnection.setConnectTimeout(connectTimeout); httpURLConnection.setReadTimeout(readTimeout); httpURLConnection .setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)"); String redirectUrl = httpURLConnection.getURL().toString(); if (!redirectUrl.equals(url)) { LOG.info(url + "重定向后为" + redirectUrl); } String charSetHeader = httpURLConnection .getHeaderField("Content-Type"); String charSet = null; if (charSetHeader != null) { Pattern p = Pattern.compile("charset=["']?(.*?)['"]"); Matcher m = p.matcher(charSetHeader); if (m.find()) charSet = m.group(1).trim(); if (null == charSet) { charSet = defaultEncoding; } } charSet = 
(charSet == null ? defaultEncoding : charSet); in = new BufferedReader(new InputStreamReader( httpURLConnection.getInputStream(), charSet)); String inputLine; while ((inputLine = in.readLine()) != null) { sb.append(inputLine + " "); inputLine = null; } if (in != null) try { in.close(); } catch (IOException e) { LOG.error(e); } if (httpURLConnection != null) httpURLConnection.disconnect(); break; } catch (Exception e) { if (tryNum++ == 3) { LOG.error("download page error [ " + urlModel + " ] "); return null; } LOG.warn(tryNum + "次下载失败", e); } } return sb; } }