zoukankan      html  css  js  c++  java
  • crawler_基础之_java.net.HttpURLConnection 访问网络资源

    Java 访问网络资源,由底层到封装依次为:socket ==> java.net.HttpURLConnection ==> HttpClient

    这次先阐述 java.net.HttpURLConnection 的方式,好处是不用额外导包,JDK 原生自带。

    HtmlUtil 包含尝试重连(3次) ,编码识别,保存文件到磁盘

    package com.cph.crawler.core.utils;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.UnsupportedEncodingException;
    import java.net.HttpURLConnection;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.net.URLEncoder;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    
    /**
     * 类说明:html有关的操作 <br>
     * 2012-9-22下午08:22:20创建<br>
     * 
     * @author cphmvp
     */
    public final class HtmlUtil {
        public final static Log LOG = LogFactory.getLog(HtmlUtil.class);
        static String defaultEncoding = "utf-8";
        static HttpURLConnection httpURLConnection = null;
        static URL urlModel = null;
        // 链接超时时间
        static int connectTimeout = 100000;
        // 读取响应超时时间
        static int readTimeout = 100000;
    
        /**
         * 下载图片<br>
         * 
         * @param url
         *            图片的下载地址<br>
         * @param savePath
         *            保存路径<br>
         * @throws IOException
         */
        @SuppressWarnings("resource")
        public static void downloadAndSavePictureToDisk(String url, String savePath)
                throws IOException {
            urlModel = new URL(url);
            httpURLConnection = (HttpURLConnection) urlModel.openConnection();
            httpURLConnection.setConnectTimeout(connectTimeout);
            httpURLConnection.setReadTimeout(readTimeout);
            httpURLConnection.setDoOutput(true);
            InputStream is = httpURLConnection.getInputStream();
            BufferedReader rd = new BufferedReader(new InputStreamReader(is));
            FileOutputStream fw = null;
            File f = new File(savePath.substring(0, savePath.lastIndexOf("/")));
    
            if (!f.exists()) {
                f.mkdirs();
            }
            File eixtsFile = new File(savePath);
            if (eixtsFile.exists()) {
                return;
            }
            fw = new FileOutputStream(savePath, true);
            int num = -1;
            while ((num = is.read()) != (-1))// 是否读完所有数据
            {
                fw.write(num);// 将数据写往文件
            }
            rd.close();
            is.close();
            if (httpURLConnection != null) {
                httpURLConnection.disconnect();
            }
    
        }
    
        /**
         * 讲url后面的参数进行编码
         * 
         * @param url
         * @return
         * @throws UnsupportedEncodingException
         */
        private static String encodParamters(String url)
                throws UnsupportedEncodingException {
            String returnStr = new String(url);
            String regex = "=([^&]+)";
            Pattern p = Pattern.compile(regex);
            Matcher m = p.matcher(url);
            while (m.find()) {
                String replaceStr = m.group(1);
                returnStr = returnStr.replaceFirst(replaceStr,
                        URLEncoder.encode(replaceStr, "utf-8"));
            }
            return returnStr;
        }
    
        /**
         * 获取会话的JSESSIONID
         * 
         * @param url
         * @return
         */
        public static String getSession(String url) {
            String sessionId = "";
            try {
                urlModel = new URL(url);
                httpURLConnection = (HttpURLConnection) urlModel.openConnection();
                httpURLConnection.setConnectTimeout(connectTimeout);
                httpURLConnection.setReadTimeout(readTimeout);
                String cookieVal = null;
                String key = null;
                for (int i = 1; (key = httpURLConnection.getHeaderFieldKey(i)) != null; i++) {
                    if (key.equalsIgnoreCase("set-cookie")) {
                        cookieVal = httpURLConnection.getHeaderField(i);
                        cookieVal = cookieVal.substring(0, cookieVal.indexOf(";"));
                        sessionId = sessionId + cookieVal + ";";
                    }
                }
    
            } catch (MalformedURLException e) {
                LOG.error(e);
            } catch (IOException e) {
                LOG.error(e);
            }
            return sessionId;
        }
    
        /**
         * 下载页面</br>
         * 
         * @param page
         *            </br>
         * @return 页面源码
         * @throws IOException
         * @throws UnsupportedEncodingException
         */
        public static StringBuffer downloadHtml(String url,String encoding) {
            StringBuffer sb = new StringBuffer();
            BufferedReader in = null;
            int tryNum = 0;
            while (true) {
                try {
                    if (tryNum > 1) {
                        String ecodingUrl = encodParamters(url);
                        urlModel = new URL(ecodingUrl);
                    } else {
                        urlModel = new URL(url);
                    }
                    httpURLConnection = (HttpURLConnection) urlModel
                            .openConnection();
                    httpURLConnection.setConnectTimeout(connectTimeout);
                    httpURLConnection.setReadTimeout(readTimeout);
                    httpURLConnection
                            .setRequestProperty("User-Agent",
                                    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)");
                    String redirectUrl = httpURLConnection.getURL().toString();
                    if (!redirectUrl.equals(url)) {
                        LOG.info(url + "重定向后为" + redirectUrl);
                    }
                    String charSetHeader = httpURLConnection
                            .getHeaderField("Content-Type");
                    String charSet = null;
                    if (charSetHeader != null) {
                        Pattern p = Pattern.compile("charset=["']?(.*?)['"]");
                        Matcher m = p.matcher(charSetHeader);
                        if (m.find())
                            charSet = m.group(1).trim();
                        if (null == charSet) {
                            charSet = encoding;
                        }
                    }
    
                    charSet = (charSet == null ? encoding : charSet);
                    in = new BufferedReader(new InputStreamReader(
                            httpURLConnection.getInputStream(), charSet));
                    String inputLine;
                    while ((inputLine = in.readLine()) != null) {
                        sb.append(inputLine + "
    ");
                        inputLine = null;
                    }
                    if (in != null)
                        try {
                            in.close();
                        } catch (IOException e) {
                            LOG.error(e);
                        }
                    if (httpURLConnection != null)
                        httpURLConnection.disconnect();
                    break;
                } catch (Exception e) {
                    if (tryNum++ == 3) {
                        LOG.error("download page error [ " + urlModel + " ] ");
                        return null;
                    }
                    LOG.warn(tryNum + "次下载失败", e);
                }
            }
            return sb;
    
        }
        /**
         * 下载页面</br>
         * 
         * @param page
         *            </br>
         * @return 页面源码
         * @throws IOException
         * @throws UnsupportedEncodingException
         */
        public static StringBuffer downloadHtml(String url) {
            StringBuffer sb = new StringBuffer();
            BufferedReader in = null;
            int tryNum = 0;
            while (true) {
                try {
                    if (tryNum > 1) {
                        String ecodingUrl = encodParamters(url);
                        urlModel = new URL(ecodingUrl);
                    } else {
                        urlModel = new URL(url);
                    }
                    httpURLConnection = (HttpURLConnection) urlModel
                            .openConnection();
                    httpURLConnection.setConnectTimeout(connectTimeout);
                    httpURLConnection.setReadTimeout(readTimeout);
                    httpURLConnection
                            .setRequestProperty("User-Agent",
                                    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)");
                    String redirectUrl = httpURLConnection.getURL().toString();
                    if (!redirectUrl.equals(url)) {
                        LOG.info(url + "重定向后为" + redirectUrl);
                    }
                    String charSetHeader = httpURLConnection
                            .getHeaderField("Content-Type");
                    String charSet = null;
                    if (charSetHeader != null) {
                        Pattern p = Pattern.compile("charset=["']?(.*?)['"]");
                        Matcher m = p.matcher(charSetHeader);
                        if (m.find())
                            charSet = m.group(1).trim();
                        if (null == charSet) {
                            charSet = defaultEncoding;
                        }
                    }
    
                    charSet = (charSet == null ? defaultEncoding : charSet);
                    in = new BufferedReader(new InputStreamReader(
                            httpURLConnection.getInputStream(), charSet));
                    String inputLine;
                    while ((inputLine = in.readLine()) != null) {
                        sb.append(inputLine + "
    ");
                        inputLine = null;
                    }
                    if (in != null)
                        try {
                            in.close();
                        } catch (IOException e) {
                            LOG.error(e);
                        }
                    if (httpURLConnection != null)
                        httpURLConnection.disconnect();
                    break;
                } catch (Exception e) {
                    if (tryNum++ == 3) {
                        LOG.error("download page error [ " + urlModel + " ] ");
                        return null;
                    }
                    LOG.warn(tryNum + "次下载失败", e);
                }
            }
            return sb;
    
        }
    
    }

                                 

  • 相关阅读:
    ls命令输出文件的绝对路径
    grep命令用关系或查询多个字符串
    pthread_cond_timedwait
    移位运算溢出:右操作数须小于左操作数的位数
    Source Insight symbol not found
    break和continue能否跳出函数
    Oracle ORA-01033: ORACLE initialization or shutdown in progress 错误解决办法. 重启服务
    git bash中不能显示中文
    docker初探
    C++ STL常见数据结构(容器)分类
  • 原文地址:https://www.cnblogs.com/cphmvp/p/3474000.html
Copyright © 2011-2022 走看看