zoukankan      html  css  js  c++  java
  • 网络爬虫速成指南(一)网页下载

    
    
    注解:此处仅仅是介绍一些类库及常规使用,如果要详细了解Http协议推荐看下《Http权威指南》
    
    

    .NET 方向:主要是用到 HttpWebRequest 下载内容。

    JAVA方向:
    主要是用到HttpClient下载内容
    示例代码:
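    相关类库(httpclient-4.1.2 httpcore-4.1.4)。注意:下面的示例代码使用了 CloseableHttpClient、HttpClients、PoolingHttpClientConnectionManager 等 API,需要 httpclient 4.3 及以上版本。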
    示例代码(Java):
    package com.data.crawl.qa.baiduzhidao;

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.UnsupportedEncodingException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.apache.http.Header;
    import org.apache.http.HttpEntity;
    import org.apache.http.HttpStatus;
    import org.apache.http.NameValuePair;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.config.CookieSpecs;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.entity.UrlEncodedFormEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.methods.HttpPost;
    import org.apache.http.client.protocol.HttpClientContext;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    import org.apache.http.message.BasicNameValuePair;

    import org.apache.http.config.Registry;
    import org.apache.http.config.RegistryBuilder;
    import org.apache.http.cookie.CookieSpecProvider;
    import org.apache.http.impl.cookie.BestMatchSpecFactory;
    import org.apache.http.impl.cookie.BrowserCompatSpecFactory;

    /**
     * HttpClient连接池
     *
     * @author wqj
     *
     */
    public class HttpClientPool {
        
        private static Log log = LogFactory.getLog(HttpClientPool.class);
        
        /**
         * 最大HttpClient连接数
         */
        private final int MAX_TOTAL_CONNECTIONS = 10;

        /**
         * HttpClient连接池
         */
        private PoolingHttpClientConnectionManager connectionManager;


        /**
         * cookie 上下文
         */
        protected HttpClientContext context = null;


        /**
         * default constructor
         */
        public HttpClientPool(){
            connectionManager = new PoolingHttpClientConnectionManager();
            /* 连接池最大生成连接数200 */
            connectionManager.setMaxTotal(MAX_TOTAL_CONNECTIONS);
            /* 默认设置route最大连接数为20 */
            connectionManager.setDefaultMaxPerRoute(10);
         // 实例化cookie
            context = HttpClientContext.create();
            Registry<CookieSpecProvider> registry = RegistryBuilder.<CookieSpecProvider> create()
                    .register(CookieSpecs.BEST_MATCH, new BestMatchSpecFactory())
                    .register(CookieSpecs.BROWSER_COMPATIBILITY, new BrowserCompatSpecFactory()).build();
            context.setCookieSpecRegistry(registry);
        }

        /**
         * 从线程池实例化HttpClient
         *
         * @return
         */
        private CloseableHttpClient getHttpClient() {
            int socketTimeOut = 120000;
            int connectionTimeOut = 60000;

            RequestConfig config = RequestConfig.custom().setSocketTimeout(socketTimeOut)
                    .setConnectTimeout(connectionTimeOut).setCookieSpec(CookieSpecs.BEST_MATCH).build();
            return HttpClients.custom().setDefaultRequestConfig(config).setConnectionManager(connectionManager).build();
        }

        /**
         * Post方式
         */
        public String Post(String uri, Map<String, String> params) {
            CloseableHttpClient httpclient = getHttpClient();
            HttpPost httpost = new HttpPost(uri);
            List<NameValuePair> post_data = new ArrayList<NameValuePair>();

            Set<String> keySet = params.keySet();
            for (String key : keySet) {
                post_data.add(new BasicNameValuePair(key, params.get(key)));
            }

            CloseableHttpResponse response = null;

            try {
                httpost.setEntity(new UrlEncodedFormEntity(post_data, "UTF-8"));
                response = httpclient.execute(httpost, context);
                
                //默认编码
                String charset = "utf-8";            
                HttpEntity entity = response.getEntity();

                String html = null;
                if (entity != null) {
                    InputStream in = entity.getContent();

                    /* 侦测编码 */
                    ByteArrayOutputStream swapStream = new ByteArrayOutputStream();
                    byte[] buff = new byte[1024];
                    int rc = 0;
                    while ((rc = in.read(buff, 0, 1024)) > 0) {
                        swapStream.write(buff, 0, rc);
                    }
                    byte[] data = swapStream.toByteArray();

                    String charset_1 = Icu4jDetector.getEncode(data);
                    charset = charset_1 == null ? charset : charset_1;

                    html = new String(data, charset);
                    System.out.println(html);
                    in.close();
                }
                return html;
            } catch (UnsupportedEncodingException e) {
                log.error(e.getMessage());
            } catch (ClientProtocolException e) {
                log.error(e.getMessage());
            } catch (IOException e) {
                log.error(e.getMessage());
            }
            return null;
        }

        /**
         * 模拟登陆时,访问首页时使用此方法,此方法不带cookie
         *
         * @param uri 统一资源定位符
         * @return html文档
         */
        public String downHtml(String uri) {
            CloseableHttpClient httpclient = getHttpClient();
            HttpGet httpget = new HttpGet(uri);
            CloseableHttpResponse response = null;

            try {
                response = httpclient.execute(httpget);

                /* 判断访问的状态码 */
                int statusCode = response.getStatusLine().getStatusCode();
                if (statusCode != HttpStatus.SC_OK) {
                    log.info("request failed: " + response.getStatusLine());
                    return null;
                }

                /* 侦测编码 */
                Pattern pattern = Pattern.compile("text/html;[\s]*charset=(.*)");
                Header[] arr = response.getHeaders("Content-Type");
                String charset = "utf-8";
                if (arr != null) {
                    String content = arr[0].getValue().toLowerCase();
                    Matcher m = pattern.matcher(content);
                    if (m.find()) {
                        charset = m.group(1);
                    }
                }

                HttpEntity entity = response.getEntity();
                String html = null;
                if (entity != null) {
                    InputStream in = entity.getContent();

                    /* 侦测编码 */
                    ByteArrayOutputStream swapStream = new ByteArrayOutputStream();
                    byte[] buff = new byte[1024];
                    int rc = 0;
                    while ((rc = in.read(buff, 0, 1024)) > 0) {
                        swapStream.write(buff, 0, rc);
                    }
                    byte[] data = swapStream.toByteArray();

                    String charset_1 = Icu4jDetector.getEncode(data);
                    charset = charset_1 == null ? charset : charset_1;

                    html = new String(data, charset);
                    in.close();
                }
                return html;

            } catch (ClientProtocolException e) {
               log.info(e.getMessage());
            } catch (IOException e) {
                log.info(e.getMessage());
            }
            return null;
        }

    }

  • 相关阅读:
    new delete的内部实现代码
    子串的替换
    求字符串的长度
    TSQL语句学习(四)
    TSQL语句学习(二)
    杭电acm1036
    杭电acm2032
    杭电acm2045
    杭电acm2072
    杭电acm1029
  • 原文地址:https://www.cnblogs.com/i80386/p/3259085.html
Copyright © 2011-2022 走看看