zoukankan      html  css  js  c++  java
  • 爬虫简单示例,用httpClient4.2.1实现(转载)

    HttpConnectionManager.java

    package spider;
    import java.io.BufferedReader;
    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.util.ArrayList;
    import java.util.Collection;
    import java.util.Date;
    import java.util.zip.GZIPInputStream;

    import javax.net.ssl.SSLHandshakeException;

    import org.apache.commons.lang.StringUtils;
    import org.apache.http.Header;
    import org.apache.http.HttpEntity;
    import org.apache.http.HttpEntityEnclosingRequest;
    import org.apache.http.HttpHost;
    import org.apache.http.HttpRequest;
    import org.apache.http.HttpResponse;
    import org.apache.http.HttpVersion;
    import org.apache.http.NoHttpResponseException;
    import org.apache.http.ParseException;
    import org.apache.http.StatusLine;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.HttpRequestRetryHandler;
    import org.apache.http.client.entity.GzipDecompressingEntity;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.params.ClientPNames;
    import org.apache.http.client.params.CookiePolicy;
    import org.apache.http.conn.params.ConnManagerParams;
    import org.apache.http.conn.params.ConnRoutePNames;
    import org.apache.http.conn.routing.HttpRoute;
    import org.apache.http.conn.scheme.PlainSocketFactory;
    import org.apache.http.conn.scheme.Scheme;
    import org.apache.http.conn.scheme.SchemeRegistry;
    import org.apache.http.conn.ssl.SSLSocketFactory;
    import org.apache.http.entity.ContentType;
    import org.apache.http.impl.client.DefaultHttpClient;
    import org.apache.http.impl.conn.PoolingClientConnectionManager;
    import org.apache.http.message.BasicHeader;
    import org.apache.http.params.BasicHttpParams;
    import org.apache.http.params.CoreConnectionPNames;
    import org.apache.http.params.CoreProtocolPNames;
    import org.apache.http.params.HttpParams;
    import org.apache.http.protocol.ExecutionContext;
    import org.apache.http.protocol.HttpContext;
    import org.apache.http.util.EntityUtils;


    /**
     * http连接、抓取管理类
     * 
    @author lidongyang
     * @createtime Oct 18, 2012 1:55:18 PM
     * 
     * @note 基本测试版
     
    */
    public class HttpConnectionManager {
        
        /** 
         * 连接池里的最大连接数
         
    */  
        public static final int MAX_TOTAL_CONNECTIONS = 100;
        
        /** 
         * 每个路由的默认最大连接数
         
    */  
        public static final int MAX_ROUTE_CONNECTIONS = 50;
        
        /** 
         * 连接超时时间
         
    */  
        public static final int CONNECT_TIMEOUT = 50000;
        
        /**
         * 套接字超时时间
         
    */
        public static final int SOCKET_TIMEOUT = 50000;
        
        /**
         * 连接池中 连接请求执行被阻塞的超时时间
         
    */
        public static final long CONN_MANAGER_TIMEOUT = 60000;
        
        /**
         * http连接相关参数
         
    */
        private static HttpParams parentParams;
        
        /**
         * http线程池管理器
         
    */
        private static PoolingClientConnectionManager cm;
        
        /**
         * http客户端
         
    */
        private static DefaultHttpClient httpClient;
        
        /**
         * 默认目标主机
         
    */
        private static final HttpHost DEFAULT_TARGETHOST = new HttpHost("http://www.qq.com", 80);
        
        /**
         * 初始化http连接池,设置参数、http头等等信息
         
    */
        static {
            SchemeRegistry schemeRegistry = new SchemeRegistry();
            schemeRegistry.register(
                     new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
            schemeRegistry.register(
                     new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));

            cm = new PoolingClientConnectionManager(schemeRegistry);

            cm.setMaxTotal(MAX_TOTAL_CONNECTIONS);
            
            cm.setDefaultMaxPerRoute(MAX_ROUTE_CONNECTIONS);

            cm.setMaxPerRoute(new HttpRoute(DEFAULT_TARGETHOST), 20);        //设置对目标主机的最大连接数
            
            parentParams = new BasicHttpParams(); 
            parentParams.setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);

            parentParams.setParameter(ClientPNames.DEFAULT_HOST, DEFAULT_TARGETHOST);    //设置默认targetHost
            
            parentParams.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
            
            parentParams.setParameter(ClientPNames.CONN_MANAGER_TIMEOUT, CONN_MANAGER_TIMEOUT);
            parentParams.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, CONNECT_TIMEOUT);
            parentParams.setParameter(CoreConnectionPNames.SO_TIMEOUT, SOCKET_TIMEOUT);
            
            parentParams.setParameter(ClientPNames.ALLOW_CIRCULAR_REDIRECTS, true);
            parentParams.setParameter(ClientPNames.HANDLE_REDIRECTS, true);
            
            //设置头信息,模拟浏览器
            Collection
     collection = new ArrayList
    ();
            collection.add(new BasicHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)"));
            collection.add(new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));
            collection.add(new BasicHeader("Accept-Language", "zh-cn,zh,en-US,en;q=0.5"));
            collection.add(new BasicHeader("Accept-Charset", "ISO-8859-1,utf-8,gbk,gb2312;q=0.7,*;q=0.7"));
            collection.add(new BasicHeader("Accept-Encoding", "gzip, deflate"));
            
            parentParams.setParameter(ClientPNames.DEFAULT_HEADERS, collection);
            //请求重试处理
            HttpRequestRetryHandler httpRequestRetryHandler = new HttpRequestRetryHandler() {
                public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
                    if (executionCount >= 5) {
                        // 如果超过最大重试次数,那么就不要继续了
                        return false;
                    }
                    if (exception instanceof NoHttpResponseException) {
                        // 如果服务器丢掉了连接,那么就重试
                        return true;
                    }
                    if (exception instanceof SSLHandshakeException) {
                        // 不要重试SSL握手异常
                        return false;
                    }
                    HttpRequest request = (HttpRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST);
                    boolean idempotent = !(request instanceof HttpEntityEnclosingRequest);
                    if (idempotent) {
                        // 如果请求被认为是幂等的,那么就重试
                        return true;
                    }
                    return false;
                }
            };
            
            httpClient = new DefaultHttpClient(cm, parentParams);
            
            httpClient.setHttpRequestRetryHandler(httpRequestRetryHandler);
        }
        
        /**
         * 抓取页面代码
         * 
    @param url 目标页面的url
         * 
    @return 页面代码
         
    */
        public String getHtml(String url) {
            HttpHost proxyHost = new HttpHost("211.142.236.137", 8080);//代理
            
            String html = getHtml(url, proxyHost);
            
            int count = 0;
            while(StringUtils.isEmpty(html)){
                proxyHost = new HttpHost("211.142.236.137", 80);//更换代理
                html = getHtml(url, proxyHost);
                count++;
                if(count > 3){
                    System.out.println("抓取失败");
                    break;
                }
            }
            
    System.out.println(html.length());
            return html;
        }
        
        /**
         * 抓取url所指的页面代码
         * 
    @param url 目标页面的url
         * 
    @return 页面代码
         
    */
        public String getHtml(String url, HttpHost proxyHost) {
            String html = "";
            HttpGet httpGet = new HttpGet(url);
            httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);//设置代理
            
            HttpResponse httpResponse;
            HttpEntity httpEntity;
            try {
                httpResponse = httpClient.execute(httpGet);
                
                StatusLine statusLine = httpResponse.getStatusLine();
                int statusCode = statusLine.getStatusCode();
    System.out.println(statusCode);
                if(200 != statusCode) {
                    return html;
                }
                
                httpEntity = httpResponse.getEntity();
                if(httpEntity != null){
                    html = readHtmlContentFromEntity(httpEntity);
                }
            } catch (ClientProtocolException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            } finally {
                if(httpGet != null){
                    httpGet.releaseConnection();
                }
            }
            
            return html;
        }
        
        /**
         * 从response返回的实体中读取页面代码
         * 
    @param httpEntity Http实体
         * 
    @return 页面代码
         * 
    @throws ParseException
         * 
    @throws IOException
         
    */
        private String readHtmlContentFromEntity(HttpEntity httpEntity) throws ParseException, IOException {
            String html = "";
            Header header = httpEntity.getContentEncoding();
            if(httpEntity.getContentLength() < 2147483647L){            //EntityUtils无法处理ContentLength超过2147483647L的Entity
                if(header != null && "gzip".equals(header.getValue())){
                    html = EntityUtils.toString(new GzipDecompressingEntity(httpEntity));
                } else {
                    html = EntityUtils.toString(httpEntity);
                }
            } else {
                InputStream in = httpEntity.getContent();
                if(header != null && "gzip".equals(header.getValue())){
                    html = unZip(in, ContentType.getOrDefault(httpEntity).getCharset().toString());
                } else {
                    html = readInStreamToString(in, ContentType.getOrDefault(httpEntity).getCharset().toString());
                }
                if(in != null){
                    in.close();
                }
            }
            return html;
        }
        
        /**
         * 测试代理是否可用(其实和getHtml(String url, HttpHost proxyHost)的代码差不多,为了从功能上区别,暂时这样)
         * 
    @param httpHost 封装了代理的ip地址和端口
         * 
    @param url 用来测试的页面
         * 
    @return true 可用 false 不可用
         
    */
        public boolean isProxyUsable(HttpHost proxyHost, String url) {
            HttpGet httpGet = new HttpGet(url);
            httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);
            try {
                HttpResponse httpResponse = httpClient.execute(httpGet);
                
                StatusLine statusLine = httpResponse.getStatusLine();
                int statusCode = statusLine.getStatusCode();
    System.out.println(statusCode);
                if(200 != statusCode) {
                    return false;
                }
                HttpEntity httpEntity = httpResponse.getEntity();
                if(httpEntity != null) {
                    String html = readHtmlContentFromEntity(httpEntity);
    System.out.println(html.length());
                    if(StringUtils.isEmpty(html)){
                        return false;
                    }
                } else {
                    return false;
                }
                
            } catch (ClientProtocolException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
                return false;
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
                return false;
            }
            
            return true;
        }
        
        /**
         * 解压服务器返回的gzip流
         * 
    @param in 抓取返回的InputStream流
         * 
    @param charSet 页面内容编码
         * 
    @return 页面内容的String格式
         * 
    @throws IOException
         
    */
        private String unZip(InputStream in, String charSet) throws IOException {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            GZIPInputStream gis = null;
            try {
                gis = new GZIPInputStream(in);
                byte[] _byte = new byte[1024];
                int len = 0;
                while ((len = gis.read(_byte)) != -1) {
                    baos.write(_byte, 0, len);
                }
                String unzipString = new String(baos.toByteArray(), charSet);
                return unzipString;
            } finally {
                if (gis != null) {
                    gis.close();
                }
                if(baos != null){
                    baos.close();
                }
            }
        }
        
        /**
         * 读取InputStream流
         * 
    @param in InputStream流
         * 
    @return 从流中读取的String
         * 
    @throws IOException
         
    */
        private String readInStreamToString(InputStream in, String charSet) throws IOException {
            StringBuilder str = new StringBuilder();
            String line;
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(in, charSet));
            while((line = bufferedReader.readLine()) != null){
                str.append(line);
                str.append("\n");
            }
            if(bufferedReader != null) {
                bufferedReader.close();
            }
            return str.toString();
        }
        
        /**
         * for test
         * 
    @author lidongyang
         * @createtime Oct 18, 2012 2:35:09 PM
         
    */
        public class Test implements Runnable {
            String url;
            int threadNum;
            
            public Test() {
                
            }
            
            public Test(String url, int threadNum) {
                this.url = url;
                this.threadNum = threadNum;
            }
            
            @Override
            public void run() {
                getHtml(url);
            }
        }
        
        
        /**
         * for test
         * 
    @param args
         * 
    @throws InterruptedException 
         
    */
        public static void main(String[] args) throws InterruptedException{
            HttpConnectionManager httpConnectionManager = new HttpConnectionManager();
            Date start = new Date();
            httpConnectionManager.getHtml("http://www.qq.com");
            Date end = new Date();
            System.out.println((end.getTime() - start.getTime())/1000.0 + " 秒");
        }
    }

    GetQqNews.java

     package parser;


    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;

    import spider.HttpConnectionManager;

    /** test
     * 
    @author lidongyang
     * @createtime Oct 23, 2012 11:05:33 AM
     
    */
    public class GetQqNews {
        
        
        public static void main(String[] args){
            HttpConnectionManager httpConnectionManager = new HttpConnectionManager();
            String html = httpConnectionManager.getHtml("http://www.qq.com");
            Document doc = Jsoup.parse(html);
            Elements newsList = doc.select("[class=ft fl]").select("ul").select("li").select("a");
            for (Element element : newsList) {
                System.out.println(element.attr("href") + "----" + element.text());
            }
        }
    }
  • 相关阅读:
    一键保存网页为PDF
    Redis使用总结之与Memcached异同
    wxWidgets的安装编译、相关配置、问题分析处理
    python抓取网页图片
    bootstrap插件学习-bootstrap.popover.js
    CC.NET模板简化配置
    密码技术应用系列之开篇
    【Cocos2d-X开发学习笔记】第05期:渲染框架之布景层类(CCLayer)的使用
    ImageMagick还是GraphicsMagick?
    RESTClient 控件 从服务器获得数据集 REST
  • 原文地址:https://www.cnblogs.com/chenying99/p/2737924.html
Copyright © 2011-2022 走看看