zoukankan      html  css  js  c++  java
  • HttpClient抓取带有压缩性质的网页

    使用 HttpClient 抓取经过 gzip 压缩的网页(响应头 Content-Encoding 含 gzip)时,需要先对响应体进行解压,再按页面编码解码;如果缺少解压这一步,读到的内容就会是乱码。

    package com.yangbo.examples;
    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.UnsupportedEncodingException;
    import java.sql.Connection;
    import java.sql.PreparedStatement;
    import java.sql.ResultSet;
    import java.util.ArrayList;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    import java.util.zip.GZIPInputStream;
    
    import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
    import org.apache.commons.httpclient.HttpClient;
    import org.apache.commons.httpclient.HttpException;
    import org.apache.commons.httpclient.HttpMethod;
    import org.apache.commons.httpclient.HttpStatus;
    import org.apache.commons.httpclient.methods.GetMethod;
    import org.apache.commons.httpclient.params.HttpMethodParams;
    
    
    public class MobileInformationPconline {
    
        /**
         * 用正则表达式来提取抓取下来的html中的信息
         * @throws HttpException 
         * @throws IOException 
         */
        public String getHtmlContent(String htmlurl, String charset)
                throws IOException {
            StringBuffer sb = new StringBuffer();
            String acceptEncoding = "";
            /* 1.生成 HttpClinet 对象并设置参数 */
            HttpClient httpClient = new HttpClient();
            // 设置 Http 连接超时 5s
            httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(
                    5000);
            GetMethod method = new GetMethod(htmlurl);
            // 设置 get 请求超时 5s
            method.getParams().getDoubleParameter(HttpMethodParams.SO_TIMEOUT, 10000);
            // 设置请求重试处理
            method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,new DefaultHttpMethodRetryHandler());
            int statusCode;
            try {
                statusCode = httpClient.executeMethod(method);
                // 判断访问的状态码
                if (statusCode != HttpStatus.SC_OK) {
                    return sb.toString();
                } else {
                    if (method.getResponseHeader("Content-Encoding") != null)
                        acceptEncoding = method
                                .getResponseHeader("Content-Encoding").getValue();
                    if (acceptEncoding.toLowerCase().indexOf("gzip") > -1) {
                        // 建立gzip解压工作流
                        InputStream is;
                        is = method.getResponseBodyAsStream();
                        GZIPInputStream gzin = new GZIPInputStream(is);
                        InputStreamReader isr = new InputStreamReader(gzin, charset); // 设置读取流的编码格式,自定义编码
                        java.io.BufferedReader br = new java.io.BufferedReader(isr);
                        String tempbf;
                        while ((tempbf = br.readLine()) != null) {
                            sb.append(tempbf);
                            sb.append("
    ");
                        }
                        isr.close();
                        gzin.close();
                        //System.out.println(sb);
                    } else {
                        InputStreamReader isr;
                        isr = new InputStreamReader(
                                method.getResponseBodyAsStream(), charset);
                        java.io.BufferedReader br = new java.io.BufferedReader(isr);
                        String tempbf;
                        while ((tempbf = br.readLine()) != null) {
                            sb.append(tempbf);
                            sb.append("
    ");
                        }
                        isr.close();
                    }
                }
            } catch (HttpException e1) {
                e1.printStackTrace();
            } catch (IOException e1) {
                e1.printStackTrace();
            }
            method.abort();
            method.releaseConnection();
            return sb.toString();
        }
        
        public String getHtml(String url){
            HttpClient httpClient=new HttpClient();       
            HttpMethod get=new GetMethod(url);
            String html=""; 
            try {
                        httpClient.executeMethod(get);
                        BufferedReader reader=new BufferedReader(new InputStreamReader(get.getResponseBodyAsStream(),"GB2312"));
                        String tmp=null;
                                  
                        while((tmp=reader.readLine())!=null){
                            html+=tmp+"
    ";
            }
            } catch (HttpException e) {
                        e.printStackTrace();
            } catch (IOException e) {
                        e.printStackTrace();
            }finally{
                        get.releaseConnection();
            }
    
             
            return html;
        }
        
        public ArrayList<String> getMobileUrl(String html){
            Pattern p = null;    //正则表达式
            Matcher m = null;    //操作的字符串
            p = Pattern.compile("<h3><a href=.*</a></h3>");
            m = p.matcher(html);
            ArrayList<String> mobileUrl = new ArrayList<String>();
            String mobileUrlString = null;
            while(m.find()){
                mobileUrlString = "http://detail.zol.com.cn"+m.group().split("href="")[1].split(""")[0];
                mobileUrl.add(mobileUrlString);
            }
            return mobileUrl;
        }
        
        public void insertMobileInformation(String mobileModel,String mobileBrand,String netType){
            Connection ct =null;
            PreparedStatement ps=null;
            ResultSet rs=null;
            
            try {
                //1、获取数据库链接
                ct=SqlHelper.getConnection();
                String[] parameters = {mobileModel,mobileBrand,netType};
                String sql = "insert into mobile_information(mobileModel,mobileBrand,netType,updateTime) values (?,?,?,now())";            
                SqlHelper.executeUpdate(sql,parameters);
    
            } catch (Exception e) {
                e.printStackTrace();
            }finally{
                if(rs!=null){
                    try {
                        rs.close();
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                    rs=null;
                }
                if(ps!=null){
                    try {
                        ps.close();
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                    ps=null;
                }
                if(ct!=null){
                    try {
                        ct.close();
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                    ct=null;
                }
            }
        }
        
        
        public static void main(String[] args) throws IOException{
            String url = "http://product.pconline.com.cn/mobile/25s1.shtml";
            String u = "http://product.pconline.com.cn/mobile/";
            
            for(int i=0;i<125;i++){
                url = u+i*25+"s1.shtml";
                System.out.println(url);            
                
                try {
                    MobileInformationPconline mobileInformationRegex = new MobileInformationPconline();        
                    String html = mobileInformationRegex.getHtmlContent(url, "gb2312");
                    
                    int mobileCount = html.split("<a class="name" href="").length-1;
                    System.out.println(mobileCount);
        
                    for(int j=1;j<=mobileCount;j++){
                        try {
                            String mobileUrl = html.split("<a class="name" href="")[j].split(""")[0];
                            System.out.println(mobileUrl);
                            String mobileModel = html.split("<a class="name" href="")[j].split("target="_blank">")[1].split("</a>")[0];
                            System.out.println(mobileModel);
                            String netType=null;
                            if(html.split("<a class="name" href="")[j].contains("网络制式")){
                                netType = html.split("<a class="name" href="")[j].split("网络制式:</i>")[1].split("</dd>")[0];
                            }else if(html.split("<a class="name" href="")[j].contains("手机制式")){
                                netType = html.split("<a class="name" href="")[j].split("手机制式:</i>")[1].split("</dd>")[0];
                            }else{
                                break;
                            }
                            
                            System.out.println(netType);
                                                
                            String mobileHtml = mobileInformationRegex.getHtmlContent(mobileUrl, "gb2312");
                            String mobileBrand = mobileHtml.split("<div class="crumb fl">")[1].split("title="")[4].split("手机大全")[0];
                            System.out.println(mobileBrand);
                            System.out.println(i*25+j);
                            System.out.println();
                            
                            mobileInformationRegex.insertMobileInformation(mobileModel,mobileBrand,netType);
                        } catch (Exception e) {
                            e.printStackTrace();
                        }
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }    
            }        
        }
    
    }
  • 相关阅读:
    Nginx禁止IP,只允许指定域名访问
    预防vsphere勒索病毒,适用于 ESXi 6.x 中的 OpenSLP 安全漏洞 (CVE-2019-5544) 的权宜措施 (76372)
    解决 vCenter root 密码过期无法登陆 User password expired
    停用Veritas Smart Meter-概念与操作
    centos7.0 没有netstat 和 ifconfig命令问题
    MAVEN剪除传递依赖
    Spring+SpringMVC+MyBatis+Maven框架整合
    CentOS7配置MongoDB
    利用mybatis-generator自动生成代码
    idea提交本地项目到git
  • 原文地址:https://www.cnblogs.com/jingyunyb/p/3501455.html
Copyright © 2011-2022 走看看