zoukankan      html  css  js  c++  java
  • crawler_httpurlconnection_自动编码识别

    核心思想:

       1:从响应头(Content-Type)中读取 charset【一旦命中,按它解码响应流的准确率最高】

       2:如果响应头中没有,则打开响应流,从页面源码中读取 charset 声明【折中做法:声明如果存在,一般出现在前 30 行,这里只在前 100 行内查找】

           3:如果还没有,则根据流开头的字节序标记(BOM)识别编码【由前三个字节推断】

       4:如果最终依旧没有命中,则采用默认编码:大陆国标编码 gb2312【走到这一步的概率接近于 0】

    综合以上四步,目前尚未测试到编码识别有问题的站点。

      1 /**
      2      * @declare:下载 自动识别编码
      3      * @param url
      4      * @return
      5      * @author cphmvp
      6      */
      7     public static StringBuffer downloadHtmlAutoCode(String url) {
      8         StringBuffer sb = new StringBuffer();
      9         BufferedReader bufferReader = null;
     10         InputStream inputStream = null;
     11         BufferedInputStream bufferedInputStream = null;
     12         int tryNum = 0;
     13         while (true) {
     14             try {
     15                 if (tryNum > 1) {
     16                     String ecodingUrl = encodParamters(url);
     17                     urlModel = new URL(ecodingUrl);
     18                 } else {
     19                     urlModel = new URL(url);
     20                 }
     21                 httpURLConnection = (HttpURLConnection) urlModel
     22                         .openConnection();
     23                 httpURLConnection.setConnectTimeout(connectTimeout);
     24                 httpURLConnection.setReadTimeout(readTimeout);
     25                 // httpURLConnection.setInstanceFollowRedirects(false);
     26                 // httpURLConnection.setFollowRedirects(true);
     27                 httpURLConnection
     28                         .setRequestProperty("User-Agent",
     29                                 "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)");
     30                 String redirectUrl = httpURLConnection.getURL().toString();
     31                 if (!redirectUrl.equals(url)) {
     32                     LOG.info(url + "重定向后为" + redirectUrl);
     33                 }
     34                 // 得到响应流
     35                 inputStream = httpURLConnection.getInputStream();
     36                 if (null == inputStream)
     37                     continue;
     38                 String charSetHeader = httpURLConnection
     39                         .getHeaderField("Content-Type");
     40                 bufferedInputStream = new BufferedInputStream(inputStream);
     41                 String charSet = null;
     42                 // 第一步先从响应头header判断
     43                 if (charSetHeader != null) {
     44                     Pattern p = Pattern.compile("charset=["']?(.*)['"]?");
     45                     Matcher m = p.matcher(charSetHeader);
     46                     if (m.find()) {
     47                         charSet = m.group(1).trim();
     48                     }
     49                 }
     50                 // System.out.println(bufferedInputStream.available() > 0);
     51                 // System.out.println(bufferedInputStream.markSupported());
     52                 // 第二步 从源码中【meta http-equiv="content-type" 】判断
     53                 // if (null == charSet) {
     54                 // charSet = getEncode(bufferedInputStream);
     55                 // System.out.println("---->charSet: 读流识别出来的编码" + charSet);
     56                 // }
     57 
     58                 // 排除非html格式 只有一两行的状况
     59                 if (null == charSet
     60                         && charSetHeader.toLowerCase().contains("html")) {
     61                     // 缓冲区设置大些, read走的信息小于 这个值,就能reset 回来。
     62                     bufferedInputStream.mark(102400);
     63                     bufferReader = new BufferedReader(new InputStreamReader(
     64                             bufferedInputStream));
     65                     int lineNum = 1;
     66                     String inputLine;
     67                     // reset 在读至流的末尾是无法生效,故限制前100行找,找不到 放弃
     68                     while ((inputLine = bufferReader.readLine()) != null
     69                             && lineNum < 100) {
     70                         if (inputLine.toLowerCase().contains("charset")) {
     71                             charSet = RegexUtils.getString(inputLine,
     72                                     "charset=["']?(.*?)["']", 1);
     73                             LOG.info("自动识别出编码:" + charSet);
     74                             // 第一次匹配到后 ,不再往下判断,减少判断行数,及误判概率
     75                             break;
     76                         }
     77                         lineNum++;
     78                         inputLine = null;
     79                     }
     80                     // 第三步奏 穿插补录步奏
     81                     if (null == charSet) {
     82                         byte[] head = new byte[3];
     83                         bufferedInputStream.read(head);
     84                         if (head[0] == -1 && head[1] == -2)
     85                             charSet = "UTF-16";
     86                         if (head[0] == -2 && head[1] == -1)
     87                             charSet = "Unicode";
     88                         if (head[0] == -17 && head[1] == -69 && head[2] == -65)
     89                             charSet = "UTF-8";
     90                     }
     91 
     92                     // 通道回溯
     93                     bufferedInputStream.reset();
     94                 }
     95 
     96                 // 第四步奏指向默认 utf-8
     97                 charSet = (charSet == null ? defaultEncoding : charSet);
     98                 // 第五步奏按照正确编码解码响应流
     99                 bufferReader = new BufferedReader(new InputStreamReader(
    100                         bufferedInputStream, charSet));
    101                 String inputLine;
    102                 while ((inputLine = bufferReader.readLine()) != null) {
    103                     sb.append(inputLine + "
    ");
    104                     inputLine = null;
    105                 }
    106                 if (bufferReader != null)
    107                     try {
    108                         bufferReader.close();
    109                     } catch (IOException e) {
    110                         LOG.error(e);
    111                     }
    112                 if (httpURLConnection != null)
    113                     httpURLConnection.disconnect();
    114                 break;
    115             } catch (Exception e) {
    116                 if (tryNum++ == 3) {
    117                     LOG.error("download page error [ " + urlModel + " ] ");
    118                     return null;
    119                 }
    120                 LOG.warn(tryNum + "次下载失败");
    121             }
    122         }
    123         return sb;
    124 
    125     }
  • 相关阅读:
    修理牛棚 贪心 USACO
    零件加工 贪心 题解
    花店橱窗 动态规划 题解
    动态规划 摆花 题解
    NOIP2004普及组第3题 FBI树
    实况世界杯4小游戏链接
    poj2761(treap入门)
    最大连续子序列和(分治法)
    任意区间的最长连续递增子序列,最大连续子序列和
    lca转RMQ
  • 原文地址:https://www.cnblogs.com/cphmvp/p/3770876.html
Copyright © 2011-2022 走看看