zoukankan      html  css  js  c++  java
  • 爬虫技术 -- 基础学习(五)解决页面编码识别(附c#代码)

      实现从Web网页提取文本之前,首先要识别网页的编码,有时候还需要进一步识别网页所使用的语言。因为同一种编码可能对应多种语言,例如UTF-8编码可能对应英文或中文等语言。


      识别编码整体流程如下:
      (1)从WEB服务器返回的content type头信息中提取编码,如果是GB2312的编码要当GBK处理。
      (2)从网页meta标签中识别字符编码,如果与content type中的编码不一致,以meta中声明的编码为准。
      (3)如果仍然无法确定网页所使用的字符集,需要从返回流的二进制格式判断。
      (4)确定网页所使用的语言,往往采用统计的方法来估计网页的语言。

          判断编码的完整过程如下:(c#代码)

     /// <summary>
     /// Finds the charset declared inside an HTML meta tag. Covers both the
     /// short form (meta charset="utf-8") and the http-equiv form
     /// (content="text/html; charset=utf-8"), with or without quotes.
     /// Group 1 holds the bare charset name.
     /// </summary>
     private static readonly Regex MetaCharsetRegex = new Regex(
         @"<meta[^>]*?charset\s*=\s*[""']?\s*([^\s""'/>;]+)",
         RegexOptions.IgnoreCase);

     /// <summary>
     /// Downloads the page at <paramref name="url"/> and returns its source text,
     /// decoded with the best charset that can be determined.
     /// The charset from the HTTP response header is tried first; if the page
     /// itself declares a different charset in a meta tag, the page's
     /// declaration wins and the bytes are decoded again.
     /// </summary>
     /// <param name="url">Address of the page to download.</param>
     /// <param name="encode">
     /// Receives the encoding that was finally used to decode the returned text.
     /// The value passed in is not read.
     /// </param>
     /// <returns>The decoded page source.</returns>
     public static string GetDataFromUrl(string url, ref Encoding encode)
     {
         HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

         // HTTP request settings.
         request.AllowAutoRedirect = true;
         request.AllowWriteStreamBuffering = true;
         request.Referer = "";
         request.Timeout = 10 * 1000;
         request.UserAgent = "";

         byte[] raw;
         string characterSet;

         // The using blocks release the connection even when reading fails.
         using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
         {
             characterSet = response.CharacterSet;

             // Keep the raw bytes so the page can be decoded a second time
             // if the meta tag disagrees with the HTTP header.
             using (Stream receiveStream = response.GetResponseStream())
             using (MemoryStream received = new MemoryStream())
             {
                 receiveStream.CopyTo(received);
                 raw = received.ToArray();
             }
         }

         // Step 1: charset from the HTTP response header.
         if (string.IsNullOrEmpty(characterSet))
         {
             encode = Encoding.Default;
         }
         else
         {
             // NOTE(review): ISO-8859-1 is treated as gb2312, presumably because
             // it is what the framework reports when the server sent no explicit
             // charset and the target pages are Chinese - confirm this heuristic
             // suits the sites being crawled.
             if (string.Equals(characterSet, "ISO-8859-1", StringComparison.OrdinalIgnoreCase))
             {
                 characterSet = "gb2312";
             }

             // An unrecognised header charset falls back to the system default
             // instead of aborting the download.
             encode = FindEncoding(characterSet) ?? Encoding.Default;
         }

         string html = DecodeBytes(raw, encode);

         // Step 2: charset declared by the page itself takes precedence.
         Match meta = MetaCharsetRegex.Match(html);
         if (meta.Success)
         {
             Encoding declared = FindEncoding(meta.Groups[1].Value);

             // Re-decode only when the page names a usable encoding that
             // differs from the one already applied.
             if (declared != null && declared.CodePage != encode.CodePage)
             {
                 encode = declared;
                 html = DecodeBytes(raw, encode);
             }
         }

         return html;
     }

     /// <summary>
     /// Looks up an encoding by name, returning null instead of throwing when
     /// the name is missing, blank, or not supported on this platform.
     /// </summary>
     private static Encoding FindEncoding(string name)
     {
         if (name == null)
         {
             return null;
         }

         string trimmed = name.Trim();
         if (trimmed.Length == 0)
         {
             return null;
         }

         try
         {
             return Encoding.GetEncoding(trimmed);
         }
         catch (ArgumentException)
         {
             return null;
         }
     }

     /// <summary>
     /// Decodes a byte buffer to text through a StreamReader, the same way the
     /// page was read before, so a leading byte order mark is still honoured.
     /// </summary>
     private static string DecodeBytes(byte[] raw, Encoding encoding)
     {
         using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding))
         {
             return reader.ReadToEnd();
         }
     }
  • 相关阅读:
    <Learning How to Learn>Week One: Focused versus Diffuse Thinking
    "Principles of Reactive Programming" 之<Actors are Distributed> (2)
    "Principles of Reactive Programming" 之<Actors are Distributed> (1)
    "reactive programming"的概念
    "Principles of Reactive Programming" 之 <Persistent Actor State>学习笔记
    从List[Future[T]]到Future[List[T]]
    用IDEA调试Play工程
    Scala的Pattern Matching Anonymous Functions
    Akka的fault tolerant
    Manifest 与TypeTag
  • 原文地址:https://www.cnblogs.com/lmei/p/3474416.html
Copyright © 2011-2022 走看看