实现从Web网页提取文本之前,首先要识别网页的编码,有时候还需要进一步识别网页所使用的语言。因为同一种编码可能对应多种语言,例如UTF-8编码可能对应英文或中文等语言。
识别编码整体流程如下:
(1)从Web服务器返回的Content-Type响应头中提取编码,如果是GB2312编码,要当作GBK处理。
(2)从网页的meta标签中识别字符编码,如果它与Content-Type响应头中的编码不一致,以meta中声明的编码为准。
(3)如果仍然无法确定网页所使用的字符集,需要从返回流的二进制格式判断。
(4)确定网页所使用的语言,往往采用统计的方法来估计网页的语言。
判断编码的完整过程如下(C#代码):
/// <summary>
/// Matches a charset declaration inside a &lt;meta&gt; tag. Covers both
/// <c>&lt;meta charset="utf-8"&gt;</c> and
/// <c>&lt;meta http-equiv="Content-Type" content="text/html; charset=utf-8"&gt;</c>,
/// with double, single, or no quotes. Group 1 is the charset name.
/// </summary>
private static readonly Regex MetaCharsetPattern = new Regex(
    @"<meta\b[^>]*?\bcharset\s*=\s*[""']?\s*([\w.:-]+)",
    RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// The encoding is taken first from the charset of the HTTP response; if the page
/// itself declares a different charset in a &lt;meta&gt; tag, the page's declaration
/// wins and the downloaded bytes are decoded again with that encoding.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="encode">
/// Receives the encoding that was finally used to decode the returned text.
/// The value passed in is not read.
/// </param>
/// <returns>The decoded page source.</returns>
public static string GetDataFromUrl(string url, ref Encoding encode)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

    // Configure the HTTP request.
    request.AllowAutoRedirect = true;
    request.AllowWriteStreamBuffering = true;
    request.Referer = "";
    request.Timeout = 10 * 1000; // milliseconds
    request.UserAgent = "";

    string characterSet;
    byte[] body;

    // The using block guarantees the response is released even if reading fails.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        characterSet = response.CharacterSet;
        body = ReadResponseBytes(response);
    }

    // First guess: the charset announced by the HTTP response.
    Encoding headerEncoding = null;
    if (!string.IsNullOrEmpty(characterSet))
    {
        // ISO-8859-1 is treated as "no useful charset given" and replaced by gb2312.
        if (string.Equals(characterSet, "ISO-8859-1", StringComparison.OrdinalIgnoreCase))
        {
            characterSet = "gb2312";
        }
        headerEncoding = TryResolveEncoding(characterSet);
    }
    encode = headerEncoding ?? Encoding.Default;

    string text = DecodeBytes(body, encode);

    // Second guess: a charset declared in the page's own <meta> tag. If it names a
    // different code page than the one used above, it takes precedence and the
    // buffered bytes are decoded again.
    Match meta = MetaCharsetPattern.Match(text);
    if (meta.Success)
    {
        Encoding declared = TryResolveEncoding(meta.Groups[1].Value);
        if (declared != null && declared.CodePage != encode.CodePage)
        {
            encode = declared;
            text = DecodeBytes(body, encode);
        }
    }

    return text;
}

/// <summary>Reads the whole body of <paramref name="response"/> into a byte array.</summary>
private static byte[] ReadResponseBytes(WebResponse response)
{
    using (Stream source = response.GetResponseStream())
    using (MemoryStream buffered = new MemoryStream())
    {
        byte[] chunk = new byte[8192];
        int read;
        while ((read = source.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffered.Write(chunk, 0, read);
        }
        return buffered.ToArray();
    }
}

/// <summary>Decodes <paramref name="bytes"/> to a string through a StreamReader using <paramref name="encoding"/>.</summary>
private static string DecodeBytes(byte[] bytes, Encoding encoding)
{
    using (StreamReader reader = new StreamReader(new MemoryStream(bytes, false), encoding))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Looks up an encoding by name; returns null instead of throwing when the name is
/// not recognised, so a bogus charset declaration cannot abort the download.
/// </summary>
private static Encoding TryResolveEncoding(string name)
{
    try
    {
        return Encoding.GetEncoding(name.Trim());
    }
    catch (ArgumentException)
    {
        return null;
    }
}