zoukankan      html  css  js  c++  java
  • HtmlAgilityPack中文乱码问题

    1. 打开HtmlAgilityPack.1.4.0.Source工程   
    2. 找到HtmlWeb.cs文件打开修改下面方法中的一小段代码:  
    3.     private HttpStatusCode Get(Uri uri, string method, string path, HtmlDocument doc, IWebProxy proxy,  
    4.                                    ICredentials creds)函数中的下方的代码  
    5.             Encoding respenc = !string.IsNullOrEmpty(resp.ContentEncoding)  
    6.                                   ? Encoding.GetEncoding(resp.ContentEncoding)  
    7.                                   : null;  
    8.             /*修改成下面的即可*/  
    9.             /*王..修改 中文乱码问题*/  
    10.              //Encoding respenc = !string.IsNullOrEmpty(resp.ContentEncoding)  
    11.             //                       ? Encoding.GetEncoding(resp.ContentEncoding)  
    12.             //                       : null;  
    13.             System.Text.Encoding respenc;  
    14.   
    15.             if ((resp.ContentEncoding != null) && (resp.ContentEncoding.Length > 0))  
    16.             {  
    17.                 respenc = System.Text.Encoding.GetEncoding(resp.ContentEncoding);  
    18.             }  
    19.             else if ((resp.CharacterSet != null) && (resp.CharacterSet.Length > 0))//根据Content-Type中获取的charset  
    20.             {  
    21.                 if (string.Compare(resp.CharacterSet, "ISO-8859-1", true, System.Globalization.CultureInfo.InvariantCulture) == 0)  
    22.                     respenc = System.Text.Encoding.GetEncoding("GB2312");  
    23.                 else  
    24.                     respenc = System.Text.Encoding.GetEncoding(resp.CharacterSet);  
    25.             }  
    26.             else  
    27.             {  
    28.                 respenc = System.Text.Encoding.GetEncoding("GB2312");  
    29.             }  
    /// <summary>
    /// Downloads the HTML source of the given URL and returns it as a string.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <param name="encoding">
    /// Encoding used to decode the response body. When <c>null</c>, the encoding is
    /// detected by <c>DecodeData</c>.
    /// </param>
    /// <returns>
    /// The decoded page text; <see cref="string.Empty"/> when obtaining the response fails;
    /// <c>null</c> when the status code is not 200 OK or any other error occurs.
    /// </returns>
    public static string GetWebHtml(string url, Encoding encoding)
    {
        // WebRequest.Create would throw for these and the outer catch would
        // return null anyway; the guard just makes that outcome explicit.
        if (string.IsNullOrEmpty(url))
        {
            return null;
        }

        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            HttpWebResponse response;

            try
            {
                response = (HttpWebResponse)request.GetResponse();
            }
            catch
            {
                // Deliberate best-effort: callers rely on an empty string (not null,
                // not an exception) when the request itself fails.
                return string.Empty;
            }

            // Dispose the response on every path so the underlying connection is
            // released even when the status is not OK or decoding throws.
            using (response)
            {
                if (response.StatusCode != HttpStatusCode.OK)
                {
                    return null;
                }

                using (Stream body = response.GetResponseStream())
                {
                    if (encoding == null)
                    {
                        // No encoding supplied: let DecodeData work it out.
                        return DecodeData(body, response);
                    }

                    // Caller supplied the encoding explicitly.
                    using (StreamReader reader = new StreamReader(body, encoding))
                    {
                        return reader.ReadToEnd();
                    }
                }
            }
        }
        catch
        {
            // Deliberate best-effort: any other failure (bad URL, read error,
            // unsupported scheme) is reported to the caller as null.
            return null;
        }
    }


            
    /// <summary>
    /// Reads the whole response body and decodes it to text, choosing the encoding
    /// from (in order): the <c>charset=</c> value of the Content-Type response header,
    /// the first <c>charset=</c> value found in the body itself (e.g. a meta tag),
    /// and finally gb2312 as the fallback.
    /// </summary>
    /// <param name="responseStream">Body stream of the response; it is read to the end and closed.</param>
    /// <param name="response">Response whose Content-Type header is inspected.</param>
    /// <returns>The decoded body text.</returns>
    private static string DecodeData(Stream responseStream, HttpWebResponse response)
    {
        string charsetName = ExtractCharsetName(response.Headers["content-type"]);

        // Buffer the full body so it can be scanned once for a charset
        // declaration and then decoded with the final encoding.
        byte[] bodyBytes;
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[0x400];
            int read;
            while ((read = responseStream.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            bodyBytes = buffer.ToArray();
        }
        responseStream.Close();

        if (charsetName == null)
        {
            // The header gave no charset: read the body as ASCII (the charset
            // declaration itself is ASCII-compatible) and look for one there.
            using (StreamReader sniffer = new StreamReader(new MemoryStream(bodyBytes), Encoding.ASCII))
            {
                charsetName = ExtractCharsetName(sniffer.ReadToEnd());
            }
        }

        using (StreamReader reader = new StreamReader(new MemoryStream(bodyBytes), ResolveEncoding(charsetName)))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Returns the value that follows the first <c>charset=</c> in <paramref name="text"/>,
    /// without surrounding quotes or trailing parameters, or <c>null</c> when none is present.
    /// Handles forms such as <c>charset=utf-8</c>, <c>charset="utf-8"</c> and
    /// <c>charset=utf-8; boundary=x</c>.
    /// </summary>
    private static string ExtractCharsetName(string text)
    {
        const string Marker = "charset=";

        if (string.IsNullOrEmpty(text))
        {
            return null;
        }

        int markerIndex = text.IndexOf(Marker, StringComparison.OrdinalIgnoreCase);
        if (markerIndex == -1)
        {
            return null;
        }

        // Skip an optional opening quote and any whitespace before the name.
        int start = markerIndex + Marker.Length;
        while (start < text.Length
               && (text[start] == '"' || text[start] == '\'' || char.IsWhiteSpace(text[start])))
        {
            start++;
        }

        // The name runs until the first character that cannot be part of a
        // charset label (closing quote, ';', '>', '/', whitespace, ...).
        int end = start;
        while (end < text.Length
               && (char.IsLetterOrDigit(text[end]) || text[end] == '-' || text[end] == '_'
                   || text[end] == '.' || text[end] == ':'))
        {
            end++;
        }

        return end > start ? text.Substring(start, end - start) : null;
    }

    /// <summary>
    /// Maps a charset name to an <see cref="Encoding"/>; a missing or unrecognized
    /// name yields gb2312, and "GBK" (any casing) is treated as "GB2312".
    /// </summary>
    private static Encoding ResolveEncoding(string charsetName)
    {
        if (charsetName == null)
        {
            return Encoding.GetEncoding("gb2312");
        }

        try
        {
            if (string.Equals(charsetName, "GBK", StringComparison.OrdinalIgnoreCase))
            {
                charsetName = "GB2312";
            }
            return Encoding.GetEncoding(charsetName);
        }
        catch (ArgumentException)
        {
            // Unknown charset label: fall back to the default.
            return Encoding.GetEncoding("gb2312");
        }
        catch (NotSupportedException)
        {
            // Known label but unsupported on this platform: fall back to the default.
            return Encoding.GetEncoding("gb2312");
        }
    }
     string Html = XINLG.Labs.Utils.NetUtil.GetWebHtml("http://www.cnblogs.com/pick/"null);
                HtmlDocument doc 
    = new HtmlDocument();
                doc.LoadHtml(Html); 
  • 相关阅读:
    linux权限补充:rwt rwT rws rwS 特殊权限
    关于Linux操作系统下文件特殊权限的解释
    Java学习笔记——Java程序运行超时后退出或进行其他操作的实现
    Java实现 蓝桥杯 算法提高 判断名次
    Java实现 蓝桥杯 算法提高 判断名次
    Java实现 蓝桥杯 算法提高 日期计算
    Java实现 蓝桥杯 算法提高 日期计算
    Java实现 蓝桥杯 算法提高 概率计算
    Java实现 蓝桥杯 算法提高 概率计算
    Java实现 蓝桥杯 算法提高 复数四则运算
  • 原文地址:https://www.cnblogs.com/jes_shaw/p/2247632.html
Copyright © 2011-2022 走看看