zoukankan      html  css  js  c++  java
  • 论请求网页时的乱码问题

    乱码问题很烦人,本人始终没有找到很好的解决方案,在一个抓取网页数据的程序中,最后还是使用了WebBrowser.

    开始时使用HttpWebRequest

    /// <summary>
            /// Downloads the page at the given URL and returns its content decoded as text.
            /// </summary>
            /// <remarks>
            /// The body is first decoded with the charset from the HTTP response (UTF-8 when
            /// none is given or the label is unknown). That provisional text is passed to
            /// <c>Pubs.DetectCharset</c>; if it reports a different, known charset the raw
            /// bytes are decoded again with it.
            /// </remarks>
            /// <param name="url">Absolute HTTP or HTTPS URL of the page to download.</param>
            /// <returns>The decoded page source.</returns>
            /// <exception cref="ArgumentException">The URL does not use an HTTP(S) scheme.</exception>
            public static string GetWebPageSource(string url)
            {
                // NOTE(review): this replaces the process-wide certificate validation callback
                // and accepts every server certificate, which disables TLS validation for all
                // requests made through ServicePointManager. Kept because existing callers may
                // rely on it; confirm it is really wanted.
                ServicePointManager.ServerCertificateValidationCallback =
                    new System.Net.Security.RemoteCertificateValidationCallback(
                        delegate(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
                        {
                            return true;
                        });

                HttpWebRequest request = WebRequest.CreateDefault(new Uri(url)) as HttpWebRequest;
                if (request == null)
                    throw new ArgumentException("Only HTTP and HTTPS URLs are supported: " + url, "url");

                request.Method = "GET";
                request.Proxy = new WebProxy { UseDefaultCredentials = true };

                byte[] bytes;
                string charset;

                // Dispose the response and its stream so the underlying connection is released.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                using (MemoryStream buffered = new MemoryStream())
                {
                    // Keep the raw bytes: they may have to be decoded twice.
                    responseStream.CopyTo(buffered);
                    bytes = buffered.ToArray();

                    // Must be read before the response is disposed.
                    charset = response.CharacterSet;
                }

                // First pass: decode with the charset announced by the server.
                Encoding headerEncoding = GetEncodingOrNull(charset) ?? Encoding.UTF8;
                string html = DecodeText(bytes, headerEncoding);

                // Second pass: prefer the charset declared inside the document, if it is usable.
                Encoding declaredEncoding = GetEncodingOrNull(Pubs.DetectCharset(html));
                if (declaredEncoding != null && !declaredEncoding.Equals(headerEncoding))
                    html = DecodeText(bytes, declaredEncoding);

                return html;
            }

            /// <summary>
            /// Resolves a charset label to an <see cref="Encoding"/>, or returns null when the
            /// label is empty or not recognised (instead of throwing).
            /// </summary>
            private static Encoding GetEncodingOrNull(string charset)
            {
                if (string.IsNullOrWhiteSpace(charset))
                    return null;

                try
                {
                    // Labels sometimes arrive wrapped in quotes, e.g. charset="utf-8".
                    return Encoding.GetEncoding(charset.Trim().Trim('"', '\''));
                }
                catch (ArgumentException)
                {
                    return null;
                }
            }

            /// <summary>
            /// Decodes the whole byte array to a string with the given encoding.
            /// </summary>
            private static string DecodeText(byte[] bytes, Encoding encoding)
            {
                using (MemoryStream stream = new MemoryStream(bytes))
                using (StreamReader reader = new StreamReader(stream, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
    
    /// <summary>
            /// Looks for a charset declared by a &lt;meta&gt; element in the &lt;head&gt; of the given HTML.
            /// </summary>
            /// <remarks>
            /// The HTML is written into a temporary <see cref="WebBrowser"/> control so the
            /// browser's own parser locates the meta elements; the control is disposed before
            /// returning.
            /// NOTE(review): writing the markup into a browser control may run scripts embedded
            /// in it — confirm this is acceptable for untrusted pages.
            /// </remarks>
            /// <param name="html">HTML source to inspect.</param>
            /// <returns>
            /// The declared charset name (trimmed, without quotes or trailing parameters), or
            /// <see cref="string.Empty"/> when none is found.
            /// </returns>
            public static string DetectCharset(string html)
            {
                if (string.IsNullOrEmpty(html))
                    return string.Empty;

                // WebBrowser wraps a COM component; dispose it deterministically to avoid leaks.
                using (var wb = new WebBrowser())
                {
                    wb.ScriptErrorsSuppressed = true;
                    wb.Navigate("about:blank");

                    HtmlDocument blank = wb.Document;
                    if (blank == null)
                        return string.Empty;
                    blank.Write(html);

                    // Re-read the property after writing, as the original code did.
                    HtmlDocument document = wb.Document;
                    if (document == null)
                        return string.Empty;

                    var heads = document.GetElementsByTagName("head");
                    if (heads.Count == 0)
                        return string.Empty;

                    foreach (HtmlElement child in heads[0].Children)
                    {
                        var meta = child.DomElement as mshtml.HTMLMetaElement;
                        if (meta == null)
                            continue;

                        try
                        {
                            // <meta charset="...">
                            string declared = meta.charset;
                            if (!string.IsNullOrWhiteSpace(declared))
                                return declared.Trim();

                            // <meta http-equiv="Content-Type" content="text/html; charset=...">
                            string fromContent = ExtractCharsetFromContent(meta.content);
                            if (fromContent.Length > 0)
                                return fromContent;
                        }
                        catch
                        {
                            // Best effort: a meta element whose properties cannot be read is skipped.
                            continue;
                        }
                    }
                }

                return string.Empty;
            }

            /// <summary>
            /// Extracts the value following "charset=" (matched case-insensitively) from a meta
            /// content attribute, stopping at the first separator, quote or whitespace.
            /// Returns <see cref="string.Empty"/> when there is no such value.
            /// </summary>
            private static string ExtractCharsetFromContent(string content)
            {
                const string marker = "charset=";

                if (content == null)
                    return string.Empty;

                int start = content.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
                if (start < 0)
                    return string.Empty;

                string value = content.Substring(start + marker.Length).TrimStart(' ', '\t', '"', '\'');
                int end = value.IndexOfAny(new char[] { ';', ',', ' ', '\t', '"', '\'' });
                if (end >= 0)
                    value = value.Substring(0, end);

                return value;
            }
    View Code

    反正已经使用了WebBrowser了,就干脆直接用WebBrowser请求得了,而且没有乱码,代码又少,使用中没发现有什么大的问题,比较稳定,但发现它归根结底是一个COM组件,导致程序运行时内存一直在上升,最后找到原因后在适当的位置Dispose就没什么问题了:

    /// <summary>
            /// Creates a <see cref="WebBrowser"/>, navigates it to the given URL and pumps the
            /// message loop until the document is loaded or the timeout expires.
            /// </summary>
            /// <param name="url">Address to navigate to.</param>
            /// <param name="secondsTimeOut">Maximum time to wait, in seconds.</param>
            /// <returns>
            /// The browser control; on timeout its document may still be loading. The caller
            /// owns the control and must call <c>Dispose</c> on it when finished.
            /// </returns>
            public static WebBrowser GetBrowser(string url, int secondsTimeOut)
            {
                var wb = new WebBrowser();
                try
                {
                    wb.ScriptErrorsSuppressed = true;
                    wb.Navigate(url);

                    // Stopwatch is monotonic, so the timeout is immune to system clock changes.
                    var elapsed = System.Diagnostics.Stopwatch.StartNew();
                    while (true)
                    {
                        // Let the control process its pending window messages.
                        Application.DoEvents();

                        bool loaded = wb.ReadyState == WebBrowserReadyState.Loaded
                            || wb.ReadyState == WebBrowserReadyState.Complete;
                        if (loaded || elapsed.Elapsed.TotalSeconds > secondsTimeOut)
                            break;

                        // Yield briefly instead of spinning at full CPU between polls.
                        System.Threading.Thread.Sleep(10);
                    }

                    return wb;
                }
                catch
                {
                    // Do not leak the control when navigation or message pumping fails.
                    wb.Dispose();
                    throw;
                }
            }
  • 相关阅读:
    【POJ 3669】Meteor Shower
    【BZOJ 1003】[ZJOI2006]物流运输trans
    【POJ 3662】Telephone Lines
    【UVa 1593】Alignment of Code
    【POJ 3661】Running
    [HNOI2015]开店 简要题解
    trie上构建后缀数组
    [CQOI2017]老C的方块
    [JSOI2018]潜入行动 (树形背包)
    李超线段树 总结
  • 原文地址:https://www.cnblogs.com/nanfei/p/3162542.html
Copyright © 2011-2022 走看看