zoukankan      html  css  js  c++  java
  • C#获取网页内容,并且处理正确编码

    控制台调用
    // Console entry point: detects the charset declared by the page, then downloads
    // the page decoded with that charset.
    static void Main(string[] args)
    {
        const string url = "http://www.cnblogs.com";

        // GetEncodings returns null when the request fails or no charset is found,
        // and Encoding.GetEncoding throws on a null or unknown name. Start from
        // UTF-8 and only replace it when a usable charset name was detected.
        Encoding encoding = Encoding.UTF8;
        string code = GetEncodings(url);
        if (!string.IsNullOrEmpty(code))
        {
            try
            {
                encoding = Encoding.GetEncoding(code);
            }
            catch (ArgumentException)
            {
                // Unknown or unsupported charset name: keep the UTF-8 fallback.
            }
        }

        // The decoded page source; this demo does nothing further with it.
        string html = GetHtml(url, encoding);
    }

    下面的代码不重要,只是可以获取标题或其它内容

    // Gets the HTML content of a web page. No Encoding is supplied here: a null
            // encoding is handed to GetHtmls, which then chooses the Encoding from the
            // charset declared in the page.
            static string GetHtml(string url)
            {
                Encoding unspecified = null;
                string html = GetHtmls(url, unspecified);
                return html;
            }
    
            // 获取网页的HTML内容,指定Encoding 
            static string GetHtmls(string url, Encoding encoding)
            {
                byte[] buf = new WebClient().DownloadData(url);
                if (encoding != null) return encoding.GetString(buf);
                string html = Encoding.UTF8.GetString(buf);
                encoding = GetEncoding(html);
                if (encoding == null || encoding == Encoding.UTF8) return html;
                return encoding.GetString(buf);
            }
    
            // 根据网页的HTML内容提取网页的Encoding 
            static Encoding GetEncoding(string html)
            {
                string pattern = @"(?i)charset=(? <charset>[-a-zA-Z_0-9]+)";
                string charset = Regex.Match(html, pattern).Groups["charset"].Value;
                try { return Encoding.GetEncoding(charset); }
                catch (ArgumentException) { return null; }
            }
    
            // 根据网页的HTML内容提取网页的Title 
            static string GetTitle(string html)
            {
                string pattern = @"(?si) <title(?:s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>(? <title>.*?) </title>";
                return Regex.Match(html, pattern).Groups["title"].Value.Trim();
            }
    
            // 打印网页的Encoding和Title 
            static void PrintEncodingAndTitle(string url)
            {
                string html = GetHtml(url);
                Console.WriteLine("[{0}] [{1}]", GetEncoding(html), GetTitle(html));
            } 
    里面的代码不重要,只是获取其它的内容
    /// <summary>
            /// Downloads the response body of <paramref name="url"/> and decodes it with
            /// <paramref name="encoding"/>.
            /// </summary>
            /// <remarks>
            /// This is a best-effort call: it never throws. Redirects are not followed, the
            /// request times out after 20 seconds, and a body is only read when the status
            /// is 200 OK and the reported content length is below 1 MiB. A body whose
            /// Content-Encoding is "gzip" is decompressed before decoding.
            /// </remarks>
            /// <param name="url">Address of the page to download.</param>
            /// <param name="encoding">Encoding used to decode the response body.</param>
            /// <returns>
            /// The decoded body, or <see cref="string.Empty"/> when an argument is missing,
            /// the response is not accepted, or any error occurs.
            /// </returns>
            public static string GetHtml(string url, Encoding encoding)
            {
                // Missing arguments always ended in an empty result (the resulting exception
                // was swallowed); return it up front instead of issuing a pointless request.
                if (string.IsNullOrEmpty(url) || encoding == null)
                {
                    return string.Empty;
                }

                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                    request.Timeout = 20000;
                    request.AllowAutoRedirect = false;

                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
                        {
                            return string.Empty;
                        }

                        Stream body = response.GetResponseStream();
                        if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.OrdinalIgnoreCase))
                        {
                            body = new GZipStream(body, CompressionMode.Decompress);
                        }

                        // Disposing the reader also disposes the stream chain beneath it,
                        // and it happens before the response itself is disposed.
                        using (StreamReader reader = new StreamReader(body, encoding))
                        {
                            return reader.ReadToEnd();
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Callers rely on an empty string instead of an exception; keep that
                    // contract but leave a trace so failures are not completely silent.
                    System.Diagnostics.Trace.TraceWarning("GetHtml failed for {0}: {1}", url, ex.Message);
                    return string.Empty;
                }
            }
    
            /// <summary>
            /// Downloads <paramref name="url"/> and returns the charset name declared in
            /// the page text (for example <c>utf-8</c> from <c>charset=utf-8</c>).
            /// </summary>
            /// <remarks>
            /// This is a best-effort call: it never throws. Redirects are not followed, the
            /// request times out after 20 seconds, and a body is only read when the status
            /// is 200 OK and the reported content length is below 1 MiB. A body whose
            /// Content-Encoding is "gzip" is decompressed before it is searched.
            /// </remarks>
            /// <param name="url">Address of the page to inspect.</param>
            /// <returns>
            /// The declared charset name, or <c>null</c> when the url is missing, the
            /// response is not accepted, no charset declaration is found, or any error occurs.
            /// </returns>
            public static string GetEncodings(string url)
            {
                if (string.IsNullOrEmpty(url))
                {
                    return null;
                }

                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                    request.Timeout = 20000;
                    request.AllowAutoRedirect = false;

                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
                        {
                            return null;
                        }

                        Stream body = response.GetResponseStream();
                        if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.OrdinalIgnoreCase))
                        {
                            body = new GZipStream(body, CompressionMode.Decompress);
                        }

                        // The real encoding is not known yet. Charset names are plain ASCII,
                        // so an ASCII decode is enough to locate the declaration.
                        using (StreamReader reader = new StreamReader(body, Encoding.ASCII))
                        {
                            string html = reader.ReadToEnd();

                            // Allows whitespace around '=' and one optional opening quote, and
                            // captures only the name itself, so no trailing markup needs to be
                            // stripped and no fixed-length substring can run past the end.
                            Match declaration = Regex.Match(
                                html,
                                @"charset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)",
                                RegexOptions.IgnoreCase);

                            return declaration.Success ? declaration.Groups["charset"].Value : null;
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Callers rely on null instead of an exception; keep that contract
                    // but leave a trace so failures are not completely silent.
                    System.Diagnostics.Trace.TraceWarning("GetEncodings failed for {0}: {1}", url, ex.Message);
                    return null;
                }
            }
    这里才是真正的代码,这里一个是获取正确的编码,一个是根据编码解析源码
  • 相关阅读:
    定时机制
    选择排序
    二分插入排序
    无名管道pipe
    Makefile
    Python下划线与命名规范
    Django IDE 开发环境的搭建
    Apache如何添加虚拟目录
    在Eclipse下如何安装插件
    Python的模块、包等概念的理解
  • 原文地址:https://www.cnblogs.com/xiaofengfeng/p/3146072.html
Copyright © 2011-2022 走看看