zoukankan      html  css  js  c++  java
  • 获取HTML源码(只取文字,判断编码,过滤标签)

    /// <summary>
    /// Click handler: reads a URL from <c>textBox1</c>, and when it looks like
    /// <c>scheme://rest</c>, downloads the page and strips it down to plain text.
    /// </summary>
    /// <param name="sender">Control that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        string url = this.textBox1.Text;

        // Scheme must be ASCII letters. The previous class "[a-zA-z]" was a typo that
        // also admitted '[', '\', ']', '^', '_' and '`' (the code points between 'Z' and 'a').
        if (!Regex.IsMatch(url, @"[a-zA-Z]+://[^\s]*"))
        {
            return;
        }

        try
        {
            string html = GetHtml(url);
            if (html == null)
            {
                // GetHtml returns null when the request failed; nothing to strip.
                return;
            }

            // NOTE(review): the extracted text is not consumed anywhere in this handler;
            // presumably it should be displayed or stored - confirm the intended target.
            string text = StripHTML(html);
        }
        catch (Exception ex)
        {
            // Best-effort handler: keep the UI alive, but leave a trace instead of
            // silently discarding the failure.
            System.Diagnostics.Debug.WriteLine(ex);
        }
    }

    1.获取HTML

    GetHtml(String Url)

    View Code
            /// <summary>
            /// Downloads the resource at <paramref name="Url"/> and returns its body as text.
            /// The body is decoded as UTF-8 first; if that result contains replacement
            /// characters (as reported by <c>isLuan</c>), it is decoded as GB2312 instead.
            /// </summary>
            /// <param name="Url">Address to request; passed to <see cref="WebRequest.Create(string)"/>.</param>
            /// <returns>
            /// The decoded body, or <c>null</c> when obtaining the response fails
            /// (the error text is shown in a message box in that case).
            /// </returns>
            public String GetHtml(String Url)
            {
                WebRequest request = WebRequest.Create(Url);
                request.Timeout = 50000; // milliseconds

                WebResponse response;
                try
                {
                    response = request.GetResponse();
                }
                catch (WebException e)
                {
                    MessageBox.Show(e.Message);
                    return null;
                }
                catch (Exception e)
                {
                    MessageBox.Show(e.ToString());
                    return null;
                }

                // Fetch the body once and keep the raw bytes so both candidate encodings
                // can be tried without issuing a second HTTP request.
                byte[] body;
                using (response)
                using (Stream stream = response.GetResponseStream())
                {
                    body = ReadResponseBytes(stream);
                }

                string utf8Text = DecodeResponse(body, "UTF-8");
                if (!isLuan(utf8Text))
                {
                    return utf8Text;
                }

                return DecodeResponse(body, "GB2312");
            }

            /// <summary>Reads <paramref name="stream"/> to its end and returns everything read.</summary>
            private static byte[] ReadResponseBytes(Stream stream)
            {
                using (MemoryStream collected = new MemoryStream())
                {
                    byte[] chunk = new byte[8192];
                    int read;
                    while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
                    {
                        collected.Write(chunk, 0, read);
                    }

                    return collected.ToArray();
                }
            }

            /// <summary>Decodes <paramref name="data"/> through a StreamReader using the named encoding.</summary>
            private static string DecodeResponse(byte[] data, string encodingName)
            {
                // A StreamReader (rather than Encoding.GetString) keeps the decoding path
                // the same as reading the response stream directly with that encoding.
                using (MemoryStream buffer = new MemoryStream(data))
                using (StreamReader reader = new StreamReader(buffer, Encoding.GetEncoding(encodingName)))
                {
                    return reader.ReadToEnd();
                }
            }

      

    2.去除HTML标记(正则表达式)

    StripHTML(string strHtml)

    View Code
     /// <summary>
     /// Reduces HTML markup to its text content: script and style elements are removed
     /// with their contents, tags are stripped, <c>&amp;nbsp;</c> entities become spaces,
     /// line breaks, tabs and apostrophes are removed, and runs of spaces are collapsed
     /// to a single space.
     /// </summary>
     /// <param name="strHtml">HTML source text; may be <c>null</c> or empty.</param>
     /// <returns>The remaining text, or an empty string for <c>null</c>/empty input.</returns>
     public static string StripHTML(string strHtml)
     {
         if (string.IsNullOrEmpty(strHtml))
         {
             return string.Empty;
         }

         // Drop <script> and <style> elements together with everything inside them.
         strHtml = Regex.Replace(strHtml, "(?is)<script[^>]*>.*?</script>", "");
         strHtml = Regex.Replace(strHtml, "(?is)<style[^>]*>.*?</style>", "");

         // Non-breaking-space entities become ordinary spaces (any letter case).
         strHtml = Regex.Replace(strHtml, "(?i)&nbsp;", " ");

         // Remove line breaks, then every tag that has no nested angle bracket.
         strHtml = Regex.Replace(strHtml, @"[\r\n]+", "");
         strHtml = Regex.Replace(strHtml, "<[^<>]*>", "");

         // Remove tabs and apostrophes.
         strHtml = Regex.Replace(strHtml, "\t+", "");
         strHtml = strHtml.Replace("'", "");

         // Collapse each run of spaces to ONE space (the replacement used to be two
         // spaces, which doubled every single space in the text).
         strHtml = Regex.Replace(strHtml, " +", " ");

         // Second pass: tag-like fragments that only formed after the first removal,
         // then any stray angle brackets and entities exposed by removing them.
         strHtml = Regex.Replace(strHtml, "<.+?>", "");
         return strHtml.Replace("<", "").Replace(">", "").Replace("&nbsp;", "");
     }

    3.判断是否为乱码(编码):在GetHtml里调用

    View Code
            /// <summary>
            /// Reports whether the UTF-8 encoding of <paramref name="txt"/> contains the byte
            /// sequence EF BF BD (239 191 189), i.e. the Unicode replacement character U+FFFD.
            /// The caller uses this as a sign that decoding produced garbled text.
            /// </summary>
            /// <param name="txt">Text to inspect; may be <c>null</c> or empty.</param>
            /// <returns><c>true</c> if the sequence is found; otherwise <c>false</c>.</returns>
            bool isLuan(string txt)
            {
                if (string.IsNullOrEmpty(txt))
                {
                    return false;
                }

                byte[] bytes = Encoding.UTF8.GetBytes(txt);

                // Stop at the last index where a full 3-byte window still fits. The old bound
                // (i < Length - 3) skipped the final window, so a replacement character at the
                // very end of the text went undetected.
                for (int i = 0; i + 2 < bytes.Length; i++)
                {
                    if (bytes[i] == 239 && bytes[i + 1] == 191 && bytes[i + 2] == 189)
                    {
                        return true;
                    }
                }

                return false;
            }
  • 相关阅读:
    Ubuntu 20.04 不能远程连接
    CentOS 6.8 设置开机自动联网
    JSON 语法
    用友U8 | 【成本管理】用友U8卷积运算时警告提示:‘’有未记账非委外加工入库单代管挂账确认单‘’
    用友U8 | 【总账】总账结账时,对账不平
    用友U8 | 【应收款管理】取消核销操作
    用友U8 | 【总账】账簿明细账打印,选择科目打印,页数范围超过了430页,之后的内容都显示不出来
    用友U8 | 【存货核算】存货模块删除凭证时提示:当前凭证已经有实时核销处理,不能被作废(或删除)!
    用友U8 | 【存货核算】存货核算模块,凭证处理,查询凭证时,会计年度选择不到2021年度
    用友U8 | 【总账】科目辅助总账与科目辅助明细账数据不一样
  • 原文地址:https://www.cnblogs.com/tangge/p/2801547.html
Copyright © 2011-2022 走看看