zoukankan      html  css  js  c++  java
  • 截取指定长度html内容,并保留html格式标记

    /// <summary>
        /// Truncates a string to an approximate byte length while keeping its HTML
        /// markup well-formed. Tags and character references (such as <c>&amp;nbsp;</c>)
        /// are always copied whole; a tag costs nothing and a character reference costs
        /// one byte. Any element still open at the cut point is closed at the very end.
        /// </summary>
        /// <remarks>
        /// A text character counts as two bytes when <see cref="System.Text.Encoding.Default"/>
        /// encodes it with more than one byte, otherwise as one, so the exact cut point
        /// depends on the platform's default encoding. Elements whose end tag is optional
        /// or forbidden (br, img, p, li, td, ...) are never auto-closed.
        /// </remarks>
        /// <param name="param">The text, possibly containing HTML, to truncate. Null is treated as empty.</param>
        /// <param name="length">The byte budget for visible text. Zero or less keeps no text.</param>
        /// <param name="end">Suffix (for example an ellipsis) that is always appended after the kept text, before the generated closing tags.</param>
        /// <returns>The truncated markup, followed by <paramref name="end"/> and the closing tags for elements left open.</returns>
        public string SubstringToHTML(string param, int length, string end)
        {
            char[] chars = (param ?? string.Empty).ToCharArray();
            StringBuilder result = new StringBuilder();
            ArrayList openTags = new ArrayList(); // names of elements opened but not yet closed, outermost first
            // A '<' located after the final '>' can never be a complete tag.
            int lastClose = System.Array.LastIndexOf(chars, '>');
            int n = 0; // bytes of visible text consumed so far
            int i = 0;

            while (i < chars.Length && n < length)
            {
                char c = chars[i];

                // Copy a whole tag in one step so '&', ';' or quotes inside attributes
                // can never disturb the byte count.
                if (c == '<' && i < lastClose && IsHtmlTagStart(chars, i))
                {
                    int close = System.Array.IndexOf(chars, '>', i + 1);
                    TrackHtmlTag(chars, i, close, openTags);
                    result.Append(chars, i, close - i + 1);
                    i = close + 1;
                    continue;
                }

                // A complete character reference is displayed as one character.
                if (c == '&')
                {
                    int semicolon = FindHtmlEntityEnd(chars, i);
                    if (semicolon >= 0)
                    {
                        result.Append(chars, i, semicolon - i + 1);
                        n = n + 1;
                        i = semicolon + 1;
                        continue;
                    }
                }

                // Plain text. Keep a surrogate pair together so the output never ends
                // on half of a character.
                int run = 1;
                if (char.IsHighSurrogate(c) && i + 1 < chars.Length && char.IsLowSurrogate(chars[i + 1]))
                {
                    run = 2;
                }

                n = n + 1;
                if (System.Text.Encoding.Default.GetByteCount(chars, i, run) > 1)
                {
                    // Multi-byte (wide) characters are budgeted as two bytes.
                    n = n + 1;
                }

                result.Append(chars, i, run);
                i += run;
            }

            result.Append(end);

            // Close whatever is still open, innermost element first.
            for (int k = openTags.Count - 1; k >= 0; k--)
            {
                result.Append("</");
                result.Append((string)openTags[k]);
                result.Append(">");
            }

            return result.ToString();
        }

        /// <summary>
        /// Elements that are never auto-closed because their end tag is optional or forbidden.
        /// </summary>
        private static readonly string[] NoEndTagNames =
        {
            "area", "base", "basefont", "body", "br", "col", "colgroup", "dd", "dt",
            "frame", "head", "hr", "html", "img", "input", "isindex", "li", "link",
            "meta", "option", "p", "param", "tbody", "td", "tfoot", "th", "thead", "tr"
        };

        /// <summary>Returns true for the ASCII letters a-z and A-Z.</summary>
        private static bool IsAsciiLetter(char c)
        {
            return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
        }

        /// <summary>Returns true for the ASCII digits 0-9.</summary>
        private static bool IsAsciiDigit(char c)
        {
            return c >= '0' && c <= '9';
        }

        /// <summary>
        /// Returns true when the '&lt;' at <paramref name="index"/> is followed by a character
        /// that can begin a tag, comment or declaration, so that text such as "1 &lt; 2" is not
        /// mistaken for markup.
        /// </summary>
        private static bool IsHtmlTagStart(char[] chars, int index)
        {
            if (index + 1 >= chars.Length)
            {
                return false;
            }

            char next = chars[index + 1];
            return next == '/' || next == '!' || next == '?' || IsAsciiLetter(next);
        }

        /// <summary>
        /// Updates the list of open elements for the tag spanning chars[open..close].
        /// An opening tag is pushed; a closing tag removes the nearest open element with
        /// the same name. Comments, declarations, self-closing tags and the elements in
        /// <see cref="NoEndTagNames"/> are ignored.
        /// </summary>
        private static void TrackHtmlTag(char[] chars, int open, int close, ArrayList openTags)
        {
            int p = open + 1;
            bool closing = chars[p] == '/';
            if (closing)
            {
                p++;
            }

            int nameStart = p;
            while (p < close && (IsAsciiLetter(chars[p]) || IsAsciiDigit(chars[p]) || chars[p] == '-' || chars[p] == ':'))
            {
                p++;
            }

            // No name, or a name that does not start with a letter: comment, doctype, "</>", ...
            if (p == nameStart || !IsAsciiLetter(chars[nameStart]))
            {
                return;
            }

            string name = new string(chars, nameStart, p - nameStart);
            for (int k = 0; k < NoEndTagNames.Length; k++)
            {
                if (string.Equals(NoEndTagNames[k], name, System.StringComparison.OrdinalIgnoreCase))
                {
                    return;
                }
            }

            if (closing)
            {
                for (int k = openTags.Count - 1; k >= 0; k--)
                {
                    if (string.Equals((string)openTags[k], name, System.StringComparison.OrdinalIgnoreCase))
                    {
                        openTags.RemoveAt(k);
                        break;
                    }
                }
            }
            else if (chars[close - 1] != '/')
            {
                // "<x ... />" closes itself, so only a plain opening tag is remembered.
                openTags.Add(name);
            }
        }

        /// <summary>
        /// If a complete character reference (&amp;name; or &amp;#digits;) starts at
        /// <paramref name="amp"/>, returns the index of its ';'. Otherwise returns -1 so
        /// that a bare ampersand is treated as ordinary text.
        /// </summary>
        private static int FindHtmlEntityEnd(char[] chars, int amp)
        {
            int p = amp + 1;
            if (p < chars.Length && chars[p] == '#')
            {
                p++;
            }

            int bodyStart = p;
            while (p < chars.Length && (IsAsciiLetter(chars[p]) || IsAsciiDigit(chars[p])))
            {
                p++;
            }

            if (p > bodyStart && p < chars.Length && chars[p] == ';')
            {
                return p;
            }

            return -1;
        }

  • 相关阅读:
    VC 常见问题百问
    python windows 环境变量
    Check server headers and verify HTTP Status Codes
    Where are the AES 256bit cipher suites? Please someone help
    outlook 如何预订会议和会议室
    安装Axis2的eclipse插件后,未出现界面
    windows 环境变量
    python 时间日期处理汇集
    openldap学习笔记(使用openldap2.3.32)
    set p4 environment in windows
  • 原文地址:https://www.cnblogs.com/yinzhilei/p/7767005.html
Copyright © 2011-2022 走看看