zoukankan      html  css  js  c++  java
  • 新人补钙系列教程之:正则过滤

    '功能:使用正则表达式对字符串进行替换
    复制内容到剪贴板 程序代码

    ' RegReplace: replaces every case-insensitive match of a regular expression
    ' in a string.
    '
    ' Parameters:
    '   Str        - text to search; Null yields "".
    '   PatternStr - VBScript RegExp pattern.
    '   RepStr     - replacement text; may reference capture groups ($1, $2, ...).
    ' Returns: the text with all matches replaced. Str itself is not modified.
    Function RegReplace(Str, PatternStr, RepStr)
        Dim subject, matcher
        subject = Str
        If IsNull(subject) Then
            ' Null cannot be searched; report it as an empty string.
            RegReplace = ""
        Else
            Set matcher = New RegExp
            With matcher
                .Pattern = PatternStr
                .Global = True        ' replace all matches, not just the first
                .IgnoreCase = True
            End With
            RegReplace = matcher.Replace(subject, RepStr)
        End If
    End Function
    复制代码

    '过滤HTML各种标签样式脚本

    ' HTMLFilter: removes selected kinds of markup from an HTML string.
    '
    ' Parameters:
    '   sHTML    - HTML text to clean. Null or empty input yields "".
    '   sFilters - comma-separated, case-insensitive list of filter names;
    '              defaults to "SCRIPT,OBJECT" when Null or empty. Names:
    '                JORKIN    collapse runs of spaces
    '                SCRIPT    <script> blocks, quoted on*="" handlers, script: URLs
    '                FIXIMG    reduce <img> tags to "<img src=... border=0>"
    '                TABLE     table markup (rows become <p>, cells become spaces)
    '                CLASS     class attributes
    '                STYLE     style attributes
    '                XML       <?xml ...> declarations
    '                NAMESPACE namespaced tags such as <o:p>
    '                FONT, MARQUEE, OBJECT (also <param>/<embed>), COMMENT
    '              Any other name is treated as a tag name whose opening and
    '              closing tags are removed. Empty entries are ignored.
    ' Returns: the filtered string. The caller's arguments are not modified.
    '
    ' NOTE: this is regex-based clean-up, not a complete HTML sanitizer. For
    ' example, unquoted event handlers (onclick=foo()) are not removed.
    Function HTMLFilter(sHTML, sFilters)
        Dim html, filterList, aFilters, sTag, i

        HTMLFilter = ""
        If sHTML & "" = "" Then Exit Function

        ' Work on local copies: VBScript passes arguments ByRef by default, so
        ' assigning to the parameters would overwrite the caller's variables.
        html = sHTML
        filterList = sFilters & ""
        If filterList = "" Then filterList = "SCRIPT,OBJECT"

        aFilters = Split(UCase(filterList), ",")
        For i = 0 To UBound(aFilters)
            sTag = Trim(aFilters(i))
            Select Case sTag
                Case ""
                    ' Empty entry (e.g. trailing comma). Skip it: an empty tag
                    ' name would build a pattern that strips every tag.
                Case "JORKIN"
                    Do While InStr(html, "     ") >0
                        html = Replace(html, "     ", "    ")
                    Loop
                Case "SCRIPT"
                    ' Remove <script>...</script> blocks.
                    html = RegReplace(html, "<SCRIPT\b[\s\S]*?</SCRIPT\s*>", "")
                    ' Remove quoted inline event handlers (onload="...", onclick='...').
                    html = RegReplace(html, "\son\w+\s*=\s*([""'])[\s\S]*?\1", "")
                    ' Defuse script URL schemes by swapping the ASCII colon for a
                    ' full-width colon (U+FF1A). ChrW keeps the source file ASCII.
                    html = RegReplace(html, "(JAVASCRIPT|JSCRIPT|VBSCRIPT|VBS):", "$1" & ChrW(&HFF1A))
                Case "FIXIMG"
                    ' Quoted src: keep the original quote character around the value.
                    html = RegReplace(html, "<IMG\b[^>]*?\sSRC\s*=\s*([""'])(.*?)\1[^>]*>", "<img src=$1$2$1 border=0>")
                    ' Unquoted src.
                    html = RegReplace(html, "<IMG\b[^>]*?\sSRC\s*=\s*([^""'\s>]+)[^>]*>", "<img src=$1 border=0>")
                Case "TABLE"
                    ' Remove table markup: <table>, <tbody>, <tr>, <th>, <td>.
                    html = RegReplace(html, "</?TABLE[^>]*>", "")
                    html = RegReplace(html, "</?TBODY[^>]*>", "")
                    ' \b keeps <track> from being rewritten as a paragraph.
                    html = RegReplace(html, "<(/?)TR\b[^>]*>", "<$1p>")
                    html = RegReplace(html, "</?TH[^>]*>", " ")
                    html = RegReplace(html, "</?TD[^>]*>", " ")
                Case "CLASS"
                    ' Remove class attributes. Quoted values go first so that a
                    ' value containing spaces is removed whole.
                    html = RegReplace(html, "\sCLASS\s*=\s*([""'])[\s\S]*?\1", "")
                    html = RegReplace(html, "(<[^>]+) CLASS=[^\s>""']+([^>]*>)", "$1 $2")
                Case "STYLE"
                    ' Remove style attributes; same ordering as CLASS.
                    html = RegReplace(html, "\sSTYLE\s*=\s*([""'])[\s\S]*?\1", "")
                    html = RegReplace(html, "(<[^>]+) STYLE=[^\s>""']+([^>]*>)", "$1 $2")
                Case "XML"
                    ' Remove XML declarations such as <?xml ...>. VBScript strings
                    ' have no backslash escapes, so a single "\?" is the literal "?".
                    html = RegReplace(html, "<\?XML[^>]*>", "")
                Case "NAMESPACE"
                    ' Remove namespaced tags such as <o:p></o:p>.
                    html = RegReplace(html, "<\/?[a-z]+:[^>]*>", "")
                Case "FONT"
                    ' Remove <font></font> tags.
                    html = RegReplace(html, "</?FONT[^>]*>", "")
                Case "MARQUEE"
                    ' Remove <marquee></marquee> tags.
                    html = RegReplace(html, "</?MARQUEE[^>]*>", "")
                Case "OBJECT"
                    ' Remove <object>, <param> and <embed> tags.
                    html = RegReplace(html, "</?OBJECT[^>]*>", "")
                    html = RegReplace(html, "</?PARAM[^>]*>", "")
                    html = RegReplace(html, "</?EMBED[^>]*>", "")
                Case "COMMENT"
                    ' Remove HTML comments. This also removes comments inside
                    ' <script> and <style>, so use with care.
                    html = RegReplace(html, "<!--[\s\S]*?-->", "")
                Case Else
                    ' Remove any other tag by name. \b stops a short name such
                    ' as "B" from also matching <br> or <body>.
                    html = RegReplace(html, "</?" & sTag & "\b[^>]*>", "")
            End Select
        Next
        HTMLFilter = html
    End Function
    复制代码

    过滤全部html

    复制内容到剪贴板 程序代码

    <\/*[^<>]*>


    过滤 style

    复制内容到剪贴板 程序代码

    (<style)+[^<>]*>[^\0]*(<\/style>)+


    过滤 层 div

    复制内容到剪贴板 程序代码

    <(\/){0,1}div[^<>]*>


    过滤 链接 a :

    复制内容到剪贴板 程序代码

    <(\/){0,1}a[^<>]*>


    过滤 字体 font

    复制内容到剪贴板 程序代码

    <(\/){0,1}font[^<>]*>


    过滤 span 

    复制内容到剪贴板 程序代码

    <(\/){0,1}span[^<>]*>


    过滤 object 

    复制内容到剪贴板 程序代码

    <object.*?/object>

    过滤 iframe

    复制内容到剪贴板 程序代码

    (<iframe){1,}[^<>]*>[^\0]*(<\/iframe>){1,}

    过滤 script:

    复制内容到剪贴板 程序代码

    (<script){1,}[^<>]*>[^\0]*(<\/script>){1,}


    过滤 Class 

    复制内容到剪贴板 程序代码

    (class=){1,}(""|\'){0,1}\S+(""|\'|>|\s){0,1}

    过滤 style 和 strong

    复制内容到剪贴板 程序代码

    <(style|strong)[^>]*>|<\/(style|strong)>

    过滤 img

    复制内容到剪贴板 程序代码

    <(img)[^>]*>|<\/(img)>


    过滤 table tr td 等

    复制内容到剪贴板 程序代码

    <(table|tbody|tr|td|th)[^>]*>|<\/(table|tbody|tr|td|th)>


    过滤 div blockquote fieldset legend

    复制内容到剪贴板 程序代码

    <(div|blockquote|fieldset|legend)[^>]*>|<\/(div|blockquote|fieldset|legend)>


    过滤 font i u h1-h9 s


    复制内容到剪贴板 程序代码

    <(font|i|u|h[1-9]|s)[^>]*>|<\/(font|i|u|h[1-9]|s)>


    过滤 style 和 strong


    复制内容到剪贴板 程序代码

    <(style|strong)[^>]*>|<\/(style|strong)>


    过滤 链接 a

    复制内容到剪贴板 程序代码

    <a[^>]*>|<\/a>


    过滤 meta iframe frame span tbody layer

    复制内容到剪贴板 程序代码

    <(meta|iframe|frame|span|tbody|layer)[^>]*>|<\/(iframe|frame|meta|span|tbody|layer)>


    过滤 换行 br

    复制内容到剪贴板 程序代码

    <br[^>]*>

  • 相关阅读:
    boost::asio在VS2008下的编译错误
    Java集合框架——接口
    ACM POJ 3981 字符串替换(简单题)
    ACM HDU 1042 N!(高精度计算阶乘)
    OneTwoThree (Uva)
    ACM POJ 3979 分数加减法(水题)
    ACM HDU 4004 The Frog's Games(2011ACM大连赛区第四题)
    Hexadecimal View (2011ACM亚洲大连赛区现场赛D题)
    ACM HDU 4002 Find the maximum(2011年大连赛区网络赛第二题)
    ACM HDU 4001 To Miss Our Children Time (2011ACM大连赛区网络赛)
  • 原文地址:https://www.cnblogs.com/keng333/p/2299769.html
Copyright © 2011-2022 走看看