zoukankan      html  css  js  c++  java
  • 正则表达式过滤HTML、JS、CSS

    功能用途

    主要用于提取 HTML 页面内容。

    示例代码

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.IO;
    using System.Net;
    using System.Net.NetworkInformation;
    using System.Net.Sockets;
    using System.Threading;
    using System.Text.RegularExpressions;
    namespace HtmlRegex
    {
        /// <summary>
        /// Helpers for downloading an HTML page and reducing it to plain text by
        /// stripping scripts, styles, comments, tags, character entities and whitespace.
        /// </summary>
        public class BaseRegex
        {
            // Singleline lets '.' match across line breaks so multi-line blocks are removed.
            private const RegexOptions Options = RegexOptions.Singleline | RegexOptions.IgnoreCase;

            // Regex instances are immutable and thread-safe, so build them once.
            private static readonly Regex ScriptBlock = new Regex(@"<script.*?>.*?</script>", Options);
            private static readonly Regex StyleBlock = new Regex(@"<style.*?>.*?</style>", Options);
            private static readonly Regex Comment = new Regex(@"<!--.*?-->", Options);
            private static readonly Regex Tag = new Regex(@"<.*?>", Options);
            private static readonly Regex Entity = new Regex(@"&\S{2,}?;", Options);
            private static readonly Regex BlankLines = new Regex(@"[\r\n]{2,}", Options);

            /// <summary>
            /// Maps the integer encoding flag used by this class to an <see cref="Encoding"/>.
            /// </summary>
            /// <param name="encoding">1 selects UTF-8; any other value selects <see cref="Encoding.Default"/>.</param>
            /// <returns>The selected encoding.</returns>
            private static Encoding ResolveEncoding(int encoding)
            {
                return encoding == 1 ? Encoding.UTF8 : Encoding.Default;
            }

            /// <summary>
            /// Appends <paramref name="content"/> as one line to the file at <paramref name="path"/>.
            /// </summary>
            /// <param name="path">File to append to; it is created if it does not exist.</param>
            /// <param name="encoding">1 for UTF-8; any other value for <see cref="Encoding.Default"/>.</param>
            /// <param name="content">Text to write, followed by a line terminator.</param>
            public void DeBug(string path, int encoding, string content)
            {
                // 'using' guarantees the file handle is released even if the write throws.
                using (StreamWriter writer = new StreamWriter(path, true, ResolveEncoding(encoding)))
                {
                    writer.WriteLine(content);
                }
            }

            /// <summary>
            /// Downloads the resource at <paramref name="url"/> and decodes it as text.
            /// </summary>
            /// <param name="url">Address of the page to download.</param>
            /// <param name="encoding">1 for UTF-8; any other value for <see cref="Encoding.Default"/>.</param>
            /// <returns>The decoded response body.</returns>
            public string getPageContent(string url, int encoding)
            {
                byte[] raw;
                // A WebClient per call is disposed deterministically and is never shared between callers.
                using (WebClient client = new WebClient())
                {
                    raw = client.DownloadData(url);
                }
                return ResolveEncoding(encoding).GetString(raw);
            }

            /// <summary>
            /// Strips script and style blocks, comments, tags, character entities,
            /// runs of line breaks and all space characters from <paramref name="html"/>.
            /// </summary>
            /// <param name="html">Markup to clean; may be null or empty.</param>
            /// <returns>The remaining text, or an empty string when the input is null or empty.</returns>
            public string checkHtml(string html)
            {
                if (string.IsNullOrEmpty(html))
                {
                    return string.Empty;
                }

                // Scripts and styles go first so their bodies never reach the later passes.
                html = ScriptBlock.Replace(html, "");
                html = StyleBlock.Replace(html, "");
                // Comments must be removed before generic tags: the tag pattern would
                // otherwise cut a comment at its first '>' and leave the remainder behind.
                html = Comment.Replace(html, "");
                html = Tag.Replace(html, "");
                html = Entity.Replace(html, "");
                html = BlankLines.Replace(html, "");
                html = html.Replace(" ", "");
                return html;
            }
        }
    }
  • 相关阅读:
    浅谈树状数组与线段树
    BZOJ1367:[Baltic2004]sequence
    浅谈左偏树
    BZOJ4003:[JLOI2015]城池攻占
    BZOJ2809:[APIO2012]dispatching
    BZOJ1455:罗马游戏
    模拟ssh远程执行命令
    基于TCP协议的socket套接字编程
    计算机网络基础知识
    元类( 控制对象产生和控制类产生)模板
  • 原文地址:https://www.cnblogs.com/shya/p/2439443.html
Copyright © 2011-2022 走看看