zoukankan      html  css  js  c++  java
  • href= 正则表达式

    获取网页的新闻链接:

    代码
    using System;
    using System.Data;
    using System.Configuration;
    using System.Collections;
    using System.Web;
    using System.Web.Security;
    using System.Web.UI;
    using System.Web.UI.WebControls;
    using System.Web.UI.WebControls.WebParts;
    using System.Web.UI.HtmlControls;
    using System.Text.RegularExpressions;
    using System.Text;
    using System.Net;
    namespace WebApplication4
    {
        
    public partial class _Default : System.Web.UI.Page
        {
            
    /// <summary>
    /// Downloads the configured news page, optionally narrows it to the section
    /// between the configured begin/end markers, and writes every matched news
    /// link into the response inside a scrolling marquee.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Page_Load(object sender, EventArgs e)
    {
        string strNewsUrl = ConfigurationSettings.AppSettings["NewsUrl"];
        string strHTML = DownLoadHtml(strNewsUrl);

        // AppSettings yields null for a missing key, so check for null before trimming.
        string strBegin = ConfigurationSettings.AppSettings["BeginStr"];
        string strEnd = ConfigurationSettings.AppSettings["EndStr"];
        bool hasMarkers = strBegin != null && strBegin.Trim() != ""
                       && strEnd != null && strEnd.Trim() != "";

        // Search only the delimited section when both markers are configured,
        // otherwise search the whole page.
        int begin = 0;
        string strContent = hasMarkers
            ? GetHTMLContent(strHTML, strBegin, strEnd, ref begin)
            : strHTML;

        MatchCollection ms = GetUrlFromHtml(strContent);
        if (ms.Count == 0)
        {
            return;
        }

        Response.Write("<marquee onmouseover=this.stop() onmouseout=this.start() scrollDelay=110 class=a><font color=#ff0000>最新消息:</font>");

        foreach (Match m in ms)
        {
            // Group 1 is the href value and group 3 the title value of the matched anchor.
            string url = m.Groups[1].Value.Trim();
            string title = m.Groups[3].Value.Trim();
            if (url == "" || title == "")
            {
                continue;
            }

            // The captures are raw markup from a remote page: decode any entities
            // they already carry, then encode for the context they are written into,
            // so remote text can neither break out of the attribute nor be double-encoded.
            string decodedTitle = HttpUtility.HtmlDecode(title);
            string hrefAttr = HttpUtility.HtmlAttributeEncode(HttpUtility.HtmlDecode(url));
            string titleAttr = HttpUtility.HtmlAttributeEncode(decodedTitle);
            string linkText = HttpUtility.HtmlEncode(decodedTitle);

            // Attributes are double-quoted because HtmlAttributeEncode is only
            // guaranteed to escape the double quote on every framework version.
            Response.Write(string.Format(
                "<a href=\"{0}\" title=\"{2}\"  target=\"_blank\">{1}</a>&nbsp;&nbsp;&nbsp;&nbsp;",
                hrefAttr, linkText, titleAttr));
        }

        Response.Write("</marquee>");
    }
          
            
    #region 获取网页内容

            
    /// <summary>
    /// Downloads the resource at <paramref name="url"/> and decodes the bytes as UTF-8 text.
    /// </summary>
    /// <param name="url">Address to download; the same value is sent as the Referer header.</param>
    /// <returns>
    /// The decoded response body, or an empty string when <paramref name="url"/> is
    /// null/empty or the download fails.
    /// </returns>
    public static string DownLoadHtml(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return "";
        }

        try
        {
            // WebClient is disposable; release it as soon as the download is done.
            using (WebClient webclient = new WebClient())
            {
                // The default (system/IE) proxy settings are used. Assign
                // webclient.Proxy here if an explicit proxy is required.
                webclient.Headers.Add("Referer", url);

                byte[] buff = webclient.DownloadData(url);
                return Encoding.UTF8.GetString(buff);
            }
        }
        catch (WebException ex)
        {
            // Network or HTTP failure: stay best-effort, but leave a trace.
            System.Diagnostics.Trace.TraceWarning("DownLoadHtml failed for {0}: {1}", url, ex.Message);
        }
        catch (ArgumentException ex)
        {
            // Malformed address or header value.
            System.Diagnostics.Trace.TraceWarning("DownLoadHtml rejected {0}: {1}", url, ex.Message);
        }
        catch (NotSupportedException ex)
        {
            // Address format that cannot be resolved to a URI or path.
            System.Diagnostics.Trace.TraceWarning("DownLoadHtml cannot handle {0}: {1}", url, ex.Message);
        }

        return "";
    }

            
    /// <summary>
    /// Extracts the first section of <paramref name="strTarget"/>, searching from
    /// index <paramref name="begin"/>, that starts with <paramref name="strBegin"/>
    /// and ends with the next <paramref name="strEnd"/>. Both markers are included
    /// in the returned text.
    /// </summary>
    /// <param name="strTarget">Text to search.</param>
    /// <param name="strBegin">Marker that opens the section.</param>
    /// <param name="strEnd">Marker that closes the section.</param>
    /// <param name="begin">
    /// On entry, the index to start searching from. On exit, the index just past the
    /// returned section, or -1 when no section was found.
    /// </param>
    /// <returns>The section including both markers, or an empty string when none is found.</returns>
    public static string GetHTMLContent(string strTarget, string strBegin, string strEnd, ref int begin)
    {
        // A null argument or an out-of-range start index (including the -1 left by a
        // previous unsuccessful call) means "nothing to find" rather than an exception.
        if (strTarget == null || strBegin == null || strEnd == null
            || begin < 0 || begin > strTarget.Length)
        {
            begin = -1;
            return "";
        }

        // Markers are markup fragments, so compare ordinally instead of by culture.
        int posBegin = strTarget.IndexOf(strBegin, begin, StringComparison.Ordinal);
        if (posBegin != -1)
        {
            int posEnd = strTarget.IndexOf(strEnd, posBegin + strBegin.Length, StringComparison.Ordinal);
            if (posEnd > posBegin)
            {
                int sectionEnd = posEnd + strEnd.Length;
                begin = sectionEnd;
                return strTarget.Substring(posBegin, sectionEnd - posBegin);
            }
        }

        begin = -1;
        return "";
    }

            
    /// <summary>
    /// Filters special characters: removes line breaks, tabs, spaces, single and
    /// double quotes, and the literal text <c>&amp;nbsp;</c> from <paramref name="str"/>.
    /// </summary>
    /// <param name="str">Text to clean; null or empty input is returned unchanged.</param>
    /// <returns>The text with every listed token removed.</returns>
    public static string RepalceStr(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return str;
        }

        // Tokens are removed in this exact order, one after another.
        // NOTE(review): the third token was unreadable in the original listing (an
        // empty search string, which String.Replace rejects); a single space is
        // assumed here - confirm against the original source.
        string[] unwanted = { "\r\n", "\"", " ", "\t", "&nbsp;", "'", "\r", "\n" };
        foreach (string token in unwanted)
        {
            str = str.Replace(token, "");
        }

        return str;
    }

            
    // Matches anchors written exactly as
    //   <a href="..." target="..." title="..."><span class="...">...</span></a>
    // Capture groups: 1 = href, 2 = target, 3 = title, 4 = span class, 5 = link text.
    // Built once and reused instead of being re-parsed on every call.
    private static readonly Regex NewsLinkRegex = new Regex(
        "<a[\\s]+href[\\s]*=[\\s]*\"([^<\"]+)\" target=\"([^<\"]+)\" title=\"([^<\"]+)\"><span class=\"([^<\"]+)\">([^<\"]+)</span></a>",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Finds every news anchor in <paramref name="strContent"/> that has the
    /// href/target/title attributes, in that order, wrapping a single span.
    /// </summary>
    /// <param name="strContent">HTML to scan; null is treated as empty.</param>
    /// <returns>
    /// The matches, case-insensitive. In each match group 1 is the href, group 2 the
    /// target, group 3 the title, group 4 the span class and group 5 the link text.
    /// </returns>
    public static MatchCollection GetUrlFromHtml(string strContent)
    {
        return NewsLinkRegex.Matches(strContent ?? "");
    }

            
    /// <summary>
    /// Returns the relative address of the news page when a news source is configured.
    /// </summary>
    /// <returns>
    /// An empty string when the "NewsUrl" app setting is missing or empty;
    /// otherwise "../desktop/getnewsUrl.aspx".
    /// </returns>
    public static string GetNewsUrl()
    {
        // AppSettings yields null for a missing key; treat that the same as an empty value.
        string newsUrl = ConfigurationSettings.AppSettings["NewsUrl"];
        if (string.IsNullOrEmpty(newsUrl))
        {
            return "";
        }

        return "../desktop/getnewsUrl.aspx";
    }
            
    #endregion
        }
    }

    web.config源码:

    代码
        <appSettings>
            
    <!-- Address of the page the news links are fetched from; read by Page_Load and GetNewsUrl. -->
            
    <add key="NewsUrl" value="http://jyj.cixi.gov.cn/"/>
            
    <!-- Start marker of the page section that is searched for links. Page_Load searches the whole page when either marker is blank. -->
    <add key="BeginStr" value="var teshu_shu = 1;"/>
            
    <!-- End marker of the page section that is searched for links. -->
    <add key="EndStr" value="/script"/>
            
    <!-- No value is given for this key, and the code-behind shown with this config does not read it. -->
    <add key="expressMatch"/>
           
    </appSettings>
  • 相关阅读:
    修复TabControl在Binding情况下Canvas被复用的问题
    避免缓加载时因违反惯例导致的空引用!
    乱说一气
    WPF中的数据验证
    [zz]GPU architecture
    [zz]DirectX 11 and Shared Model 5.0
    网页栅格系统中的最佳宽度:960px
    复习html标签及其属性
    去除链接虚线边框css
    使用jquery解决IE6不兼容的伪类
  • 原文地址:https://www.cnblogs.com/zhangzt/p/1884398.html
Copyright © 2011-2022 走看看