zoukankan      html  css  js  c++  java
  • 正则表达式相关:C# 抓取网页类(获取网页中所有信息)

    类的代码:

      1 using System;  
      2 using System.Data;  
      3 using System.Configuration;  
      4 using System.Net;  
      5 using System.IO;  
      6 using System.Text;  
      7 using System.Collections.Generic;  
      8 using System.Text.RegularExpressions;  
      9 using System.Threading;  
     10 using System.Web;  
     11 using System.Web.UI.MobileControls;  
     /// <summary>
     /// Downloads a single web page and exposes its title, raw HTML, plain text and links.
     /// </summary>
     /// <remarks>
     /// The download runs inside the constructor and is best-effort: no failure is thrown to
     /// the caller. When anything goes wrong <see cref="IsGood"/> is <c>false</c>, the content
     /// properties are empty and <see cref="Error"/> holds the exception (if there was one).
     /// <see cref="Title"/>, <see cref="Links"/> and <see cref="Context"/> are computed on first
     /// use and cached. Only the shared per-host cookie table is locked; instances themselves
     /// are not synchronized.
     /// </remarks>
     public class WebPage
     {
         #region Constants and shared state

         /// <summary>Largest response body, in bytes (4 MB), that will be downloaded.</summary>
         private const int MaxPageBytes = 1 << 22;

         /// <summary>Request timeout in milliseconds.</summary>
         private const int RequestTimeoutMs = 10000;

         /// <summary>Path suffixes that are rejected without issuing a request.</summary>
         private static readonly string[] SkippedExtensions = new string[] { ".rar", ".dat", ".msi" };

         // [\s\S] matches any character including newlines, so no Singleline option is needed.
         private static readonly Regex TitleRegex = new Regex(@"<title[^>]*>(?<title>[\s\S]*?)</title[^>]*>", RegexOptions.IgnoreCase);

         // Anchors with a double-quoted href; other attributes may precede or follow href and
         // the link text may contain nested tags (they are stripped by LinkTextNoiseRegex).
         private static readonly Regex AnchorRegex = new Regex(@"<a\s+(?:[^>]*?\s)?href\s*=\s*""(?<url>[^""]*)""[^>]*>(?<text>[\s\S]*?)</a>", RegexOptions.IgnoreCase);

         // <frame>/<iframe> elements; src may be double-quoted, single-quoted or bare.
         private static readonly Regex FrameRegex = new Regex(@"<i?frame[^><]+src=(""|')?(?<url>([^>""'\s)])+)(""|')?[^>]*>", RegexOptions.IgnoreCase);

         // Removed from link text: tags, all whitespace, "&nbsp;", stray ampersands and quotes.
         private static readonly Regex LinkTextNoiseRegex = new Regex(@"(<[^>]+>)|\s|&nbsp;|&|""", RegexOptions.IgnoreCase);

         private static readonly Regex ScriptRegex = new Regex(@"<script[^>]*>[\s\S]*?</script[^>]*>", RegexOptions.IgnoreCase);
         private static readonly Regex StyleRegex = new Regex(@"<style[^>]*>[\s\S]*?</style[^>]*>", RegexOptions.IgnoreCase);
         private static readonly Regex SelectRegex = new Regex(@"<select[^>]*>[\s\S]*?</select[^>]*>", RegexOptions.IgnoreCase);

         // \b keeps <abbr>, <article>, ... from being treated as anchors.
         private static readonly Regex AnchorElementRegex = new Regex(@"<a\b[^>]*>[\s\S]*?</a[^>]*>", RegexOptions.IgnoreCase);
         private static readonly Regex TagOrNbspRegex = new Regex(@"(<[^>]+?>)|&nbsp;", RegexOptions.IgnoreCase);
         private static readonly Regex WhitespaceRunRegex = new Regex(@"\s+");

         // Used for both the Content-Type header and <meta ... charset=...> declarations.
         private static readonly Regex CharsetRegex = new Regex(@"charset\s*=\s*[""']?(?<cding>[\w.:-]+)", RegexOptions.IgnoreCase);

         /// <summary>Cookie containers shared by all pages, keyed by host name. Guarded by a lock on itself.</summary>
         private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();

         #endregion

         #region Private members

         private Uri m_uri;                                        // final URL (after redirects); null if the URL could not be parsed
         private readonly List<Link> m_links = new List<Link>();   // links found on the page
         private bool m_linksParsed;                               // true once m_links has been filled, even if it stayed empty
         private string m_title;                                   // cached title; null until first computed
         private string m_html = "";                               // raw HTML
         private string m_outstr;                                  // cached plain text; null until first computed
         private bool m_good;                                      // true only after a successful download
         private int m_pagesize;                                   // number of characters in m_html
         private Exception m_error;                                // failure that aborted the download, if any

         #endregion

         #region Properties

         /// <summary>
         /// Absolute URL of the page (after redirects), or an empty string when the URL passed
         /// to the constructor could not be parsed. Read-only.
         /// </summary>
         public string URL
         {
             get
             {
                 return m_uri == null ? string.Empty : m_uri.AbsoluteUri;
             }
         }

         /// <summary>
         /// Trimmed contents of the page's &lt;title&gt; element, or an empty string when there is none. Read-only.
         /// </summary>
         public string Title
         {
             get
             {
                 if (m_title == null)
                 {
                     Match match = TitleRegex.Match(m_html);
                     m_title = match.Success ? match.Groups["title"].Value.Trim() : string.Empty;
                 }
                 return m_title;
             }
         }

         /// <summary>
         /// Raw HTML of the page, tags included. Never null; empty when the download failed.
         /// </summary>
         public string M_html
         {
             get
             {
                 return m_html ?? string.Empty;
             }
         }

         /// <summary>
         /// All links found on the page (anchors first, then frames and iframes). Read-only property;
         /// the returned list is the instance's own cache.
         /// </summary>
         public List<Link> Links
         {
             get
             {
                 return getLinks();
             }
         }

         /// <summary>
         /// Entire plain text of the page: script, style and select elements and all tags removed,
         /// runs of whitespace collapsed to a single space. Read-only.
         /// </summary>
         public string Context
         {
             get
             {
                 return getContext(int.MaxValue);
             }
         }

         /// <summary>
         /// Size of the page, measured as the number of characters in the decoded HTML.
         /// </summary>
         public int PageSize
         {
             get
             {
                 return m_pagesize;
             }
         }

         /// <summary>
         /// All links whose URL points at the same host as this page, over http or https.
         /// </summary>
         public List<Link> InsiteLinks
         {
             get
             {
                 if (m_uri == null)
                 {
                     return new List<Link>();
                 }

                 // Escape the host so its dots are literal, and require a delimiter after it so
                 // that "example.com" does not also match "example.com.other.net".
                 string pattern = "^https?://" + Regex.Escape(m_uri.Host) + "(?:[:/?#]|$)";
                 return getSpecialLinksByUrl(pattern, int.MaxValue);
             }
         }

         /// <summary>
         /// True when the page was downloaded and decoded successfully.
         /// </summary>
         public bool IsGood
         {
             get
             {
                 return m_good;
             }
         }

         /// <summary>
         /// Host name of the page, or an empty string when the URL could not be parsed.
         /// </summary>
         public string Host
         {
             get
             {
                 return m_uri == null ? string.Empty : m_uri.Host;
             }
         }

         /// <summary>
         /// Exception that aborted the download, or null. Also null when the page was rejected
         /// without an exception (skipped extension, non-text content type, body too large).
         /// </summary>
         public Exception Error
         {
             get
             {
                 return m_error;
             }
         }

         #endregion

         #region Parsing helpers

         /// <summary>
         /// Extracts the links from the HTML on first call and returns the cached list afterwards.
         /// </summary>
         /// <returns>The list of links; empty when the page has none.</returns>
         private List<Link> getLinks()
         {
             if (m_linksParsed)
             {
                 return m_links;
             }
             m_linksParsed = true;

             foreach (Match anchor in AnchorRegex.Matches(m_html))
             {
                 string text = LinkTextNoiseRegex.Replace(anchor.Groups["text"].Value, "");
                 AddLink(anchor.Groups["url"].Value, text);
             }

             // Frames carry no link text.
             foreach (Match frame in FrameRegex.Matches(m_html))
             {
                 AddLink(frame.Groups["url"].Value, "");
             }

             return m_links;
         }

         /// <summary>
         /// Resolves <paramref name="href"/> against the page URL and appends it to the link list.
         /// Values that cannot be turned into an absolute URI are skipped.
         /// </summary>
         /// <param name="href">Attribute value exactly as it appears in the HTML.</param>
         /// <param name="text">Cleaned link text.</param>
         private void AddLink(string href, string text)
         {
             // Attribute values are HTML-encoded ("&amp;" etc.), so decode before resolving.
             Uri absolute;
             if (!Uri.TryCreate(m_uri, HttpUtility.HtmlDecode(href), out absolute))
             {
                 return;
             }

             Link link = new Link();
             link.Text = text;
             link.NavigateUrl = HttpUtility.UrlDecode(absolute.AbsoluteUri);
             m_links.Add(link);
         }

         /// <summary>
         /// Reduces HTML to plain text.
         /// </summary>
         /// <param name="html">HTML source.</param>
         /// <param name="withLink">False to drop anchor elements together with their text.</param>
         /// <returns>Text with tags and "&amp;nbsp;" removed and whitespace runs collapsed to one space.</returns>
         private static string ExtractText(string html, bool withLink)
         {
             string text = ScriptRegex.Replace(html, "");
             text = StyleRegex.Replace(text, "");
             text = SelectRegex.Replace(text, "");
             if (!withLink)
             {
                 text = AnchorElementRegex.Replace(text, "");
             }
             text = TagOrNbspRegex.Replace(text, "");
             return WhitespaceRunRegex.Replace(text, " ");
         }

         /// <summary>
         /// Returns up to <paramref name="count"/> links whose URL or text matches <paramref name="pattern"/>.
         /// </summary>
         /// <param name="pattern">Regular expression, applied case-insensitively.</param>
         /// <param name="count">Maximum number of links to return.</param>
         /// <param name="byText">True to test the link text, false to test the link URL.</param>
         private List<Link> FilterLinks(string pattern, int count, bool byText)
         {
             // Built once rather than once per link.
             Regex filter = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
             List<Link> selected = new List<Link>();

             foreach (Link link in getLinks())
             {
                 if (selected.Count >= count)
                 {
                     break;
                 }

                 string candidate = byText ? link.Text : link.NavigateUrl;
                 if (candidate != null && filter.IsMatch(candidate))
                 {
                     selected.Add(link);
                 }
             }
             return selected;
         }

         #endregion

         #region Public methods

         /// <summary>
         /// Returns the first <paramref name="firstN"/> characters of the page's plain text,
         /// link text included.
         /// </summary>
         /// <param name="firstN">Maximum number of characters to return.</param>
         /// <returns>The plain text, truncated to <paramref name="firstN"/> characters if longer.</returns>
         /// <exception cref="ArgumentOutOfRangeException"><paramref name="firstN"/> is negative.</exception>
         public string getContext(int firstN)
         {
             if (firstN < 0)
             {
                 throw new ArgumentOutOfRangeException("firstN");
             }

             if (m_outstr == null)
             {
                 m_outstr = ExtractText(m_html, true);
             }
             return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
         }

         /// <summary>
         /// Returns up to <paramref name="count"/> of this page's links whose URL matches a regular expression.
         /// </summary>
         /// <param name="pattern">Regular expression, applied case-insensitively to each link URL.</param>
         /// <param name="count">Maximum number of links to return.</param>
         /// <returns>The matching links, in document order.</returns>
         /// <exception cref="ArgumentException"><paramref name="pattern"/> is null or not a valid regular expression.</exception>
         public List<Link> getSpecialLinksByUrl(string pattern, int count)
         {
             return FilterLinks(pattern, count, false);
         }

         /// <summary>
         /// Returns up to <paramref name="count"/> of this page's links whose text matches a regular expression.
         /// </summary>
         /// <param name="pattern">Regular expression, applied case-insensitively to each link text.</param>
         /// <param name="count">Maximum number of links to return.</param>
         /// <returns>The matching links, in document order.</returns>
         /// <exception cref="ArgumentException"><paramref name="pattern"/> is null or not a valid regular expression.</exception>
         public List<Link> getSpecialLinksByText(string pattern, int count)
         {
             return FilterLinks(pattern, count, true);
         }

         /// <summary>
         /// Searches the page's plain text for a regular expression and returns its first capture group.
         /// </summary>
         /// <param name="pattern">
         /// Regular expression, applied case-insensitively. It must contain a capture group:
         /// the value of group 1 is what gets returned, not the whole match.
         /// </param>
         /// <returns>The text captured by group 1 of the first match, or an empty string.</returns>
         /// <exception cref="ArgumentException"><paramref name="pattern"/> is null or not a valid regular expression.</exception>
         public string getSpecialWords(string pattern)
         {
             Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
             Match match = regex.Match(getContext(int.MaxValue));
             return match.Success ? match.Groups[1].Value : string.Empty;
         }

         #endregion

         #region Construction

         /// <summary>
         /// Downloads <paramref name="_url"/> and fills in the page state. Never throws; any
         /// failure is recorded in <see cref="Error"/> and leaves <see cref="IsGood"/> false.
         /// </summary>
         /// <param name="_url">Absolute URL of the page.</param>
         private void Init(string _url)
         {
             m_good = false;
             try
             {
                 m_uri = new Uri(_url);
                 if (HasSkippedExtension(m_uri))
                 {
                     return;
                 }

                 HttpWebRequest request = (HttpWebRequest)WebRequest.Create(m_uri);
                 request.AllowAutoRedirect = true;
                 request.MaximumAutomaticRedirections = 3;
                 request.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
                 request.KeepAlive = true;
                 request.Timeout = RequestTimeoutMs;
                 request.CookieContainer = GetCookieContainer(m_uri.Host);

                 using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                 {
                     string contentType = response.ContentType ?? string.Empty;
                     if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase)
                         || response.ContentLength > MaxPageBytes)
                     {
                         return;
                     }

                     byte[] body;
                     using (Stream stream = response.GetResponseStream())
                     {
                         body = ReadBody(stream);
                     }
                     if (body == null)
                     {
                         // Body exceeded MaxPageBytes (Content-Length was absent or wrong).
                         return;
                     }

                     // Prefer the charset from the HTTP header, then one declared in the
                     // markup, then the system default. The markup is sniffed through an
                     // ASCII view of the raw bytes so the real decode happens exactly once.
                     Encoding encoding = FindEncoding(contentType);
                     if (encoding == null)
                     {
                         encoding = FindEncoding(Encoding.ASCII.GetString(body));
                     }
                     if (encoding == null)
                     {
                         encoding = Encoding.Default;
                     }

                     // StreamReader still honours a byte-order mark over the chosen encoding.
                     using (StreamReader reader = new StreamReader(new MemoryStream(body), encoding))
                     {
                         m_html = reader.ReadToEnd();
                     }

                     m_pagesize = m_html.Length;
                     m_uri = response.ResponseUri;
                     m_good = true;
                 }
             }
             catch (Exception ex)
             {
                 // Deliberately broad: construction is best-effort. The failure is kept for
                 // callers to inspect instead of being discarded.
                 m_error = ex;
                 m_good = false;
             }
         }

         /// <summary>True when the URL path ends in one of <see cref="SkippedExtensions"/>.</summary>
         private static bool HasSkippedExtension(Uri uri)
         {
             string path = uri.AbsolutePath;
             foreach (string extension in SkippedExtensions)
             {
                 if (path.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
                 {
                     return true;
                 }
             }
             return false;
         }

         /// <summary>Returns the cookie container shared by all pages of <paramref name="host"/>, creating it on first use.</summary>
         private static CookieContainer GetCookieContainer(string host)
         {
             lock (webcookies)
             {
                 CookieContainer container;
                 if (!webcookies.TryGetValue(host, out container))
                 {
                     container = new CookieContainer();
                     webcookies[host] = container;
                 }
                 return container;
             }
         }

         /// <summary>
         /// Reads the whole stream into memory.
         /// </summary>
         /// <returns>The bytes read, or null when the stream holds more than <see cref="MaxPageBytes"/> bytes.</returns>
         private static byte[] ReadBody(Stream stream)
         {
             using (MemoryStream buffer = new MemoryStream())
             {
                 byte[] chunk = new byte[8192];
                 int read;
                 while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
                 {
                     if (buffer.Length + read > MaxPageBytes)
                     {
                         return null;
                     }
                     buffer.Write(chunk, 0, read);
                 }
                 return buffer.ToArray();
             }
         }

         /// <summary>
         /// Looks for a "charset=name" declaration in <paramref name="text"/>.
         /// </summary>
         /// <returns>The named encoding, or null when none is declared or the name is not supported.</returns>
         private static Encoding FindEncoding(string text)
         {
             if (string.IsNullOrEmpty(text))
             {
                 return null;
             }

             Match match = CharsetRegex.Match(text);
             if (!match.Success)
             {
                 return null;
             }

             try
             {
                 return Encoding.GetEncoding(match.Groups["cding"].Value);
             }
             catch (ArgumentException)
             {
                 return null;
             }
             catch (NotSupportedException)
             {
                 return null;
             }
         }

         /// <summary>
         /// Creates the page and downloads it immediately.
         /// </summary>
         /// <param name="_url">
         /// Absolute URL of the page. It is percent-unescaped before use; if unescaping fails
         /// the value is used as given.
         /// </param>
         public WebPage(string _url)
         {
             try
             {
                 _url = Uri.UnescapeDataString(_url);
             }
             catch (ArgumentNullException)
             {
                 // Init records the failure for a null URL.
             }
             catch (UriFormatException)
             {
                 // Keep the URL as given.
             }
             Init(_url);
         }

         #endregion
     }

    调用:

    WebPage webInfo = new WebPage("http://hovertree.net/");

    // Plain text of the page, with every HTML tag removed.
    string text = webInfo.Context;

    // Raw HTML of the page, tags included.
    string html = webInfo.M_html;

     转自:http://hovertree.com/h/bjaf/jhvb7drd.htm

    推荐:http://www.cnblogs.com/roucheng/p/3521864.html

  • 相关阅读:
    iOS 关于使用xib创建cell的两种初始化方式
    KVO的初级使用
    通知的初级使用
    C语言的变量 常量
    C语言的编译 链接
    1 hello word
    java 中 == 与 equals引出的字符串比较
    02PSP0级及登陆界面开发
    00软工课程引言
    06动手动脑
  • 原文地址:https://www.cnblogs.com/roucheng/p/csfetch.html
Copyright © 2011-2022 走看看