zoukankan      html  css  js  c++  java
  • C#用正则表达式 获取标签的属性或值

    整理两个 在C#中,用正则表达式 获取网页源代码标签的属性或值的方法 :

    1、获取标签中的值: string str = @"<a href=""www.csdn.net"" class=""main"">CSDN</a>"; 结果:CSDN

     调用例子:string name=GetTitleContent(str,"a");

    /// <summary>
    /// Returns the text content of the first element with the given tag name found in a string.
    /// </summary>
    /// <param name="str">The markup to search.</param>
    /// <param name="title">The tag name to look for, e.g. <c>a</c>. Matched case-insensitively.</param>
    /// <returns>
    /// The text between the opening and closing tag of the first match, or an empty string when
    /// nothing matches or when either argument is null or empty. Only elements whose content
    /// contains no nested tags are matched.
    /// </returns>
    public static string GetTitleContent(string str, string title)
    {
        if (string.IsNullOrEmpty(str) || string.IsNullOrEmpty(title))
        {
            return string.Empty;
        }

        // Escape the tag name so regex metacharacters in it are taken literally. The lookahead
        // requires the name to end right there, so "a" does not anchor on "<abbr" or "<article".
        string tagName = Regex.Escape(title);
        string pattern = string.Format(@"<{0}(?=[\s/>])[^>]*?>(?<Text>[^<]*)</{0}\s*>", tagName);

        Match firstMatch = Regex.Match(str, pattern, RegexOptions.IgnoreCase);

        // Groups["Text"].Value is an empty string when the match failed.
        return firstMatch.Groups["Text"].Value;
    }

    2、获取标签中的属性: string str = @"<a href=""www.csdn.net"" class=""main"">CSDN</a>"; 获取 “href” 的结果:www.csdn.net

    调用例子:string href=GetTitleContent(str,"a","href");

    /// <summary>
    /// Returns the value of an attribute on the first element with the given tag name found in a string.
    /// </summary>
    /// <param name="str">The markup to search.</param>
    /// <param name="title">The tag name to look for, e.g. <c>a</c>. Matched case-insensitively.</param>
    /// <param name="attrib">The attribute name, e.g. <c>href</c>. Matched case-insensitively.</param>
    /// <returns>
    /// The attribute value of the first match, or an empty string when nothing matches or when any
    /// argument is null or empty. The value may be unquoted, single-quoted or double-quoted; values
    /// containing whitespace or quote characters are not matched.
    /// </returns>
    public static string GetTitleContent(string str, string title, string attrib)
    {
        if (string.IsNullOrEmpty(str) || string.IsNullOrEmpty(title) || string.IsNullOrEmpty(attrib))
        {
            return string.Empty;
        }

        // Verbatim literal: "" is one double quote, and \s / \1 reach the regex engine unchanged.
        // (['""]?) captures the optional opening quote and \1 requires the same closing quote.
        // The \s before the attribute name keeps "href" from matching inside "data-href".
        string pattern = string.Format(
            @"<{0}(?=[\s/>])[^>]*?\s{1}\s*=\s*(['""]?)(?<url>[^'""\s>]+)\1[^>]*>",
            Regex.Escape(title),
            Regex.Escape(attrib));

        Match firstMatch = Regex.Match(str, pattern, RegexOptions.IgnoreCase);

        // Groups["url"].Value is an empty string when the match failed.
        return firstMatch.Groups["url"].Value;
    }

    注:以上方法只返回字符串中第一个匹配结果。若要读取所有匹配的属性或值,可改用 Regex.Matches,并用 foreach 遍历返回的 MatchCollection。

    3、获取 <div class="brand_items"> 跟 </div> 之间的内容,<div class="brand_items"> 跟 </div> 可以重复出现

    比如 :

    <div class="cont_left">
      <div class="cont_left_items"><a href="left.php?cid=QDE=" class="ma">数码手机</a></div>
      <div class="cont_left_items"><a href="left.php?cid=QTE=" class="ma">整机电教</a></div>
      <div class="cont_left_items"><a href="left.php?cid=QjE=" class="ma">办公耗材</a></div>
      <div class="cont_left_items2"><a href="left.php?cid=RDE=" class="ma2">硬件周边</a></div>
      <div class="cont_left_items"><a href="left.php?cid=RTE=" class="ma">网络通讯</a></div>
      <div class="cont_left_items"><a href="left.php?cid=RzE=" class="ma">安防监控</a></div>
      <div class="cont_left_items"><a href="left.php?cid=QDFAMQ==" class="ma">IT综合</a></div>
      </div>
      <div class="cont_right">
      <div class="cont_right_items2"><a href="prod_list.php?cid=RDE=&sid=SDE=" class="ma3" target="_parent">摄像头</a></div>
      <div class="rg_brand_list"> <div class="brand_items"><a href="prod_list.php?bid=QDFBMUYx&cid=RDE=&sid=SDE=" target="_parent" class="ma2">罗技</a></div>
      <div class="brand_items"><a href="prod_list.php?bid=QDFCMUQx&cid=RDE=&sid=SDE=" target="_parent" class="ma2">新贵</a></div>
      <div class="brand_items"><a href="prod_list.php?bid=QDFDMUYx&cid=RDE=&sid=SDE=" target="_parent" class="ma2">清华紫光</a></div>
       
       
      <div class="brand_items"><a href="prod_list.php?bid=QDFDMUUx&cid=RDE=&sid=SDE=" target="_parent" class="ma2">达克浩思</a></div>
      <div class="brand_items"><a href="prod_list.php?bid=QDFGMUYxPzFBMQ==&cid=RDE=&sid=SDE=" target="_parent" class="ma2">ASJ奥视嘉</a></div>
      <div class="brand_items"><a href="prod_list.php?bid=RzE/MUUxRDFHMQ==&cid=RDE=&sid=SDE=" target="_parent" class="ma2">百视通</a></div>
      <div class="brand_items"><a href="prod_list.php?bid=RzE/MUgxSDE/MQ==&cid=RDE=&sid=SDE=" target="_parent" class="ma2">创新</a></div>
      <div class="brand_items"><a href="prod_list.php?bid=RzFAMT8xPzE/MQ==&cid=RDE=&sid=SDE=" target="_parent" class="ma2">海畅</a></div>
      <div class="brand_items"><a href="prod_list.php?bid=RzFAMT8xQDFEMQ==&cid=RDE=&sid=SDE=" target="_parent" class="ma2">昂达</a></div>
      <div class="brand_items"><a href="prod_list.php?bid=RzFAMT8xQjFAMQ==&cid=RDE=&sid=SDE=" target="_parent" class="ma2">网眼</a></div>
      <div class="brand_items"><a href="prod_list.php?bid=RzFBMUMxPzFAMQ==&cid=RDE=&sid=SDE=" target="_parent" class="ma2">网缘</a></div>
      <div class="brand_items"><a href="prod_list.php?bid=RzFDMUUxQjFGMQ==&cid=RDE=&sid=SDE=" target="_parent" class="ma2">黑石</a></div>
      <div class="brand_items"><a href="prod_list.php?bid=QDFBMUgx&cid=RDE=&sid=SDE=" target="_parent" class="ma2">奥尼ANC</a></div>
      <div class="brand_items"><a href="prod_list.php?bid=QDFCMUYx&cid=RDE=&sid=SDE=" target="_parent" class="ma2">鼎易</a></div>
       
      <div class="brand_items"><a href="prod_list.php?bid=RzFCMUcxQDE/MQ==&cid=RDE=&sid=SDE=" target="_parent" class="ma2">清华同方</a></div>
      </div> <div class="cont_right_items"><a href="prod_list.php?cid=RDE=&sid=QzFBMQ==" class="ma3" target="_parent">液晶显示器</a></div>
      <div class="cont_right_items"><a href="prod_list.php?cid=RDE=&sid=RDE/MQ==" class="ma3" target="_parent">主板</a></div>
      <div class="cont_right_items"><a href="prod_list.php?cid=RDE=&sid=RDFAMQ==" class="ma3" target="_parent">显卡</a></div>
      <div class="cont_right_items"><a href="prod_list.php?cid=RDE=&sid=RDFBMQ==" class="ma3" target="_parent">CPU</a></div>
       
      <div class="cont_right_items"><a href="prod_list.php?cid=RDE=&sid=RTFAMQ==" class="ma3" target="_parent">散热器</a></div>
      <div class="cont_right_items"><a href="prod_list.php?cid=RDE=&sid=RzFDMQ==" class="ma3" target="_parent">键盘</a></div>

    // Collects the inner markup of every <div class="brand_items">...</div>.
    // The lookbehind and lookahead keep the delimiters themselves out of each match.
    // Verbatim literal: "" stands for one double quote inside the pattern.
    Regex regex = new Regex(@"(?<=(<div class=""brand_items"">))[\s\S]*?(?=(</div>))", RegexOptions.IgnoreCase);

    for (Match match = regex.Match(content); match.Success; match = match.NextMatch())
    {
        // Content of one <div class="brand_items"> ... </div> element.
        string d = match.Value;
    }

    获取<div class="info_mid_left"> 跟 <div class="endarea"> 之间的内容,唯一性   <div class="info_mid_left"> 跟 <div class="endarea"> 不重复出现

    // Extracts everything between <div class="info_mid_left"> and the next <div class="endarea">.
    // Verbatim literal: "" stands for one double quote inside the pattern.
    Regex regex1 = new Regex(@"(?<=(<div class=""info_mid_left"">))[\s\S]*?(?=(<div class=""endarea"">))", RegexOptions.IgnoreCase);

    // Empty string when the delimiters are not found.
    string Pcontent = regex1.Match(content).Value;

    获取 <td class="paramFontS2" style="text-align:center; font-size:13px;">技嘉 H61M-DS2DVI </td>     td里的Text内容,td有多种不确定属性的时候

    // Matches <td ...>text</td> whatever attributes the cell carries; the cell text lands in group "Text".
    Regex regex2 = new Regex("<td[^>]*?>(?<Text>[^<]*)</td>", RegexOptions.IgnoreCase);

    // Walk every cell in order; name ends up holding the text of the last cell matched.
    foreach (Match cell in regex2.Matches(d1))
    {
        name = cell.Groups["Text"].Value;
    }

     获取input控件的value值 

    /// <summary>
    /// Finds the <c>value</c> attribute of the first <c>input</c> element whose <c>name</c>
    /// attribute equals <paramref name="inputname"/>.
    /// </summary>
    /// <param name="str">The markup to search.</param>
    /// <param name="inputname">The exact (case-sensitive) name of the input element.</param>
    /// <returns>
    /// The value of the first matching input, or an empty string when no input matches or when
    /// either argument is null. Only inputs whose <c>name</c> attribute appears before their
    /// <c>value</c> attribute are recognised.
    /// </returns>
    public string FindValueByName(string str, string inputname)
    {
        if (string.IsNullOrEmpty(str) || inputname == null)
        {
            return string.Empty;
        }

        // Verbatim literals: "" stands for one double quote inside the pattern.
        // Each attribute value may be double-quoted, single-quoted or unquoted; the three
        // alternatives share one group name so the caller reads a single group either way.
        // Quoted values may contain spaces. The \s before each attribute name keeps "name"
        // and "value" from matching inside longer names such as "data-value".
        string pattern =
            @"<input\b[^>]*?\sname\s*=\s*(?:""(?<name>[^""]*)""|'(?<name>[^']*)'|(?<name>[^\s""'>]+))" +
            @"[^>]*?\svalue\s*=\s*(?:""(?<value>[^""]*)""|'(?<value>[^']*)'|(?<value>[^\s""'>]+))[^>]*>";

        Regex inputPattern = new Regex(pattern, RegexOptions.IgnoreCase);

        for (Match match = inputPattern.Match(str); match.Success; match = match.NextMatch())
        {
            if (match.Groups["name"].Value == inputname)
            {
                return match.Groups["value"].Value;
            }
        }

        return string.Empty;
    }

    清除所有a标签:string str1 = Regex.Replace(str, @"</?a[^>]*>", "");

    清除所有 script 标签(包括 script 里面的代码):str1 = Regex.Replace(str, @"<script[^>]*>([\s\S](?!<script))*?</script>", "");

    玉环人才吧 玉环人才网 玉环人力网 玉环招聘网 三支脚人才网

    玉环人力网(又名玉环人才网)是玉环人才吧旗下一流的玉环人力资源招聘行业服务品牌,汇集海量玉环人才,开放玉环人才简历和玉环岗位,提供玉环招聘网,玉环人才市场动态行情,玉环劳务派遣,玉环培训等,是玉环最大的人才网站。

  • 相关阅读:
    Leaf-spine data center architectures
    centreon 画图x轴乱码
    二分图匹配
    牛客练习赛17
    HDU-4550-贪心
    HDU-4511-ac自动机+dp
    UVA-11761-马尔可夫/记忆化搜索
    HDU-3853-期望/dp/坑
    HDU-4405-期望dp
    zoj-3329-期望/dp/方程优化
  • 原文地址:https://www.cnblogs.com/taizhouxiaoba/p/3673037.html
Copyright © 2011-2022 走看看