zoukankan      html  css  js  c++  java
  • 用正则表达式抓取网页中的ul 和 li标签中最终的值!

                获取你要抓取的页面

                const string URL = "http://www.hn3ddf.gov.cn/price/GetList.html?pageno=1";
                string htmlStr = null;
                for (int i = 0; i < 10; i++)
                {
                    try
                    {
                        System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(URL);
                        request.Headers.Set("Pragma", "no-cache");
                        request.Timeout = 10000 + (i * 5000);
                        System.Net.HttpWebResponse response = (System.Net.HttpWebResponse)request.GetResponse();
                        System.IO.Stream streamReceive = response.GetResponseStream();
                        System.IO.StreamReader streamReader = new System.IO.StreamReader(streamReceive, Encoding.GetEncoding("utf-8"));
                        htmlStr = streamReader.ReadToEnd();
                        break;
                    }
                    catch (Exception e)
                    {
                        //----------------抓取异常!!
                    }
                }

    // Extract every price-row <ul> element from the downloaded page, then read the text of the <li> items inside each one.

               MatchCollection priceList = Regex.Matches(htmlStr, @"<ul style=""font-size:12px;320px; margin:0; padding:0;"">(.*?)</ul>", RegexOptions.Singleline);
                StringBuilder resultStr = new StringBuilder();
                for (int i = 0; i < priceList.Count; i++)
                {
                    try
                    {
                          //<ul style="font-size:12px;320px; margin:0; padding:0;">
                          //  <li style="color:#555555; float:left; display:block; 140px; height:22px; line-height:22px;" align="center">铔嬮浮閰嶅悎楗叉枡</li>
                          //  <li align="center" style="color:#555555; float:left; display:block; 100px; height:22px; line-height:22px;">2.83鍏?鍗冨厠</li>
                          //  <li style="color:#555555; float:left; display:block; 50px;text-align:center; height:22px; line-height:22px;">05-21</li>
                          //</ul>

                        //List<string> list = new List<string>();   //放结果的泛型集合
                        //string splitStr = "</li>";
                        //string[] strArray = priceList[i].Value.Split(splitStr.ToArray());    //一组一组的li标签
                        //foreach (string item in strArray)
                        //{
                        //    int first = item.IndexOf('>');
                        //    int last = item.IndexOf("</li>");
                        //    list.Add(item.Substring(first, last - first));
                        //    //list.add(item.substring(item.indexof(">")));
                        //}
                        //MatchCollection items = Regex.Matches(htmlStr, @"<li.*(?=>)(.| )*?</li>");

                        resultStr.Append("<tr>");

                         //<li style="color:#555555; float:left; display:block; 140px; height:22px; line-height:22px;" align="center">蛋鸡配合饲料</li>

                        //<ul style="font-size:12px;320px; margin:0; padding:0;">
                        //    <li style="color:#555555; float:left; display:block; 140px; height:22px; line-height:22px;" align="center">蛋鸡配合饲料</li>
                        //    <li align="center" style="color:#555555; float:left; display:block; 100px; height:22px; line-height:22px;">2.83元/千克</li>
                        //    <li style="color:#555555; float:left; display:block; 50px;text-align:center; height:22px; line-height:22px;">05-21</li>
                        //</ul>
                        string priceItem = priceList[i].Value;
                        //string name = Regex.Match(priceItem, @"<li style=""color:#555555; float:left; display:block; 140px; height:22px; line-height:22px;"" align=""center"">(.*?)</li>").Value;
    //配备<开头的在抓取的网页中的li标签中的所有属性进行配备为真的一行结果包含:样式和值
                        Match TitleMatch = Regex.Match(priceItem, @"<li style=""color:#555555; float:left; display:block; 140px; height:22px; line-height:22px;"" align=""center"">([^<]*)</li>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
           //取上面一行中的只有属性的值Value.Groups[1],1 代表Regex.Match方法得到的Groups的索引是从1开始的,而不是从0开始的
                        string name = TitleMatch.Groups[1].Value;

                        //"color:#555555; float:left; display:block; 140px; height:22px; line-height:22px;" align="center">铔嬮浮閰嶅悎楗叉枡
                        //name = name.Substring(10, name.Length - 15);
                        //name = name.Substring(113, name.Length - 118);

                        //string price = Regex.Match(priceItem, @"<li align=""center"" style=""color:#555555; float:left; display:block; 100px; height:22px; line-height:22px;"">(.*?)</li>").Value;
                        //price = price.Substring(13, price.Length - 18);
                        //price = price.Substring(115, price.Length -120);
                        Match priceMatch = Regex.Match(priceItem, @"<li align=""center"" style=""color:#555555; float:left; display:block; 100px; height:22px; line-height:22px;"">([^<]*)</li>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
                        string price = priceMatch.Groups[1].Value;
    //                    string weeks = Regex.Match(priceItem, @"<li style=""color:#555555; float:left; display:block; 50px;text-align:center; height:22px; line-height:22px;"">(.*?)</li>
    //").Value;
    //                    //weeks = weeks.Substring(9, weeks.Length - 16);
    //                    weeks = weeks.Substring(116, weeks.Length - 122);

                        Match weeksMatch = Regex.Match(priceItem, @"<li style=""color:#555555; float:left; display:block; 50px;text-align:center; height:22px; line-height:22px;"">([^<]*)</li>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
                        string weeks = weeksMatch.Groups[1].Value;
                        resultStr.Append("<td width="195" height="25" align="left">" + name + "</td><td width="70" height="25" align="center" style="text-align:right;">" + price + "</td><td height="25" align="center" style="color:#55a8ea;">" + weeks + "</td>");
                        resultStr.Append("</tr>");
                        #region 原来的
                        //resultStr.Append("<tr>");
                        //string priceItem = priceList[i].Value;
                        //string name = Regex.Match(priceItem, "width=125>.*?</td>").Value;
                        //name = name.Substring(10, name.Length - 15);
                        //string price = Regex.Match(priceItem, "<td width=50.*?</td>").Value;
                        //price = price.Substring(13, price.Length - 18);
                        //string weeks = Regex.Match(priceItem, "class=en>.*?</font>").Value;
                        //weeks = weeks.Substring(9, weeks.Length - 16);
                        //resultStr.Append("<td width="195" height="25" align="left">" + name + "</td><td width="70" height="25" align="center">" + price + "</td><td height="25" align="center" style="color:#55a8ea;">" + weeks + "</td>");
                        //resultStr.Append("</tr>");
                        #endregion
                    }
                    catch (Exception ex)
                    {
                        //Common.Log4netUtil.Log().Error("获取跨域数据错误." + ex.Message);
                    }
                }

                return resultStr.ToString();
     
     
  • 相关阅读:
    异步、+回调机制、线程queue、线程Event、协程、单线程实现遇到IO切换
    GIL、进/线程池、同/异步、阻/非阻塞
    锁——死锁——单个锁锁死
    网络编程之多线程
    后台Response和异常和日志封装、跨域问题及解决、es6的箭头函数、xadmin后台管理
    pip换源、虚拟环境搭建、
    非对称加密和对称加密的区别
    JWT、多方式登录、django缓存
    自定制频率、自动生成接口文档、JWT、自定制auth认证类
    books系列表接口、表断关联、分页器、根据IP限制频率
  • 原文地址:https://www.cnblogs.com/qiankundai/p/3794014.html
Copyright © 2011-2022 走看看