zoukankan      html  css  js  c++  java
  • 正则 挖网站表格复习

    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Web;
    using System.Web.UI;
    using System.Web.UI.WebControls;
    using System.Xml;
    namespace WebApplication19
    {
        /// <summary>
        /// Selects which kind of table cell to extract from a row.
        /// The member names are intentionally lowercase: callers use
        /// <c>ToString()</c> on the value to obtain the literal HTML tag name.
        /// </summary>
        public enum SearchRange
        {
            /// <summary>Header cells (the <c>th</c> tag).</summary>
            th = 0,

            /// <summary>Data cells (the <c>td</c> tag).</summary>
            td = 1
        }
        /// <summary>
        /// Code-behind for a page that, on load, downloads a fixed bankofchina.com
        /// search page and uses regular expressions to cut out the first table inside
        /// the <c>BOC_main publish</c> div, splitting it into rows and cells.
        /// </summary>
        public partial class WebForm1 : System.Web.UI.Page
        {
            // Page that is downloaded and parsed.
            private const string SourceUrl = "http://srh.bankofchina.com/search/whpj/search.jsp?erectDate=2001-11-01&nothing=2016-11-04&pjname=1316&page=4";

            // Each pattern exposes the inner HTML of the element in the named group "content".
            // [\s\S] is used instead of '.' so that the content may span several lines.
            private const string DivPattern = @"<div\b[^>]*\bclass=""BOC_main publish""[^>]*>(?<content>[\s\S]*?)</div>";
            private const string TablePattern = @"<table\b[^>]*>(?<content>[\s\S]*?)</table>";
            private const string RowPattern = @"<tr\b[^>]*>(?<content>[\s\S]*?)</tr>";

            /// <summary>
            /// Value returned by <see cref="getHtml"/>. It is not assigned anywhere in
            /// this part of the class, so it stays null unless set elsewhere.
            /// </summary>
            public string MKT;

            /// <summary>
            /// Downloads <see cref="SourceUrl"/>, extracts the target div, its first
            /// table and that table's rows, then splits the first row into header
            /// cells and every following row into data cells.
            /// </summary>
            /// <returns>The current value of <see cref="MKT"/>.</returns>
            /// <remarks>
            /// Download failures are logged through <see cref="System.Diagnostics.Trace"/>
            /// instead of being thrown, so the page keeps loading.
            /// </remarks>
            private string getHtml()
            {
                try
                {
                    string content;
                    using (WebClient wc = new WebClient())
                    using (Stream stream = wc.OpenRead(SourceUrl))
                    using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
                    {
                        content = sr.ReadToEnd();
                    }

                    // Narrow the document step by step: div -> first table -> rows.
                    string divContent = ExtractFirst(content, DivPattern);
                    string tableContent = ExtractFirst(divContent, TablePattern);

                    List<string> trList = new List<string>();
                    foreach (Match rowMatch in Regex.Matches(tableContent, RowPattern))
                    {
                        trList.Add(rowMatch.Groups["content"].Value);
                    }

                    if (trList.Count == 0)
                    {
                        // Nothing to split; previously this case ran into an index error
                        // that was silently swallowed.
                        System.Diagnostics.Trace.TraceWarning("getHtml: no table rows found at {0}", SourceUrl);
                        return MKT;
                    }

                    // trList[0] is the header row; every later row is a content row.
                    // NOTE(review): the parsed cells are not consumed anywhere in the visible code.
                    List<string> thList = GET_TH_OR_TD_LIST(SearchRange.th, trList[0]);
                    List<List<string>> tdsList = new List<List<string>>();
                    for (int i = 1; i < trList.Count; i++)
                    {
                        tdsList.Add(GET_TH_OR_TD_LIST(SearchRange.td, trList[i]));
                    }
                }
                catch (Exception ex) when (ex is WebException || ex is IOException)
                {
                    // Keep the best-effort behaviour, but leave a trace of what went wrong.
                    System.Diagnostics.Trace.TraceError("getHtml: failed to download or read {0}: {1}", SourceUrl, ex);
                }
                return MKT;
            }

            /// <summary>
            /// Returns the "content" group of the first match of <paramref name="pattern"/>
            /// in <paramref name="input"/>, or an empty string when there is no match.
            /// </summary>
            private static string ExtractFirst(string input, string pattern)
            {
                if (string.IsNullOrEmpty(input))
                {
                    return string.Empty;
                }

                Match match = Regex.Match(input, pattern);
                return match.Success ? match.Groups["content"].Value : string.Empty;
            }

            /// <summary>
            /// Extracts the inner HTML of every header or data cell found in a table row.
            /// </summary>
            /// <param name="range">Which cell tag to look for; its name is used as the tag name.</param>
            /// <param name="row">Inner HTML of a single table row.</param>
            /// <returns>The cell contents in document order; empty when none are found.</returns>
            private List<string> GET_TH_OR_TD_LIST(SearchRange range,string row)
            {
                List<string> contentList = new List<string>();
                if (string.IsNullOrEmpty(row))
                {
                    return contentList;
                }

                string tmp = range.ToString();
                // \b keeps "th" from also matching the opening of a <thead> tag.
                string tdPatern = $@"<{tmp}\b[^>]*>(?<tdCell>[\s\S]*?)</{tmp}>";
                foreach (Match match in Regex.Matches(row, tdPatern))
                {
                    contentList.Add(match.Groups["tdCell"].Value);
                }
                return contentList;
            }

            /// <summary>
            /// Page load handler: runs the download-and-parse routine on every request.
            /// </summary>
            protected void Page_Load(object sender, EventArgs e)
            {
                getHtml();
            }
        }
    }
  • 相关阅读:
    Java:Socket通信
    菜鸟玩云计算之十八:Hadoop 2.5.0 HA 集群安装第1章
    tolua reference
    严格符合CommonJS规范的包特性
    C++第11周(春)项目3
    Android动态逆向分析工具ZjDroid--脱壳神器
    报文格式【定长报文】
    OC3大回调模式使用总结(三)block回调
    Qt creator 编译错误 :cannot find file .pro qt
    OpenCV【2】---读取png图片显示到QT label上的问题
  • 原文地址:https://www.cnblogs.com/kexb/p/6035938.html
Copyright © 2011-2022 走看看