zoukankan      html  css  js  c++  java
  • 正则 挖网站表格复习

    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Web;
    using System.Web.UI;
    using System.Web.UI.WebControls;
    using System.Xml;
    namespace WebApplication19
    {
        /// <summary>
        /// Selects which kind of table cell to look for inside a row.
        /// The member names are deliberately lowercase: the cell extractor uses
        /// <c>ToString()</c> on the value as the HTML tag name, so renaming a
        /// member would change the generated pattern.
        /// </summary>
        public enum SearchRange
        {
            /// <summary>Header cells (<c>&lt;th&gt;</c>).</summary>
            th = 0,

            /// <summary>Data cells (<c>&lt;td&gt;</c>).</summary>
            td = 1,
        }
        /// <summary>
        /// Code-behind for a page that downloads a search-result page from
        /// srh.bankofchina.com and splits the first table found inside the
        /// "BOC_main publish" div into header cells and data cells using
        /// regular expressions.
        /// </summary>
        public partial class WebForm1 : System.Web.UI.Page
        {
            // Page whose table is scraped.
            private const string SourceUrl = "http://srh.bankofchina.com/search/whpj/search.jsp?erectDate=2001-11-01&nothing=2016-11-04&pjname=1316&page=4";

            // [\s\S] is used instead of '.' so the lazy body match can span line breaks.
            // [^>]* keeps the attribute part of each pattern inside the opening tag.

            // Body of the div whose class attribute is exactly "BOC_main publish".
            private static readonly Regex DivRegex =
                new Regex(@"<div\b[^>]*\bclass=""BOC_main publish""[^>]*>(?<body>[\s\S]*?)</div>");

            // Body of a table element, with or without attributes on the opening tag.
            private static readonly Regex TableRegex =
                new Regex(@"<table\b[^>]*>(?<body>[\s\S]*?)</table>");

            // Body of a table row.
            private static readonly Regex RowRegex =
                new Regex(@"<tr\b[^>]*>(?<body>[\s\S]*?)</tr>");

            public string MKT;

            /// <summary>
            /// Downloads the source page and parses its first table into a header
            /// row and a list of data rows.
            /// </summary>
            /// <returns>
            /// The current value of <see cref="MKT"/>. NOTE(review): nothing in this
            /// class assigns <see cref="MKT"/>, and the parsed cells are not stored
            /// anywhere yet, so the method currently has no observable result.
            /// </returns>
            private string getHtml()
            {
                try
                {
                    string content = DownloadPage(SourceUrl);

                    // Narrow the document step by step: div -> first table -> rows.
                    string divContent = FirstBodyOrEmpty(DivRegex, content);
                    string tableContent = FirstBodyOrEmpty(TableRegex, divContent);
                    List<string> trList = ExtractRows(tableContent);

                    // Without at least one row there is no header to read; indexing
                    // trList[0] would throw.
                    if (trList.Count > 0)
                    {
                        // trList[0] is the header row; every following row is data.
                        List<string> thList = GET_TH_OR_TD_LIST(SearchRange.th, trList[0]);
                        List<List<string>> tdsList = new List<List<string>>(trList.Count - 1);
                        for (int i = 1; i < trList.Count; i++)
                        {
                            tdsList.Add(GET_TH_OR_TD_LIST(SearchRange.td, trList[i]));
                        }
                    }
                }
                catch (WebException ex)
                {
                    // Best effort: a failed download must not break the page, but it
                    // should leave a trace instead of disappearing silently.
                    System.Diagnostics.Trace.TraceError("getHtml: request failed: {0}", ex);
                }
                catch (IOException ex)
                {
                    System.Diagnostics.Trace.TraceError("getHtml: reading the response failed: {0}", ex);
                }
                return MKT;
            }

            /// <summary>Downloads <paramref name="url"/> and returns the response decoded as UTF-8.</summary>
            private static string DownloadPage(string url)
            {
                using (WebClient client = new WebClient())
                using (Stream stream = client.OpenRead(url))
                using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }

            /// <summary>
            /// Returns the "body" group of the first match of <paramref name="regex"/>
            /// in <paramref name="input"/>, or an empty string when nothing matches.
            /// </summary>
            private static string FirstBodyOrEmpty(Regex regex, string input)
            {
                Match match = regex.Match(input ?? string.Empty);
                return match.Success ? match.Groups["body"].Value : string.Empty;
            }

            /// <summary>Returns the inner HTML of every row found in <paramref name="tableContent"/>.</summary>
            private static List<string> ExtractRows(string tableContent)
            {
                List<string> rows = new List<string>();
                foreach (Match match in RowRegex.Matches(tableContent ?? string.Empty))
                {
                    rows.Add(match.Groups["body"].Value);
                }
                return rows;
            }

            /// <summary>
            /// Extracts the inner content of every header or data cell in one table row.
            /// </summary>
            /// <param name="range">Which cell tag to search for; its name is used as the tag name.</param>
            /// <param name="row">Inner HTML of a single row.</param>
            /// <returns>The cell contents in document order; empty when the row is null, empty or has no matching cells.</returns>
            private List<string> GET_TH_OR_TD_LIST(SearchRange range, string row)
            {
                List<string> contentList = new List<string>();
                if (string.IsNullOrEmpty(row))
                {
                    return contentList;
                }

                string tag = range.ToString();
                // \b after the tag name stops "th" from also matching "<thead>".
                string cellPattern = $@"<{tag}\b[^>]*>(?<tdCell>[\s\S]*?)</{tag}>";
                foreach (Match match in Regex.Matches(row, cellPattern))
                {
                    contentList.Add(match.Groups["tdCell"].Value);
                }
                return contentList;
            }

            /// <summary>Runs the scrape on every page load.</summary>
            protected void Page_Load(object sender, EventArgs e)
            {
                getHtml();
            }
        }
    }
  • 相关阅读:
    【SAS NOTE】OUTPUT
    【SAS NOTES】_NULL_
    【SAS NOTE】sas 9.2 安装
    【SAS NOTE】FREQ
    纯数学教程 Page 203 例XLI (1)
    纯数学教程 Page 203 例XLI (3)
    纯数学教程 Page 203 例XLI (2)
    Prove Cauchy's inequality by induction
    纯数学教程 Page 325 例LXVIII (15) 调和级数发散
    纯数学教程 Page 325 例LXVIII (15) 调和级数发散
  • 原文地址:https://www.cnblogs.com/kexb/p/6035938.html
Copyright © 2011-2022 走看看