zoukankan      html  css  js  c++  java
  • C# 抓取页面table数据并分析到数据库

    //抓取数据并做分析(保存到数据库)

    public partial class Form1 : Form
    {
    // Table of parsed flight rows; GetData replaces it with a fresh table on each run,
    // and worker_RunWorkerCompleted binds it to dataGridView1 when it has rows.
    DataTable dt = new DataTable();
    /// <summary>
    /// Builds the form and starts with the status label and progress bar hidden;
    /// btnSearch_Click makes both visible again when a search begins.
    /// </summary>
    public Form1()
    {
        InitializeComponent();

        // Nothing is running yet, so keep the progress indicators out of sight.
        progressBar1.Visible = false;
        label.Visible = false;
    }

    /// <summary>
    /// Loads the flight-results page, cuts out the HTML that lies between the
    /// outbound-flight table marker and the paging-control marker, and passes that
    /// fragment to <c>GetData</c>.
    /// </summary>
    /// <remarks>
    /// Returns without doing anything when either marker is missing or when the
    /// fixed slice offsets would fall outside the text. A <see cref="WebException"/>
    /// raised while reading the page is shown in a message box.
    /// </remarks>
    public void Bind()
    {
        // Characters removed starting at the start marker. The marker itself is 15
        // characters long; the extra 3 presumably skip whitespace that follows it
        // in the page - TODO confirm against the real markup.
        const int startMarkerSkip = 18;
        // Characters dropped immediately before the end marker; the value is taken
        // unchanged from the original code - TODO confirm what markup it covers.
        const int endMarkerBacktrack = 86;

        // Saved copy of the results page; the commented line below is the live query URL.
        string firstPage = "C:\\Documents and Settings\\Administrator\\桌面\\c.html";
        //string firstPage = "http://www.linkosky.com/UI/AirTicket/SingleFlightShowAllV.aspx? CT=00&JT=01&OC=SHA&DD=2010-05-12&DT=00&DC=PEK&AL=ALL&DR=true&ET=True&SPID=00015032&ORGID=15144";
        try
        {
            byte[] pageData;
            // WebClient is disposable; release it as soon as the bytes are read.
            using (WebClient astoWebClient = new WebClient())
            {
                // Credentials used to authenticate the request.
                astoWebClient.Credentials = CredentialCache.DefaultCredentials;
                pageData = astoWebClient.DownloadData(firstPage);
            }

            // Decoded with the system default encoding. The original author notes the
            // page is GB2312, so this presumably relies on a Chinese-locale machine -
            // TODO confirm (a UTF-8 page would need Encoding.UTF8 instead).
            string pageHtml = Encoding.Default.GetString(pageData).Trim();

            int m = pageHtml.IndexOf(" <!-- 去程航班表 -->"); // start of the outbound-flight table
            if (m == -1 || m + startMarkerSkip > pageHtml.Length)
            {
                return; // no flight table found (or page truncated right after the marker)
            }
            string pageText = pageHtml.Remove(0, m + startMarkerSkip); // drop everything up to the table

            int n = pageText.IndexOf("<!--去程航班分页控制-->"); // start of the paging controls
            if (n < endMarkerBacktrack)
            {
                // Covers both "marker not found" (-1) and "marker too close to the
                // start"; either way Remove would throw ArgumentOutOfRangeException,
                // which the catch below does not handle.
                return;
            }
            string keyText = pageText.Remove(n - endMarkerBacktrack); // drop everything after the table
            GetData(keyText);
        }
        catch (WebException webEx)
        {
            MessageBox.Show(webEx.ToString());
        }
    }

    /// <summary>
    /// Parses the flight-table HTML fragment into <c>dt</c> and stores every completed
    /// record through <c>add</c>.
    /// </summary>
    /// <param name="ddd">HTML fragment containing the &lt;tr&gt; rows of the flight table.</param>
    /// <remarks>
    /// The first matched &lt;tr&gt; (index 0) is skipped. After that the rows are read in
    /// groups of three by <c>i % 3</c>: remainder 1 starts a record and fills columns
    /// 0-4, remainder 2 fills columns 5-9 and commits the record, remainder 0 is
    /// ignored. Only the first five &lt;td&gt; cells of a row are used. A trailing row
    /// with remainder 1 and no partner row is dropped.
    /// </remarks>
    private void GetData(string ddd)
    {
        const int cellsPerHtmlRow = 5;

        dt = new DataTable();
        string[] columnNames =
        {
            "航空公司", "航班号", "机型", "起飞时间-城市", "到达时间-城市",
            "舱位类型", "剩余座位", "票面价", "返点", "净价"
        };
        foreach (string columnName in columnNames)
        {
            dt.Columns.Add(new System.Data.DataColumn(columnName, typeof(System.String)));
        }

        string rowPatterm = @"<tr[^>]*>[\s\S]*?<\/tr>";
        string columnPattern = @"<td[^>]*>[\s\S]*?<\/td>";
        RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.ExplicitCapture;

        System.Data.DataRow dr = null;
        MatchCollection rowCollection = Regex.Matches(ddd, rowPatterm, options); // every <tr>...</tr>
        for (int i = 1; i < rowCollection.Count; i++)
        {
            int phase = i % 3;
            if (phase == 0)
            {
                continue; // third row of each group carries no data that is read here
            }

            if (phase == 1)
            {
                dr = dt.NewRow(); // first HTML row of a record
            }
            // The second HTML row of a record continues in columns 5-9.
            int columnOffset = phase == 1 ? 0 : cellsPerHtmlRow;

            MatchCollection columnCollection = Regex.Matches(rowCollection[i].Value, columnPattern, options); // every <td>...</td>
            for (int j = 0; j < columnCollection.Count && j < cellsPerHtmlRow; j++)
            {
                dr[j + columnOffset] = ExtractCellText(columnCollection[j].Value);
            }

            if (phase == 2)
            {
                // Record complete: keep it in the table and persist it.
                dt.Rows.Add(dr);
                add(dr[0].ToString(), dr[1].ToString(), dr[2].ToString(), dr[3].ToString(), dr[4].ToString(), dr[5].ToString(), dr[6].ToString(), dr[7].ToString(), dr[8].ToString(), dr[9].ToString());
            }
        }
    }

    // Returns the text between the opening <td ...> tag and the closing tag of one
    // matched cell. The closing tag is located by position (it is always the last
    // five characters of a match) rather than by a case-sensitive search, so an
    // upper-case </TD> accepted by the case-insensitive regex no longer throws.
    private static string ExtractCellText(string cellHtml)
    {
        const int closingTagLength = 5; // length of "</td>"
        int contentStart = cellHtml.IndexOf('>') + 1;
        return cellHtml.Substring(contentStart, cellHtml.Length - closingTagLength - contentStart);
    }

    /// <summary>
    /// Saves one flight record by running the <c>tAirline_Add</c> stored procedure
    /// with one parameter per field.
    /// </summary>
    /// <param name="fAirlineName">Value for @fAirlineName.</param>
    /// <param name="fAirlineNo">Value for @fAirlineNo.</param>
    /// <param name="fAirlineType">Value for @fAirlineType.</param>
    /// <param name="fsTime_City">Value for @fsTime_City.</param>
    /// <param name="feTime_City">Value for @feTime_City.</param>
    /// <param name="fSeatType">Value for @fSeatType.</param>
    /// <param name="fSeatNum">Value for @fSeatNum.</param>
    /// <param name="fPrice">Value for @fPrice.</param>
    /// <param name="fBackNum">Value for @fBackNum.</param>
    /// <param name="fNetPrice">Value for @fNetPrice.</param>
    /// <remarks>
    /// Any exception raised by the helper propagates to the caller unchanged.
    /// </remarks>
    public void add(string fAirlineName,string fAirlineNo, string fAirlineType, string fsTime_City, string feTime_City, string fSeatType,string fSeatNum, string fPrice, string fBackNum, string fNetPrice)
    {
        SqlParameter[] ps = new SqlParameter[]
        {
            new SqlParameter("@fAirlineName",fAirlineName),
            new SqlParameter("@fAirlineNo",fAirlineNo),
            new SqlParameter("@fAirlineType",fAirlineType),
            new SqlParameter("@fsTime_City",fsTime_City),
            new SqlParameter("@feTime_City",feTime_City),
            new SqlParameter("@fSeatType",fSeatType),
            new SqlParameter("@fSeatNum",fSeatNum),
            new SqlParameter("@fPrice",fPrice),
            new SqlParameter("@fBackNum",fBackNum),
            new SqlParameter("@fNetPrice",fNetPrice)
        };

        // No try/catch here: the previous `catch (Exception e) { throw e; }` handled
        // nothing and only reset the stack trace. The helper's result (presumably a
        // success flag, going by its name) is ignored, as before.
        WindowsFormsApplication1.SqlHelper.RunProcedureReturnBool("tAirline_Add", ps);
    }


    --------------------- 以下数据和方法用于在 WinForm 下执行(以上部分是关键)----------------------------------

    /// <summary>
    /// Search button handler: shows the "please wait" status, disables the button,
    /// and starts a new BackgroundWorker wired to the worker_* handlers.
    /// </summary>
    private void btnSearch_Click(object sender, EventArgs e)
    {
        // Switch the UI into its busy state.
        label.Text = "请稍后,系统正在解析数据...";
        label.Visible = true;
        progressBar1.Visible = true;
        btnSearch.Enabled = false;

        // Configure the worker fully before publishing it in the field.
        BackgroundWorker progressWorker = new BackgroundWorker
        {
            WorkerReportsProgress = true,
            WorkerSupportsCancellation = true
        };
        progressWorker.DoWork += worker_DoWork;
        progressWorker.ProgressChanged += worker_ProgressChanged;
        progressWorker.RunWorkerCompleted += worker_RunWorkerCompleted;

        worker = progressWorker;
        worker.RunWorkerAsync();
    }

    /// <summary>
    /// Runs when the background worker finishes. On a normal finish it calls
    /// <c>Bind</c> and shows the parsed rows in the grid; on cancellation or failure
    /// it only reports the state in the status label.
    /// </summary>
    /// <remarks>
    /// The search button is re-enabled and the progress bar reset on every path, so
    /// a cancelled or failed run no longer leaves the form stuck in its busy state.
    /// </remarks>
    private void worker_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
    {
        try
        {
            // Check the outcome first: a cancelled or failed run must not load and
            // store data. The status label stays visible to show the outcome.
            if (e.Cancelled)
            {
                label.Text = "Cancelled";
                return;
            }
            if (e.Error != null)
            {
                label.Text = "Error";
                return;
            }

            Bind();
            if (dt != null && dt.Rows.Count > 0)
            {
                dataGridView1.DataSource = dt;
            }
            label.Visible = false;
        }
        catch (Exception exts)
        {
            // The "please wait" text is stale once the message box has reported the failure.
            label.Visible = false;
            MessageBox.Show(exts.ToString());
        }
        finally
        {
            // Leave the busy state whatever happened above.
            btnSearch.Enabled = true;
            progressBar1.Value = 0;
            progressBar1.Visible = false;
        }
    }

    /// <summary>
    /// DoWork handler: forwards the raising worker and the event arguments to MoveList.
    /// </summary>
    private void worker_DoWork(object sender, DoWorkEventArgs e)
    {
        BackgroundWorker source = (BackgroundWorker)sender;
        MoveList(source, e);
    }

    // Worker for the current search; btnSearch_Click assigns a new instance on each click.
    private BackgroundWorker worker = null;

    /// <summary>
    /// Drives the progress bar: reports progress in ten equal steps (10%, 20%, ...
    /// 100%), pausing 500 ms after each one, and stops early when cancellation has
    /// been requested.
    /// </summary>
    /// <param name="backgroundWorker">Worker used for cancellation checks and progress reports.</param>
    /// <param name="e">DoWork arguments; <c>Cancel</c> is set to true when the loop stops on a cancellation request.</param>
    private void MoveList(BackgroundWorker backgroundWorker, DoWorkEventArgs e)
    {
        const int stepCount = 10;
        const int percentPerStep = 100 / stepCount;
        const int stepDelayMilliseconds = 500;

        for (int i = 0; i < stepCount; i++)
        {
            // Use the worker that was passed in rather than the `worker` field, which
            // the UI thread can reassign while this loop is still running.
            if (backgroundWorker.CancellationPending)
            {
                e.Cancel = true;
                break;
            }

            backgroundWorker.ReportProgress((i + 1) * percentPerStep, i);
            Thread.Sleep(stepDelayMilliseconds);
        }
    }

    /// <summary>
    /// ProgressChanged handler: copies the reported percentage onto the progress bar.
    /// </summary>
    private void worker_ProgressChanged(object sender, ProgressChangedEventArgs e)
    {
        int percentComplete = e.ProgressPercentage;
        progressBar1.Value = percentComplete;
    }

       最后执行结果如下图

  • 相关阅读:
    centos 新增用户, 然后他在主目录添加网站403Forbbiden
    linux 把用户加入一个组&从这个组中移除
    [THINKPHP] 温故知新之getFieldBy
    php 获取指定月份的开始结束时间
    apache 占用内存总量与每个apache进程的平均内存占用量计算
    网站并发300就很慢
    centos定时备份数据库超简单示例
    php导出excel时间错误(同一个时间戳,用date得到不同的时间)
    设置iframe 载入页面的效果跟直接打开这个页面一样
    node基础09:第2个node web服务器
  • 原文地址:https://www.cnblogs.com/wangchunming/p/2427899.html
Copyright © 2011-2022 走看看