zoukankan      html  css  js  c++  java
  • 抓取网页(table)信息(导出到数据库中、excel中)

    1、存储数据到数据库中:

      1)获取网页中table中的列名,作为数据库表的列名(@"<th scope=""col"">";和"</th>";是table中的包括列名的标签)

    /// <summary>
            /// 获取列名
            /// </summary>
            /// <param name="sbHtmltext"></param>
            public void GetColomnNumAndName(string sbHtmltext)
            {
                int i = 0;
                for (i = 0; ; i++)
                {
                    string ColomnName = "";
                    string StartColoumnName = @"<th scope=""col"">";
                    string EndColoumnName = "</th>";
                    int StartColoumnNameIndex = sbHtmltext.IndexOf(StartColoumnName);
                    if (StartColoumnNameIndex > -1)
                    {
                        int EndColoumnNameIndex = sbHtmltext.IndexOf(EndColoumnName);
                        int longs = EndColoumnNameIndex - StartColoumnNameIndex - StartColoumnName.Length;
                        ColomnName = sbHtmltext.Substring(StartColoumnNameIndex + StartColoumnName.Length, longs);
                        dt.Columns.Add(ColomnName.ToString().Trim());
                        sbHtmltext = sbHtmltext.Remove(0, (EndColoumnNameIndex + EndColoumnName.Length));
                    }
                    else
                    {
                        break;
                    }
                }
            }
    

      2)获取table中的数据

    /// <summary>
            /// 获取数据
            /// </summary>
            /// <param name="sbHtmltext"></param>
            public void GetValue(string sbHtmltext)
            {
                //去掉页面头
                string StartColoumnName = @"<table cellspacing=""0"" rules=""all"" border=""1"" id=""ctl00_ContentPlaceHolder1_gvChanpin"" style=""border-collapse:collapse;"">";
                string EndColoumnName = @"<td colspan=""14""><table border=""0"">";
                string StartValue = "<td>";
                string EndValue = "</td>";
                int StartColoumnNameIndex = sbHtmltext.IndexOf(StartColoumnName);
                int EndColoumnNameIndex = sbHtmltext.IndexOf(EndColoumnName);
                int longs = EndColoumnNameIndex - StartColoumnNameIndex - StartColoumnName.Length;
                string sbHtmltextChild = sbHtmltext.Substring(StartColoumnNameIndex + StartColoumnName.Length, longs);//删除不包含数据的上下元素
                for (int i = 0; ; i++)
                {
                    string Value = "";
                    int StartChild = sbHtmltextChild.IndexOf(StartValue);
                    if (StartChild > -1)
                    {
                        DataRow dr = dt.NewRow();
                        for (int j = 0; ; j++)
                        {
                            int StartValueIndex = sbHtmltextChild.IndexOf(StartValue);
                            if (StartValueIndex > -1 && StartValueIndex != 18)
                            {
                                int EndValueIndex = sbHtmltextChild.IndexOf(EndValue);
                                int longsChild = EndValueIndex - StartValueIndex - StartValue.Length;
                                Value = sbHtmltextChild.Substring(StartValueIndex + StartValue.Length, longsChild).Replace("\r\n", "").Replace("\t", "").Replace("\"", """).Trim();
                                dr[j] = Value.ToString().Trim();
                                sbHtmltextChild = sbHtmltextChild.Remove(0, (EndValueIndex + EndValue.Length));
                            }
                            else
                            {
                                break;
                            }
                        }
                        dt.Rows.Add(dr);
                        int removelen = sbHtmltextChild.IndexOf(StartValue);
                        if (removelen > -1)
                        {
                            sbHtmltextChild = sbHtmltextChild.Remove(0, removelen);
                        }
                    }
                    else
                    {
                        break;
                    }
                }
            }
    

        3)button按钮是触发事件,调用方法:

     try
                {
                    if (txttablename.Text== "")
                    {
                        MessageBox.Show("请输入表名!");
                        return;
                    }
                    string sbHtmltext = WBdata.DocumentText;//获取所有页面元素
                    GetColomnNumAndName(sbHtmltext);//获取列名
                GetValue(sbHtmltext);//获取数据            
                    if (dt != null && dt.Rows.Count > 0)
                        {
                            string descTableName = txttablename.Text.Trim();
                            string str = bll.SourceExcel_Import_Web(descTableName, dt);
                            dt.Reset();//重置数据容器DataTable
                            MessageBox.Show(str);
    
                            // 定位控件webBrowser滚动条的位置
                            HtmlDocument document = this.WBdata.Document;
                            document.Window.ScrollTo(750, 1200);
                        }
                        else
                        {
                            MessageBox.Show("未采集到数据!");
                        }
                }
                    catch (Exception ex)
                    {
                        MessageBox.Show(ex.Message);
                    }
    

      4)DAL层功能的实现

    /// 导入数据
            /// <summary>
            /// </summary>
            /// <param name="descTableName"></param>
            /// <param name="dtSource"></param>
            /// <returns></returns>
            public string SourceExcel_Import_Web(string descTableName, DataTable dtSource)
            {
                string succ = "";
                try
                {
                    string column = "";
                    for (int i = 0; i < dtSource.Columns.Count; i++)
                    {
                        if (i == 0)
                        {
                            column += dtSource.Columns[i].ColumnName + " varchar(max)";
                        }
                        else
                        {
                            column += "," + dtSource.Columns[i].ColumnName + " varchar(max)";
                        }
                    }
                  
                    string strsql = "if not exists(select * from sys.objects where name ='" + descTableName + "')begin create table " + descTableName + "(" + column + ")end ";
                    HZ.Data.DbHelperFactory.Create(connectWeb).ExecuteNonQuery(strsql);
              //HZ.Data.DbHelperFactory.Create(connectWeb).ExecuteNonQuery(strsql)是封装好的DBHelper
    SqlBulkCopyOptions sqlBulkCopyOptions; sqlBulkCopyOptions = SqlBulkCopyOptions.FireTriggers; SqlBulkCopy DTS = new System.Data.SqlClient.SqlBulkCopy(connectWeb, sqlBulkCopyOptions); DTS.NotifyAfter = 1; DTS.DestinationTableName = descTableName; DTS.BulkCopyTimeout = 60000000; DTS.WriteToServer(dtSource); succ = "保存成功!" + dtSource.Rows.Count + "条数据"; } catch (Exception ex) { succ = ex.Message; } return succ; } }

      总结:应该先在数据库中手动的创建一个数据库,表名是手动输入的。。

    2、导出excel

      1)导出excel的方法。(common.file)

     private static Form _openerForm;
            /// <summary>
            /// 所属窗体
            /// </summary>
            public static Form OpenerForm
            {
                set { _openerForm = value; }
                get { return _openerForm; }
            }
    
            /// <summary>
            /// 导出到excel
            /// </summary>
            /// <param name="dataTable"></param>
            /// <returns></returns>
            public static bool ExportDataTableToExcel(System.Data.DataTable dataTable)
            {
                SaveFileDialog saveFileDialog = new SaveFileDialog();
                saveFileDialog.Filter = "Execl files (*.xls)|*.xls";
                saveFileDialog.FilterIndex = 0;
                saveFileDialog.RestoreDirectory = true;
                saveFileDialog.CreatePrompt = false;
                saveFileDialog.Title = "导出Excel文件到";
    
                if (saveFileDialog.ShowDialog() == DialogResult.Cancel)
                {
                    return false;
                }
    
                excel9.Application oXL;
                excel9._Workbook oWB;
                excel9._Worksheet oSheet;
                excel9.Range oRng;
    
                try
                {
                    oXL = new excel9.Application();
                    oXL.Visible = false;
    
                    oWB = (excel9._Workbook)(oXL.Workbooks.Add(Missing.Value));
                    oSheet = (excel9._Worksheet)oWB.ActiveSheet;
    
                    int _RowCount = dataTable.Rows.Count;
                    int _ColumnCount = dataTable.Columns.Count;
    
                    for (int i = 0; i < _ColumnCount; i++)
                    {
                        oSheet.Cells[1, i + 1] = dataTable.Columns[i].Caption;
                    }
    
                    for (int j = 0; j < _RowCount; j++)
                    {
                        for (int k = 0; k < _ColumnCount; k++)
                        {
                            oSheet.Cells[j + 2, k + 1] = dataTable.Rows[j].ItemArray[k].ToString();
                        }
                    }
    
                    string ExcelHeader = GetExcelHeader(_ColumnCount);
    
                    oWB.SaveAs
                    (
                        saveFileDialog.FileName,
                        excel9.XlFileFormat.xlExcel7,
                        Type.Missing,
                        Type.Missing,
                        Type.Missing,
                        Type.Missing,
                        excel9.XlSaveAsAccessMode.xlExclusive,
                        Type.Missing,
                        Type.Missing,
                        Type.Missing,
                        Type.Missing
                    );
    
                    oWB.Close(null, null, null);
                    oXL.Workbooks.Close();
                    oXL.Quit();
    
                    return true;
                }
                catch (Exception e)
                {
                    MessageBox.Show(e.ToString());
                    return false;
                }
                finally
                {
                }
            }
    

      

      private static string GetExcelHeader(int number)
            {
                string[] Header =
                    new string[] 
                    { 
                        " ", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", 
                        "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "W", 
                        "Z" 
                    };
                if (number <= 26)
                {
                    return Header[number];
                }
                else
                {
                    return GetExcelHeader(number / 26) + Header[number % 26];
                }
            }
    

        2)button按钮下的方法

          try
                {               
                    string sbHtmltext = WBdata.DocumentText;//获取所有页面元素
                  GetColomnNumAndName(sbHtmltext);//获取列名
                    GetValue(sbHtmltext);//获取数据            
                    //判断导出是否成功
                    if (common.File.ExportDataTableToExcel(dt))
                    {
                        MessageBox.Show("导出成功!");
                        dt.Reset();
    
                        // 定位控件webBrowser滚动条的位置
                        HtmlDocument document = this.WBdata.Document;
                        document.Window.ScrollTo(750, 1000);
                    }
                    else
                    {
                        MessageBox.Show("导出数据失败!");
                    }
                }
                catch(Exception ex)
                {
                    MessageBox.Show(ex.Message);
                }
            
    

      

  • 相关阅读:
    android之app widget(二)
    使用作业异步调用存储过程的示例
    简单两步走 中兴V880获取权限方法
    3D化网页工具:Tilt
    在各种处理中应用排序规则的示例
    GreenSock推出了新一代动画引擎平台GSAP v12
    Derived_Tables_And_Correlated_Subqueries
    六大浏览器帐号管理功能大PK
    Entity Framework 5 Sample Provider简介
    中兴U880刷recovery与刷机详细教程
  • 原文地址:https://www.cnblogs.com/SnailWalk/p/2965492.html
Copyright © 2011-2022 走看看