//抓取数据并做分析(保存到数据库)
public partial class Form1 : Form
{
DataTable dt = new DataTable();
/// <summary>
/// Creates the form: runs the designer-generated initialization and then
/// hides the status label and the progress bar.
/// </summary>
public Form1()
{
    InitializeComponent();

    // The status widgets start out hidden.
    progressBar1.Visible = false;
    label.Visible = false;
}
/// <summary>
/// Downloads the flight-listing page, cuts out the HTML that lies between the
/// outbound-flight-table marker and the outbound-paging marker, and passes
/// that fragment to <see cref="GetData"/>.
/// </summary>
/// <remarks>
/// Returns without parsing anything when either marker is missing or when the
/// fixed offsets applied around the markers would fall outside the text.
/// A <see cref="WebException"/> is shown in a message box; any other exception
/// propagates to the caller.
/// </remarks>
public void Bind()
{
    // string strStartCity = HttpUtility.UrlEncode(tbStartCity.Text, System.Text.Encoding.GetEncoding("gb2312"));
    // string strEndCtiy = HttpUtility.UrlEncode(tbEndCity.Text, System.Text.Encoding.GetEncoding("gb2312"));
    #region MyRegion
    string firstPage = "C:\\Documents and Settings\\Administrator\\桌面\\c.html";
    //string firstPage = "http://www.linkosky.com/UI/AirTicket/SingleFlightShowAllV.aspx? CT=00&JT=01&OC=SHA&DD=2010-05-12&DT=00&DC=PEK&AL=ALL&DR=true&ET=True&SPID=00015032&ORGID=15144";
    try
    {
        Byte[] pageData;

        // WebClient is disposable; release it as soon as the download is done.
        using (WebClient astoWebClient = new WebClient())
        {
            // Credentials used to authenticate the request.
            astoWebClient.Credentials = CredentialCache.DefaultCredentials;
            pageData = astoWebClient.DownloadData(firstPage);
        }

        // Decode with the system default code page (use Encoding.UTF8 instead
        // for a UTF-8 page), then trim surrounding whitespace.
        string pageHtml = Encoding.Default.GetString(pageData).Trim();

        // Start marker of the outbound flight table.
        int m = pageHtml.IndexOf(" <!-- 去程航班表 -->", StringComparison.Ordinal);
        if (m == -1)
        {
            return; // No flight data found.
        }

        // NOTE(review): 18 and 86 are fixed, page-specific offsets kept from the
        // original code - confirm them against the current page layout.
        int fragmentStart = m + 18;
        if (fragmentStart > pageHtml.Length)
        {
            return; // Page is truncated right after the start marker.
        }

        // Drop everything up to (and slightly past) the start marker.
        string pageText = pageHtml.Remove(0, fragmentStart);

        // End marker: the outbound paging control.
        int n = pageText.IndexOf("<!--去程航班分页控制-->", StringComparison.Ordinal);
        if (n < 86)
        {
            // Marker missing (-1) or too close to the start: Remove(n - 86)
            // would throw ArgumentOutOfRangeException.
            return;
        }

        // Drop everything from 86 characters before the end marker onwards.
        string keyText = pageText.Remove(n - 86);
        GetData(keyText);
    }
    catch (WebException webEx)
    {
        MessageBox.Show(webEx.ToString());
    }
    #endregion
}
//分析HTML 数据
/// <summary>
/// Parses the flight-table HTML fragment into a fresh <c>dt</c> table and
/// stores every completed record through <see cref="add"/>.
/// </summary>
/// <param name="ddd">HTML fragment containing the table rows to parse.</param>
/// <remarks>
/// The first matched &lt;tr&gt; (index 0) is skipped. The remaining rows are read
/// by their index modulo 3: a row with remainder 1 starts a new record and
/// fills columns 0-4, the following row (remainder 2) fills columns 5-9 and
/// completes the record, and rows with remainder 0 are ignored. Only the
/// first five &lt;td&gt; cells of each row are used.
/// </remarks>
private void GetData(string ddd)
{
    const int CellsPerHtmlRow = 5;   // cells taken from each <tr>
    const int ClosingTagLength = 5;  // length of "</td>"
    const string rowPattern = @"<tr[^>]*>[\s\S]*?<\/tr>";
    const string columnPattern = @"<td[^>]*>[\s\S]*?<\/td>";
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.ExplicitCapture;

    string[] columnNames =
    {
        "航空公司", "航班号", "机型", "起飞时间-城市", "到达时间-城市",
        "舱位类型", "剩余座位", "票面价", "返点", "净价"
    };

    dt = new DataTable();
    foreach (string columnName in columnNames)
    {
        dt.Columns.Add(new DataColumn(columnName, typeof(string)));
    }

    DataRow dr = dt.NewRow();
    MatchCollection rowCollection = Regex.Matches(ddd, rowPattern, options);

    for (int i = 1; i < rowCollection.Count; i++)
    {
        int positionInGroup = i % 3;
        if (positionInGroup == 0)
        {
            continue; // Every third row is not part of a record.
        }

        // Remainder 1 opens a record; remainder 2 completes it.
        bool startsRecord = positionInGroup == 1;
        if (startsRecord)
        {
            dr = dt.NewRow();
        }
        int columnOffset = startsRecord ? 0 : CellsPerHtmlRow;

        MatchCollection columnCollection = Regex.Matches(rowCollection[i].Value, columnPattern, options);
        int cellCount = Math.Min(columnCollection.Count, CellsPerHtmlRow);
        for (int j = 0; j < cellCount; j++)
        {
            string cell = columnCollection[j].Value;

            // The match always starts with "<td ...>" and ends with a closing
            // td tag in whatever letter case the page used, so the content is
            // located by position instead of a case-sensitive search that
            // would miss "</TD>".
            int contentStart = cell.IndexOf('>') + 1;
            int contentLength = cell.Length - ClosingTagLength - contentStart;
            dr[columnOffset + j] = cell.Substring(contentStart, contentLength);
        }

        if (!startsRecord)
        {
            dt.Rows.Add(dr);
            add(dr[0].ToString(), dr[1].ToString(), dr[2].ToString(), dr[3].ToString(), dr[4].ToString(),
                dr[5].ToString(), dr[6].ToString(), dr[7].ToString(), dr[8].ToString(), dr[9].ToString());
        }
    }
}
//添加到数据库
/// <summary>
/// Saves one flight record by running the <c>tAirline_Add</c> stored procedure
/// through <c>SqlHelper.RunProcedureReturnBool</c>.
/// </summary>
/// <param name="fAirlineName">Airline name.</param>
/// <param name="fAirlineNo">Flight number.</param>
/// <param name="fAirlineType">Aircraft type.</param>
/// <param name="fsTime_City">Departure time and city.</param>
/// <param name="feTime_City">Arrival time and city.</param>
/// <param name="fSeatType">Cabin class.</param>
/// <param name="fSeatNum">Remaining seats.</param>
/// <param name="fPrice">Ticket face price.</param>
/// <param name="fBackNum">Rebate.</param>
/// <param name="fNetPrice">Net price.</param>
/// <remarks>
/// Exceptions raised by the helper propagate to the caller unchanged. The
/// value returned by the helper is not inspected.
/// </remarks>
public void add(string fAirlineName, string fAirlineNo, string fAirlineType, string fsTime_City, string feTime_City, string fSeatType, string fSeatNum, string fPrice, string fBackNum, string fNetPrice)
{
    SqlParameter[] ps = new SqlParameter[]
    {
        new SqlParameter("@fAirlineName", fAirlineName),
        new SqlParameter("@fAirlineNo", fAirlineNo),
        new SqlParameter("@fAirlineType", fAirlineType),
        new SqlParameter("@fsTime_City", fsTime_City),
        new SqlParameter("@feTime_City", feTime_City),
        new SqlParameter("@fSeatType", fSeatType),
        new SqlParameter("@fSeatNum", fSeatNum),
        new SqlParameter("@fPrice", fPrice),
        new SqlParameter("@fBackNum", fBackNum),
        new SqlParameter("@fNetPrice", fNetPrice)
    };

    // Deliberately no try/catch: the previous "catch { throw e; }" only reset
    // the exception's stack trace. Letting the exception flow through keeps
    // the original trace for the caller.
    WindowsFormsApplication1.SqlHelper.RunProcedureReturnBool("tAirline_Add", ps);
}
// --------------------- The fields and methods below run the code above from the WinForms UI (the part above is the key logic) ---------------------
/// <summary>
/// Click handler of the search button: disables the button, shows the
/// "please wait" label and the progress bar, then creates a new
/// <see cref="BackgroundWorker"/>, hooks up its three events and starts it.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btnSearch_Click(object sender, EventArgs e)
{
    // Switch the UI into its "busy" state.
    btnSearch.Enabled = false;
    progressBar1.Visible = true;
    label.Text = "请稍后,系统正在解析数据...";
    label.Visible = true;

    // A fresh worker is created for every click.
    worker = new BackgroundWorker
    {
        WorkerReportsProgress = true,
        WorkerSupportsCancellation = true
    };
    worker.DoWork += worker_DoWork;
    worker.ProgressChanged += worker_ProgressChanged;
    worker.RunWorkerCompleted += worker_RunWorkerCompleted;

    worker.RunWorkerAsync();
}
/// <summary>
/// Runs when the background worker finishes. On a normal finish it calls
/// <see cref="Bind"/>, shows the parsed table in the grid when it has rows,
/// and hides the status label. On cancellation or worker error it only
/// updates the label text.
/// </summary>
/// <param name="sender">The worker that completed (not used).</param>
/// <param name="e">Completion state: cancelled, failed, or finished.</param>
/// <remarks>
/// The search button is re-enabled and the progress bar is reset and hidden
/// on every path, so a cancelled or failed run does not leave the form stuck.
/// </remarks>
private void worker_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
{
    try
    {
        // Check the outcome first so nothing is downloaded or stored after a
        // cancelled or failed run.
        if (e.Cancelled)
        {
            label.Text = "Cancelled";
            return;
        }
        if (e.Error != null)
        {
            label.Text = "Error";
            return;
        }

        Bind();
        if (dt != null && dt.Rows.Count > 0)
        {
            dataGridView1.DataSource = dt;
        }
        label.Visible = false;
    }
    catch (Exception exts)
    {
        MessageBox.Show(exts.ToString());
        // Do not leave the "please wait" text on screen after a failure.
        label.Visible = false;
    }
    finally
    {
        // Always return the controls to their idle state.
        btnSearch.Enabled = true;
        progressBar1.Value = 0;
        progressBar1.Visible = false;
    }
}
/// <summary>
/// DoWork handler: passes the raising worker and the event arguments on to
/// <see cref="MoveList"/>.
/// </summary>
/// <param name="sender">The <see cref="BackgroundWorker"/> raising the event.</param>
/// <param name="e">Event arguments handed through to <see cref="MoveList"/>.</param>
private void worker_DoWork(object sender, DoWorkEventArgs e)
{
    BackgroundWorker runningWorker = (BackgroundWorker)sender;
    MoveList(runningWorker, e);
}
private BackgroundWorker worker = null;
/// <summary>
/// Reports progress in ten steps (10 %, 20 %, ... 100 %), sleeping half a
/// second after each report, and stops early when cancellation is requested.
/// </summary>
/// <param name="backgroundWorker">Worker to poll for cancellation and to report progress on.</param>
/// <param name="e">DoWork arguments; <c>Cancel</c> is set to true when the loop stops because of a cancellation request.</param>
private void MoveList(BackgroundWorker backgroundWorker, DoWorkEventArgs e)
{
    const int StepCount = 10;
    const int StepDelayMilliseconds = 500;

    for (int i = 0; i < StepCount; i++)
    {
        // Use the worker that was passed in rather than the shared field, so
        // this method always talks to the worker that is actually running it.
        if (backgroundWorker.CancellationPending)
        {
            e.Cancel = true;
            break;
        }

        backgroundWorker.ReportProgress((i + 1) * (100 / StepCount), i);
        Thread.Sleep(StepDelayMilliseconds);
    }
}
/// <summary>
/// ProgressChanged handler: copies the reported percentage into the progress bar.
/// </summary>
/// <param name="sender">The worker reporting progress (not used).</param>
/// <param name="e">Carries the reported percentage.</param>
private void worker_ProgressChanged(object sender, ProgressChangedEventArgs e)
{
    int reportedPercent = e.ProgressPercentage;
    progressBar1.Value = reportedPercent;
}
// The final execution result is shown in the figure below.