提取 http://www.codefans.net/jscss/code/1866.shtml 等类似网页中运行区块的 html 代码, 并保存到本地.
应该是将 源码爱好者 » 网页特效代码 下面的子目录都抓下来了, 我机器上运行了15min抓了有10几个子目录 共4M多
用到了正则表达式,文件读取保存, 多线程
是用vs2005写的, winform
form1.cs
代码
/**************************************************************************************************************
* 本程序多线程从特定网页中提取一块内容
* 具体从http://www.codefans.net/jscss/code/1866.shtml提取中间演示textarea内的html文本
* 从(网页特效代码)->(详细分类)->html网页中textarea内容
*
* 程序内使用了1.多线程 2.正则表达式 3.web文件读取 4.本地文件保存及编码问题
*
*
*
* 线程挂起没实现 好像用ThreadPool可以暂停纯种线程
*
*
*
*
*
***************************************************************************************************************/
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Threading;
namespace dig
{
/// <summary>
/// Main window of the page scraper. One worker thread walks the numbered pages under
/// http://www.codefans.net/jscss/code/, extracts the HTML held in each page's
/// &lt;textarea&gt; and appends it to a file under c:\网页特效代码\, while a UI timer
/// displays the clock, the elapsed time and the current/last-saved locations.
/// </summary>
public partial class form1 : Form
{
    // Pages are addressed as PageUrlPrefix + id + ".shtml" for id in [0, PageIdLimit).
    private const string PageUrlPrefix = @"http://www.codefans.net/jscss/code/";
    private const int PageIdLimit = 9999;

    // Text after which the category link is searched, and how many characters are inspected.
    private const string FolderMarker = "网页特效代码";
    private const int FolderWindowLength = 100;

    // Shown in the "saved to" box when a page could not be processed.
    private const string FailureText = "读取远程url失败, 未能保存";

    // Markup patterns, built once instead of once per downloaded page.
    private static readonly Regex TitleRegex =
        new Regex(@"(?<=(<title>)).*(?=_源码爱好者</title>)");
    private static readonly Regex FolderRegex =
        new Regex(@"(?<=(<a.*>)).*(?=</a>)", RegexOptions.IgnoreCase);
    // The body quantifier is lazy so the match stops at the first closing tag; the greedy
    // form ran on to the last </textarea> of the page when there was more than one.
    private static readonly Regex TextAreaRegex =
        new Regex(@"(?<=(<textarea.*?>))([\w\W]*?)(?=</textarea>)");

    private Thread getFileThread = null;
    private DateTime startTime = DateTime.Now;

    // Written by the worker thread and read by the UI timer; volatile so the UI thread
    // observes the worker's most recent assignment.
    private volatile string strCurUrl = "";   // URL currently being processed
    private volatile string strSaveUrl = "";  // where the last page was saved, or FailureText

    /// <summary>
    /// Creates the form and its designer-defined controls.
    /// </summary>
    public form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Starts the worker thread. Only the first click has an effect.
    /// </summary>
    private void btnStart_Click(object sender, EventArgs e)
    {
        if (getFileThread != null)
        {
            return;
        }

        // Measure elapsed time from the moment work begins rather than from form creation.
        startTime = DateTime.Now;

        getFileThread = new Thread(new ThreadStart(GetFileAndSave));
        // A background thread cannot keep the process alive after the form is gone.
        getFileThread.IsBackground = true;
        getFileThread.Start();
    }

    /// <summary>
    /// Worker-thread entry point: processes every page id in sequence.
    /// </summary>
    private void GetFileAndSave()
    {
        for (int i = 0; i < PageIdLimit; ++i)
        {
            CreateHtmlPage(PageUrlPrefix + i + ".shtml");
        }
    }

    /// <summary>
    /// Downloads one page, extracts the content of its &lt;textarea&gt;, decodes the HTML
    /// entities in it and saves the result as [page title].html in a folder named after
    /// the category link that follows the "网页特效代码" marker.
    /// </summary>
    /// <remarks>
    /// Never throws for an ordinary failure: the URL is appended to c:\error.txt and the
    /// failure text is published for the UI instead. A thread abort is allowed to propagate.
    /// </remarks>
    /// <param name="strUrl">Absolute URL of the page to process.</param>
    private void CreateHtmlPage(string strUrl)
    {
        try
        {
            strCurUrl = strUrl;

            string page = DownloadPage(strUrl);

            int markerIndex = page.IndexOf(FolderMarker, StringComparison.Ordinal);
            Match textArea = TextAreaRegex.Match(page);
            if (markerIndex < 0 || !textArea.Success)
            {
                // Not a snippet page; the unguarded Substring used to throw here.
                ReportFailure(strUrl);
                return;
            }

            // Clamp the window so a marker near the end of the page cannot overrun it.
            int windowLength = Math.Min(FolderWindowLength, page.Length - markerIndex);
            string strFloderName = FolderRegex.Match(page.Substring(markerIndex, windowLength)).ToString();
            string strFileName = TitleRegex.Match(page).ToString();

            // Both names come from remote markup; refuse anything that could leave the
            // target folder or that the file system would reject.
            if (!IsUsablePathSegment(strFileName) || !IsUsablePathSegment(strFloderName))
            {
                ReportFailure(strUrl);
                return;
            }

            string path = SaveFile(DecodeEntities(textArea.Value), strFileName, strFloderName);
            strSaveUrl = path + "\\" + strFileName + ".html";
        }
        catch (ThreadAbortException)
        {
            // Stop/close aborts this thread; that is not a download failure, so do not log it.
            throw;
        }
        catch (Exception)
        {
            ReportFailure(strUrl);
        }
    }

    /// <summary>
    /// Fetches a URL and returns the response body decoded with the system default code page.
    /// </summary>
    private static string DownloadPage(string strUrl)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);
        // NOTE(review): Encoding.Default depends on the machine's code page; files are
        // written as gb2312 - confirm whether the read should be pinned to gb2312 as well.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Turns the entity-escaped textarea content back into raw HTML.
    /// </summary>
    private static string DecodeEntities(string text)
    {
        // The ampersand entity is decoded last so that an escaped entity is not decoded twice.
        return text.Replace("&quot;", "\"")
                   .Replace("&lt;", "<")
                   .Replace("&gt;", ">")
                   .Replace("&amp;", "&");
    }

    /// <summary>
    /// Returns true when <paramref name="name"/> can be used as a single file or folder name:
    /// no invalid file-name characters (which include both path separators) and not made up
    /// solely of dots and spaces such as "..". An empty name is accepted.
    /// </summary>
    private static bool IsUsablePathSegment(string name)
    {
        if (name.IndexOfAny(Path.GetInvalidFileNameChars()) >= 0)
        {
            return false;
        }
        return name.Length == 0 || name.Trim(' ', '.').Length > 0;
    }

    /// <summary>
    /// Records a page that could not be processed: appends its URL as one line of
    /// c:\error.txt and publishes the failure text for the UI.
    /// </summary>
    private void ReportFailure(string strUrl)
    {
        try
        {
            using (StreamWriter log = new StreamWriter(@"c:\error.txt", true, Encoding.GetEncoding("gb2312")))
            {
                // One URL per line; plain Write ran all failed URLs together.
                log.WriteLine(strUrl);
            }
        }
        catch (IOException)
        {
            // The log is best effort: an exception escaping the worker thread would end the process.
        }
        catch (UnauthorizedAccessException)
        {
            // Same as above - e.g. no permission to write to the drive root.
        }
        strSaveUrl = FailureText;
    }

    /// <summary>
    /// Appends <paramref name="str"/> to c:\网页特效代码\[folder]\[file].html using gb2312,
    /// creating the folder when needed.
    /// </summary>
    /// <param name="str">Text to write.</param>
    /// <param name="strFileName">File name without extension.</param>
    /// <param name="strFloderName">Sub-folder name below c:\网页特效代码\.</param>
    /// <returns>The folder the file was written to.</returns>
    private static string SaveFile(string str, string strFileName, string strFloderName)
    {
        string path = @"c:\" + @"网页特效代码\" + strFloderName;

        // CreateDirectory does nothing when the directory already exists.
        Directory.CreateDirectory(path);

        // using closes the file even when Write throws.
        using (StreamWriter sw = new StreamWriter(path + "\\" + strFileName + ".html", true, Encoding.GetEncoding("gb2312")))
        {
            sw.Write(str);
        }
        return path;
    }

    /// <summary>
    /// Refreshes the clock, the elapsed-seconds counter and the two location boxes.
    /// </summary>
    private void timer1_Tick(object sender, EventArgs e)
    {
        DateTime now = DateTime.Now;
        lblTime.Text = now.ToString();

        // TotalSeconds is the whole elapsed span; Seconds is only its 0-59 component
        // and wrapped back to zero every minute.
        TimeSpan span = now.Subtract(startTime);
        lblTimeElapsed.Text = ((long)span.TotalSeconds).ToString();

        txtUrl.Text = strCurUrl;
        txtSaveUrl.Text = strSaveUrl;
    }

    /// <summary>
    /// Stops the worker thread, if one was started.
    /// </summary>
    private void btnStop_Click(object sender, EventArgs e)
    {
        StopWorker();
    }

    /// <summary>
    /// Pause is not implemented; the button currently does nothing.
    /// </summary>
    private void btnPause_Click(object sender, EventArgs e)
    {
        // getFileThread.Suspend(); // thread suspension was left disabled by the author
    }

    /// <summary>
    /// Stops the worker thread when the form is closed.
    /// </summary>
    private void form1_FormClosed(object sender, FormClosedEventArgs e)
    {
        StopWorker();
    }

    /// <summary>
    /// Aborts the worker thread; does nothing when Start was never clicked.
    /// </summary>
    private void StopWorker()
    {
        // NOTE(review): Thread.Abort matches the .NET 2.0 target named in the post;
        // later runtimes do not support it - confirm the target before porting.
        if (getFileThread != null)
        {
            getFileThread.Abort();
        }
    }
}
}
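/**************************************************************************************************************
* 本程序多线程从特定网页中提取一块内容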
* 具体从http://www.codefans.net/jscss/code/1866.shtml提取中间演示textarea内的html文本
* 从(网页特效代码)->(详细分类)->html网页中textarea内容
*
* 程序内使用了1.多线程 2.正则表达式 3.web文件读取 4.本地文件保存及编码问题
*
*
*
* 线程挂起没实现 好像用ThreadPool可以暂停纯种线程
*
*
*
*
*
***************************************************************************************************************/
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Threading;
namespace dig
{
/// <summary>
/// Main window of the page scraper. One worker thread walks the numbered pages under
/// http://www.codefans.net/jscss/code/, extracts the HTML held in each page's
/// &lt;textarea&gt; and appends it to a file under c:\网页特效代码\, while a UI timer
/// displays the clock, the elapsed time and the current/last-saved locations.
/// </summary>
public partial class form1 : Form
{
    // Pages are addressed as PageUrlPrefix + id + ".shtml" for id in [0, PageIdLimit).
    private const string PageUrlPrefix = @"http://www.codefans.net/jscss/code/";
    private const int PageIdLimit = 9999;

    // Text after which the category link is searched, and how many characters are inspected.
    private const string FolderMarker = "网页特效代码";
    private const int FolderWindowLength = 100;

    // Shown in the "saved to" box when a page could not be processed.
    private const string FailureText = "读取远程url失败, 未能保存";

    // Markup patterns, built once instead of once per downloaded page.
    private static readonly Regex TitleRegex =
        new Regex(@"(?<=(<title>)).*(?=_源码爱好者</title>)");
    private static readonly Regex FolderRegex =
        new Regex(@"(?<=(<a.*>)).*(?=</a>)", RegexOptions.IgnoreCase);
    // The body quantifier is lazy so the match stops at the first closing tag; the greedy
    // form ran on to the last </textarea> of the page when there was more than one.
    private static readonly Regex TextAreaRegex =
        new Regex(@"(?<=(<textarea.*?>))([\w\W]*?)(?=</textarea>)");

    private Thread getFileThread = null;
    private DateTime startTime = DateTime.Now;

    // Written by the worker thread and read by the UI timer; volatile so the UI thread
    // observes the worker's most recent assignment.
    private volatile string strCurUrl = "";   // URL currently being processed
    private volatile string strSaveUrl = "";  // where the last page was saved, or FailureText

    /// <summary>
    /// Creates the form and its designer-defined controls.
    /// </summary>
    public form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Starts the worker thread. Only the first click has an effect.
    /// </summary>
    private void btnStart_Click(object sender, EventArgs e)
    {
        if (getFileThread != null)
        {
            return;
        }

        // Measure elapsed time from the moment work begins rather than from form creation.
        startTime = DateTime.Now;

        getFileThread = new Thread(new ThreadStart(GetFileAndSave));
        // A background thread cannot keep the process alive after the form is gone.
        getFileThread.IsBackground = true;
        getFileThread.Start();
    }

    /// <summary>
    /// Worker-thread entry point: processes every page id in sequence.
    /// </summary>
    private void GetFileAndSave()
    {
        for (int i = 0; i < PageIdLimit; ++i)
        {
            CreateHtmlPage(PageUrlPrefix + i + ".shtml");
        }
    }

    /// <summary>
    /// Downloads one page, extracts the content of its &lt;textarea&gt;, decodes the HTML
    /// entities in it and saves the result as [page title].html in a folder named after
    /// the category link that follows the "网页特效代码" marker.
    /// </summary>
    /// <remarks>
    /// Never throws for an ordinary failure: the URL is appended to c:\error.txt and the
    /// failure text is published for the UI instead. A thread abort is allowed to propagate.
    /// </remarks>
    /// <param name="strUrl">Absolute URL of the page to process.</param>
    private void CreateHtmlPage(string strUrl)
    {
        try
        {
            strCurUrl = strUrl;

            string page = DownloadPage(strUrl);

            int markerIndex = page.IndexOf(FolderMarker, StringComparison.Ordinal);
            Match textArea = TextAreaRegex.Match(page);
            if (markerIndex < 0 || !textArea.Success)
            {
                // Not a snippet page; the unguarded Substring used to throw here.
                ReportFailure(strUrl);
                return;
            }

            // Clamp the window so a marker near the end of the page cannot overrun it.
            int windowLength = Math.Min(FolderWindowLength, page.Length - markerIndex);
            string strFloderName = FolderRegex.Match(page.Substring(markerIndex, windowLength)).ToString();
            string strFileName = TitleRegex.Match(page).ToString();

            // Both names come from remote markup; refuse anything that could leave the
            // target folder or that the file system would reject.
            if (!IsUsablePathSegment(strFileName) || !IsUsablePathSegment(strFloderName))
            {
                ReportFailure(strUrl);
                return;
            }

            string path = SaveFile(DecodeEntities(textArea.Value), strFileName, strFloderName);
            strSaveUrl = path + "\\" + strFileName + ".html";
        }
        catch (ThreadAbortException)
        {
            // Stop/close aborts this thread; that is not a download failure, so do not log it.
            throw;
        }
        catch (Exception)
        {
            ReportFailure(strUrl);
        }
    }

    /// <summary>
    /// Fetches a URL and returns the response body decoded with the system default code page.
    /// </summary>
    private static string DownloadPage(string strUrl)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);
        // NOTE(review): Encoding.Default depends on the machine's code page; files are
        // written as gb2312 - confirm whether the read should be pinned to gb2312 as well.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Turns the entity-escaped textarea content back into raw HTML.
    /// </summary>
    private static string DecodeEntities(string text)
    {
        // The ampersand entity is decoded last so that an escaped entity is not decoded twice.
        return text.Replace("&quot;", "\"")
                   .Replace("&lt;", "<")
                   .Replace("&gt;", ">")
                   .Replace("&amp;", "&");
    }

    /// <summary>
    /// Returns true when <paramref name="name"/> can be used as a single file or folder name:
    /// no invalid file-name characters (which include both path separators) and not made up
    /// solely of dots and spaces such as "..". An empty name is accepted.
    /// </summary>
    private static bool IsUsablePathSegment(string name)
    {
        if (name.IndexOfAny(Path.GetInvalidFileNameChars()) >= 0)
        {
            return false;
        }
        return name.Length == 0 || name.Trim(' ', '.').Length > 0;
    }

    /// <summary>
    /// Records a page that could not be processed: appends its URL as one line of
    /// c:\error.txt and publishes the failure text for the UI.
    /// </summary>
    private void ReportFailure(string strUrl)
    {
        try
        {
            using (StreamWriter log = new StreamWriter(@"c:\error.txt", true, Encoding.GetEncoding("gb2312")))
            {
                // One URL per line; plain Write ran all failed URLs together.
                log.WriteLine(strUrl);
            }
        }
        catch (IOException)
        {
            // The log is best effort: an exception escaping the worker thread would end the process.
        }
        catch (UnauthorizedAccessException)
        {
            // Same as above - e.g. no permission to write to the drive root.
        }
        strSaveUrl = FailureText;
    }

    /// <summary>
    /// Appends <paramref name="str"/> to c:\网页特效代码\[folder]\[file].html using gb2312,
    /// creating the folder when needed.
    /// </summary>
    /// <param name="str">Text to write.</param>
    /// <param name="strFileName">File name without extension.</param>
    /// <param name="strFloderName">Sub-folder name below c:\网页特效代码\.</param>
    /// <returns>The folder the file was written to.</returns>
    private static string SaveFile(string str, string strFileName, string strFloderName)
    {
        string path = @"c:\" + @"网页特效代码\" + strFloderName;

        // CreateDirectory does nothing when the directory already exists.
        Directory.CreateDirectory(path);

        // using closes the file even when Write throws.
        using (StreamWriter sw = new StreamWriter(path + "\\" + strFileName + ".html", true, Encoding.GetEncoding("gb2312")))
        {
            sw.Write(str);
        }
        return path;
    }

    /// <summary>
    /// Refreshes the clock, the elapsed-seconds counter and the two location boxes.
    /// </summary>
    private void timer1_Tick(object sender, EventArgs e)
    {
        DateTime now = DateTime.Now;
        lblTime.Text = now.ToString();

        // TotalSeconds is the whole elapsed span; Seconds is only its 0-59 component
        // and wrapped back to zero every minute.
        TimeSpan span = now.Subtract(startTime);
        lblTimeElapsed.Text = ((long)span.TotalSeconds).ToString();

        txtUrl.Text = strCurUrl;
        txtSaveUrl.Text = strSaveUrl;
    }

    /// <summary>
    /// Stops the worker thread, if one was started.
    /// </summary>
    private void btnStop_Click(object sender, EventArgs e)
    {
        StopWorker();
    }

    /// <summary>
    /// Pause is not implemented; the button currently does nothing.
    /// </summary>
    private void btnPause_Click(object sender, EventArgs e)
    {
        // getFileThread.Suspend(); // thread suspension was left disabled by the author
    }

    /// <summary>
    /// Stops the worker thread when the form is closed.
    /// </summary>
    private void form1_FormClosed(object sender, FormClosedEventArgs e)
    {
        StopWorker();
    }

    /// <summary>
    /// Aborts the worker thread; does nothing when Start was never clicked.
    /// </summary>
    private void StopWorker()
    {
        // NOTE(review): Thread.Abort matches the .NET 2.0 target named in the post;
        // later runtimes do not support it - confirm the target before porting.
        if (getFileThread != null)
        {
            getFileThread.Abort();
        }
    }
}
}
form1.Designer.cs
代码
namespace dig
{
// Designer-generated half of form1: declares the controls, sets their properties and
// attaches the event handlers that are implemented in the other partial class file.
partial class form1
{
/// <summary>
/// Required designer variable.
/// </summary>
private System.ComponentModel.IContainer components = null;
/// <summary>
/// Clean up any resources being used.
/// </summary>
/// <param name="disposing">true if managed resources should be disposed; otherwise, false.</param>
protected override void Dispose(bool disposing)
{
// The component container is disposed only on an explicit Dispose and only if it was created.
if (disposing && (components != null))
{
components.Dispose();
}
base.Dispose(disposing);
}
#region Windows 窗体设计器生成的代码
/// <summary>
/// Required method for Designer support - do not modify
/// the contents of this method with the code editor.
/// </summary>
private void InitializeComponent()
{
// Create every control first; properties are assigned in the per-control sections below.
this.components = new System.ComponentModel.Container();
this.btnStart = new System.Windows.Forms.Button();
this.txtUrl = new System.Windows.Forms.TextBox();
this.label1 = new System.Windows.Forms.Label();
this.label2 = new System.Windows.Forms.Label();
this.txtSaveUrl = new System.Windows.Forms.TextBox();
this.timer1 = new System.Windows.Forms.Timer(this.components);
this.label3 = new System.Windows.Forms.Label();
this.lblTime = new System.Windows.Forms.Label();
this.label4 = new System.Windows.Forms.Label();
this.lblTimeElapsed = new System.Windows.Forms.Label();
this.btnStop = new System.Windows.Forms.Button();
this.btnPause = new System.Windows.Forms.Button();
this.SuspendLayout();
//
// btnStart
//
this.btnStart.Location = new System.Drawing.Point(97, 263);
this.btnStart.Name = "btnStart";
this.btnStart.Size = new System.Drawing.Size(75, 23);
this.btnStart.TabIndex = 0;
this.btnStart.Text = "&Start";
this.btnStart.UseVisualStyleBackColor = true;
this.btnStart.Click += new System.EventHandler(this.btnStart_Click);
//
// txtUrl
//
this.txtUrl.Location = new System.Drawing.Point(95, 20);
this.txtUrl.Name = "txtUrl";
this.txtUrl.ReadOnly = true;
this.txtUrl.Size = new System.Drawing.Size(326, 21);
this.txtUrl.TabIndex = 1;
this.txtUrl.Text = "http://www.codefans.net/jscss/code/1866.shtml";
//
// label1
//
this.label1.AutoSize = true;
this.label1.Location = new System.Drawing.Point(30, 23);
this.label1.Name = "label1";
this.label1.Size = new System.Drawing.Size(59, 12);
this.label1.TabIndex = 2;
this.label1.Text = "远程文件:";
//
// label2
//
this.label2.AutoSize = true;
this.label2.Location = new System.Drawing.Point(30, 82);
this.label2.Name = "label2";
this.label2.Size = new System.Drawing.Size(59, 12);
this.label2.TabIndex = 3;
this.label2.Text = "保存位置:";
//
// txtSaveUrl
//
// NOTE(review): reuses TabIndex 1 and the sample-URL placeholder text of txtUrl.
this.txtSaveUrl.Location = new System.Drawing.Point(97, 79);
this.txtSaveUrl.Name = "txtSaveUrl";
this.txtSaveUrl.ReadOnly = true;
this.txtSaveUrl.Size = new System.Drawing.Size(326, 21);
this.txtSaveUrl.TabIndex = 1;
this.txtSaveUrl.Text = "http://www.codefans.net/jscss/code/1866.shtml";
//
// timer1
//
// Enabled from construction with a 1000 ms interval; each tick calls timer1_Tick.
this.timer1.Enabled = true;
this.timer1.Interval = 1000;
this.timer1.Tick += new System.EventHandler(this.timer1_Tick);
//
// label3
//
this.label3.AutoSize = true;
this.label3.Location = new System.Drawing.Point(30, 141);
this.label3.Name = "label3";
this.label3.Size = new System.Drawing.Size(59, 12);
this.label3.TabIndex = 4;
this.label3.Text = "当前时间:";
//
// lblTime
//
this.lblTime.AutoSize = true;
this.lblTime.Location = new System.Drawing.Point(95, 141);
this.lblTime.Name = "lblTime";
this.lblTime.Size = new System.Drawing.Size(35, 12);
this.lblTime.TabIndex = 5;
this.lblTime.Text = "12:00";
//
// label4
//
this.label4.AutoSize = true;
this.label4.Location = new System.Drawing.Point(30, 200);
this.label4.Name = "label4";
this.label4.Size = new System.Drawing.Size(59, 12);
this.label4.TabIndex = 4;
this.label4.Text = "共用时间:";
//
// lblTimeElapsed
//
this.lblTimeElapsed.AutoSize = true;
this.lblTimeElapsed.Location = new System.Drawing.Point(95, 201);
this.lblTimeElapsed.Name = "lblTimeElapsed";
this.lblTimeElapsed.Size = new System.Drawing.Size(11, 12);
this.lblTimeElapsed.TabIndex = 5;
this.lblTimeElapsed.Text = "0";
//
// btnStop
//
// NOTE(review): btnStart, btnStop and btnPause all use TabIndex 0.
this.btnStop.Location = new System.Drawing.Point(290, 263);
this.btnStop.Name = "btnStop";
this.btnStop.Size = new System.Drawing.Size(75, 23);
this.btnStop.TabIndex = 0;
this.btnStop.Text = "S&top";
this.btnStop.UseVisualStyleBackColor = true;
this.btnStop.Click += new System.EventHandler(this.btnStop_Click);
//
// btnPause
//
this.btnPause.Location = new System.Drawing.Point(194, 263);
this.btnPause.Name = "btnPause";
this.btnPause.Size = new System.Drawing.Size(75, 23);
this.btnPause.TabIndex = 0;
this.btnPause.Text = "&Pause";
this.btnPause.UseVisualStyleBackColor = true;
this.btnPause.Click += new System.EventHandler(this.btnPause_Click);
//
// form1
//
this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 12F);
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
this.ClientSize = new System.Drawing.Size(442, 313);
this.Controls.Add(this.lblTimeElapsed);
this.Controls.Add(this.lblTime);
this.Controls.Add(this.label4);
this.Controls.Add(this.label3);
this.Controls.Add(this.label2);
this.Controls.Add(this.label1);
this.Controls.Add(this.txtSaveUrl);
this.Controls.Add(this.txtUrl);
this.Controls.Add(this.btnStop);
this.Controls.Add(this.btnPause);
this.Controls.Add(this.btnStart);
this.Name = "form1";
this.Text = "提取网页";
this.FormClosed += new System.Windows.Forms.FormClosedEventHandler(this.form1_FormClosed);
this.ResumeLayout(false);
this.PerformLayout();
}
#endregion
// Control fields assigned in InitializeComponent.
private System.Windows.Forms.Button btnStart;
private System.Windows.Forms.TextBox txtUrl;
private System.Windows.Forms.Label label1;
private System.Windows.Forms.Label label2;
private System.Windows.Forms.TextBox txtSaveUrl;
private System.Windows.Forms.Timer timer1;
private System.Windows.Forms.Label label3;
private System.Windows.Forms.Label lblTime;
private System.Windows.Forms.Label label4;
private System.Windows.Forms.Label lblTimeElapsed;
private System.Windows.Forms.Button btnStop;
private System.Windows.Forms.Button btnPause;
}
}
{
// Designer-generated half of form1: declares the controls, sets their properties and
// attaches the event handlers that are implemented in the other partial class file.
partial class form1
{
/// <summary>
/// Required designer variable.
/// </summary>
private System.ComponentModel.IContainer components = null;
/// <summary>
/// Clean up any resources being used.
/// </summary>
/// <param name="disposing">true if managed resources should be disposed; otherwise, false.</param>
protected override void Dispose(bool disposing)
{
// The component container is disposed only on an explicit Dispose and only if it was created.
if (disposing && (components != null))
{
components.Dispose();
}
base.Dispose(disposing);
}
#region Windows 窗体设计器生成的代码
/// <summary>
/// Required method for Designer support - do not modify
/// the contents of this method with the code editor.
/// </summary>
private void InitializeComponent()
{
// Create every control first; properties are assigned in the per-control sections below.
this.components = new System.ComponentModel.Container();
this.btnStart = new System.Windows.Forms.Button();
this.txtUrl = new System.Windows.Forms.TextBox();
this.label1 = new System.Windows.Forms.Label();
this.label2 = new System.Windows.Forms.Label();
this.txtSaveUrl = new System.Windows.Forms.TextBox();
this.timer1 = new System.Windows.Forms.Timer(this.components);
this.label3 = new System.Windows.Forms.Label();
this.lblTime = new System.Windows.Forms.Label();
this.label4 = new System.Windows.Forms.Label();
this.lblTimeElapsed = new System.Windows.Forms.Label();
this.btnStop = new System.Windows.Forms.Button();
this.btnPause = new System.Windows.Forms.Button();
this.SuspendLayout();
//
// btnStart
//
this.btnStart.Location = new System.Drawing.Point(97, 263);
this.btnStart.Name = "btnStart";
this.btnStart.Size = new System.Drawing.Size(75, 23);
this.btnStart.TabIndex = 0;
this.btnStart.Text = "&Start";
this.btnStart.UseVisualStyleBackColor = true;
this.btnStart.Click += new System.EventHandler(this.btnStart_Click);
//
// txtUrl
//
this.txtUrl.Location = new System.Drawing.Point(95, 20);
this.txtUrl.Name = "txtUrl";
this.txtUrl.ReadOnly = true;
this.txtUrl.Size = new System.Drawing.Size(326, 21);
this.txtUrl.TabIndex = 1;
this.txtUrl.Text = "http://www.codefans.net/jscss/code/1866.shtml";
//
// label1
//
this.label1.AutoSize = true;
this.label1.Location = new System.Drawing.Point(30, 23);
this.label1.Name = "label1";
this.label1.Size = new System.Drawing.Size(59, 12);
this.label1.TabIndex = 2;
this.label1.Text = "远程文件:";
//
// label2
//
this.label2.AutoSize = true;
this.label2.Location = new System.Drawing.Point(30, 82);
this.label2.Name = "label2";
this.label2.Size = new System.Drawing.Size(59, 12);
this.label2.TabIndex = 3;
this.label2.Text = "保存位置:";
//
// txtSaveUrl
//
// NOTE(review): reuses TabIndex 1 and the sample-URL placeholder text of txtUrl.
this.txtSaveUrl.Location = new System.Drawing.Point(97, 79);
this.txtSaveUrl.Name = "txtSaveUrl";
this.txtSaveUrl.ReadOnly = true;
this.txtSaveUrl.Size = new System.Drawing.Size(326, 21);
this.txtSaveUrl.TabIndex = 1;
this.txtSaveUrl.Text = "http://www.codefans.net/jscss/code/1866.shtml";
//
// timer1
//
// Enabled from construction with a 1000 ms interval; each tick calls timer1_Tick.
this.timer1.Enabled = true;
this.timer1.Interval = 1000;
this.timer1.Tick += new System.EventHandler(this.timer1_Tick);
//
// label3
//
this.label3.AutoSize = true;
this.label3.Location = new System.Drawing.Point(30, 141);
this.label3.Name = "label3";
this.label3.Size = new System.Drawing.Size(59, 12);
this.label3.TabIndex = 4;
this.label3.Text = "当前时间:";
//
// lblTime
//
this.lblTime.AutoSize = true;
this.lblTime.Location = new System.Drawing.Point(95, 141);
this.lblTime.Name = "lblTime";
this.lblTime.Size = new System.Drawing.Size(35, 12);
this.lblTime.TabIndex = 5;
this.lblTime.Text = "12:00";
//
// label4
//
this.label4.AutoSize = true;
this.label4.Location = new System.Drawing.Point(30, 200);
this.label4.Name = "label4";
this.label4.Size = new System.Drawing.Size(59, 12);
this.label4.TabIndex = 4;
this.label4.Text = "共用时间:";
//
// lblTimeElapsed
//
this.lblTimeElapsed.AutoSize = true;
this.lblTimeElapsed.Location = new System.Drawing.Point(95, 201);
this.lblTimeElapsed.Name = "lblTimeElapsed";
this.lblTimeElapsed.Size = new System.Drawing.Size(11, 12);
this.lblTimeElapsed.TabIndex = 5;
this.lblTimeElapsed.Text = "0";
//
// btnStop
//
// NOTE(review): btnStart, btnStop and btnPause all use TabIndex 0.
this.btnStop.Location = new System.Drawing.Point(290, 263);
this.btnStop.Name = "btnStop";
this.btnStop.Size = new System.Drawing.Size(75, 23);
this.btnStop.TabIndex = 0;
this.btnStop.Text = "S&top";
this.btnStop.UseVisualStyleBackColor = true;
this.btnStop.Click += new System.EventHandler(this.btnStop_Click);
//
// btnPause
//
this.btnPause.Location = new System.Drawing.Point(194, 263);
this.btnPause.Name = "btnPause";
this.btnPause.Size = new System.Drawing.Size(75, 23);
this.btnPause.TabIndex = 0;
this.btnPause.Text = "&Pause";
this.btnPause.UseVisualStyleBackColor = true;
this.btnPause.Click += new System.EventHandler(this.btnPause_Click);
//
// form1
//
this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 12F);
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
this.ClientSize = new System.Drawing.Size(442, 313);
this.Controls.Add(this.lblTimeElapsed);
this.Controls.Add(this.lblTime);
this.Controls.Add(this.label4);
this.Controls.Add(this.label3);
this.Controls.Add(this.label2);
this.Controls.Add(this.label1);
this.Controls.Add(this.txtSaveUrl);
this.Controls.Add(this.txtUrl);
this.Controls.Add(this.btnStop);
this.Controls.Add(this.btnPause);
this.Controls.Add(this.btnStart);
this.Name = "form1";
this.Text = "提取网页";
this.FormClosed += new System.Windows.Forms.FormClosedEventHandler(this.form1_FormClosed);
this.ResumeLayout(false);
this.PerformLayout();
}
#endregion
// Control fields assigned in InitializeComponent.
private System.Windows.Forms.Button btnStart;
private System.Windows.Forms.TextBox txtUrl;
private System.Windows.Forms.Label label1;
private System.Windows.Forms.Label label2;
private System.Windows.Forms.TextBox txtSaveUrl;
private System.Windows.Forms.Timer timer1;
private System.Windows.Forms.Label label3;
private System.Windows.Forms.Label lblTime;
private System.Windows.Forms.Label label4;
private System.Windows.Forms.Label lblTimeElapsed;
private System.Windows.Forms.Button btnStop;
private System.Windows.Forms.Button btnPause;
}
}