zoukankan      html  css  js  c++  java
  • 抓取源码爱好者所有网页特效例子并保存到本地

    提取http://www.codefans.net/jscss/code/1866.shtml等类似网页中运行区块的html代码, 并保存到本地.

    应该是将 源码爱好者 » 网页特效代码 下面的子目录都抓下来了, 我机器上运行了15min抓了有10几个子目录  共4M多

    用到了正则表达式,文件读取保存, 多线程

     是用vs2005写的, winform

    form1.cs

    代码
    /**************************************************************************************************************
     * 本程序多线程从特定网页中提取一块内容
     * 具体从http://www.codefans.net/jscss/code/1866.shtml提取中间演示textarea内的html文本
     * 从(网页特效代码)->(详细分类)->html网页中textarea内容
     * 
     * 程序内使用了1.多线程 2.正则表达式 3.web文件读取 4.本地文件保存及编码问题
     * 
     * 
     * 
     * 线程挂起(暂停)尚未实现; 好像可以用ThreadPool来暂停这种线程(未验证)
     * 
     * 
     * 
     * 
     * 
     **************************************************************************************************************
    */
    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Text;
    using System.Windows.Forms;
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    using System.Threading;

    namespace dig
    {
        
    public partial class form1 : Form
        {
            
    // Worker thread that runs GetFileAndSave; stays null until btnStart_Click creates it.
    private Thread getFileThread = null;
            
    // Captured when the form object is constructed; timer1_Tick subtracts it from DateTime.Now.
    private DateTime startTime = DateTime.Now;
            
    private string strCurUrl = "";// URL currently being processed (set by CreateHtmlPage, displayed by timer1_Tick)
            private string strSaveUrl = "";// path of the last saved page, or a failure message (set by CreateHtmlPage)
            // Creates the form; all control setup is delegated to InitializeComponent.
            public form1()
            {
                InitializeComponent();
            }
            
            
    // http://www.codefans.net/jscss/code/1866.shtml
            private void btnStart_Click(object sender, EventArgs e)
            {
                
    if (getFileThread == null)
                {
                    getFileThread 
    = new Thread(new ThreadStart(GetFileAndSave));//新建一个线程
                    getFileThread.Start();//线程开始
                }
            }

            
    /// <summary>
    /// Worker-thread entry point: walks the numeric page ids 0 through 9998 and
    /// passes each page URL to CreateHtmlPage.
    /// </summary>
    private void GetFileAndSave()
    {
        for (int i = 0; i < 9999; ++i)
        {
            // Page ids are used as-is (no zero padding), e.g. ".../code/1866.shtml".
            CreateHtmlPage(@"http://www.codefans.net/jscss/code/" + i + ".shtml");
        }
    }

            
    /// <summary>
    /// Downloads one page, extracts the HTML held inside its textarea element and
    /// saves it through SaveFile.
    /// </summary>
    /// <remarks>
    /// The file name is taken from the page title and the folder name from the first
    /// link text found right after the "网页特效代码" marker; either may end up empty
    /// when the page does not match. Failures are handled best-effort: the URL is
    /// appended to c:\error.txt and the status field is set to a failure message, so
    /// a single bad page does not stop the crawl.
    /// </remarks>
    /// <param name="strUrl">Absolute URL of the page to fetch.</param>
    private void CreateHtmlPage(string strUrl)
    {
        try
        {
            // Published for timer1_Tick, which shows it in the UI.
            strCurUrl = strUrl;

            // Download the page; the using blocks release the connection even
            // when reading fails half-way.
            // NOTE(review): Encoding.Default is the machine's ANSI code page, while
            // the output is written as gb2312 - confirm the site's actual encoding.
            string respStr;
            HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(strUrl);
            using (HttpWebResponse myResp = (HttpWebResponse)myReq.GetResponse())
            using (StreamReader respStream = new StreamReader(myResp.GetResponseStream(), Encoding.Default))
            {
                respStr = respStream.ReadToEnd();
            }

            // File name: the page title without the site suffix.
            string strReg = @"(?<=(<title>)).*(?=_源码爱好者</title>)";
            string strFileName = new Regex(strReg).Match(respStr).ToString();

            // Folder name: text of the link inside the (at most) 100 characters that
            // start at the marker. The window is clamped to the end of the page, and a
            // page without the marker simply gets an empty folder name.
            string strFloderName = "";
            int iTemp = respStr.IndexOf("网页特效代码", StringComparison.Ordinal);
            if (iTemp >= 0)
            {
                string strWindow = respStr.Substring(iTemp, Math.Min(100, respStr.Length - iTemp));
                strReg = @"(?<=(<a.*>)).*(?=</a>)";
                strFloderName = new Regex(strReg, RegexOptions.IgnoreCase).Match(strWindow).ToString();
            }

            // Everything between the opening <textarea ...> tag and </textarea>.
            // (?<= ) and (?= ) are look-arounds, so the tags themselves are excluded.
            // NOTE(review): [\w\W]* is greedy, so on a page with several textareas the
            // match runs to the LAST </textarea> - confirm that is intended.
            strReg = @"(?<=(<textarea.*?>))([\w\W]*)(?=</textarea>)";
            Match match = new Regex(strReg).Match(respStr);

            // Undo the entity escaping applied to the textarea content.
            respStr = match.ToString();
            respStr = respStr.Replace("&quot;", "\"");
            respStr = respStr.Replace("&lt;", "<");
            respStr = respStr.Replace("&gt;", ">");

            // Write the file and publish where it went.
            string path = SaveFile(respStr, strFileName, strFloderName);
            strSaveUrl = path + "\\" + strFileName + ".html";
        }
        catch (ThreadAbortException)
        {
            // Stop / form close aborts the worker; that is not a download failure,
            // so nothing is logged for it.
            throw;
        }
        catch (Exception)
        {
            this.strSaveUrl = "读取远程url失败, 未能保存";

            // Record the URL that could not be processed, one per line.
            try
            {
                using (StreamWriter sw = new StreamWriter(@"c:\error.txt", true, System.Text.Encoding.GetEncoding("gb2312")))
                {
                    sw.WriteLine(strUrl);
                }
            }
            catch (IOException)
            {
                // The log itself is best-effort; keep crawling.
            }
            catch (UnauthorizedAccessException)
            {
                // Same: no write access to c:\ must not kill the worker.
            }
        }
    }

            
    /// <summary>
    /// Appends <paramref name="str"/> to the file
    /// c:\网页特效代码\{strFloderName}\{strFileName}.html, encoded as gb2312, creating
    /// the folder first when it does not exist.
    /// </summary>
    /// <param name="str">Text to write.</param>
    /// <param name="strFileName">File name without the ".html" extension.</param>
    /// <param name="strFloderName">Sub-folder name below c:\网页特效代码\.</param>
    /// <returns>The folder path the file was written into.</returns>
    private static string SaveFile(string str, string strFileName, string strFloderName)
    {
        string path = @"c:\" + @"网页特效代码\" + strFloderName;

        // CreateDirectory does nothing when the folder already exists.
        Directory.CreateDirectory(path);

        // true = append. gb2312 is passed explicitly; the using block closes the
        // file even if the write throws.
        using (StreamWriter sw = new StreamWriter(path + "\\" + strFileName + ".html", true, System.Text.Encoding.GetEncoding("gb2312")))
        {
            sw.Write(str);
        }

        return path;
    }

            
    /// <summary>
    /// Timer tick handler: refreshes the clock, the elapsed-time label and the two
    /// status text boxes from the fields the worker thread updates.
    /// </summary>
    /// <param name="sender">The timer that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void timer1_Tick(object sender, EventArgs e)
    {
        DateTime now = DateTime.Now;
        this.lblTime.Text = now.ToString();

        // TotalSeconds, not Seconds: Seconds is only the 0-59 component and would
        // wrap around every minute.
        TimeSpan span = now.Subtract(startTime);
        this.lblTimeElapsed.Text = ((long)span.TotalSeconds).ToString();

        this.txtUrl.Text = strCurUrl;
        this.txtSaveUrl.Text = strSaveUrl;
    }

            
    /// <summary>
    /// Click handler for the Stop button: aborts the download worker, if one was
    /// ever started.
    /// </summary>
    /// <param name="sender">The button that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void btnStop_Click(object sender, EventArgs e)
    {
        // getFileThread is still null when Stop is clicked before Start.
        if (getFileThread != null)
        {
            getFileThread.Abort();
        }
    }

            
    // Click handler for the Pause button. Pausing is not implemented: the body is
    // empty apart from the disabled Suspend call below.
    private void btnPause_Click(object sender, EventArgs e)
            {
                
    //getFileThread.Suspend();// would suspend the worker thread (disabled)
            }
            
    /// <summary>
    /// FormClosed handler: aborts the download worker when one has been created.
    /// </summary>
    /// <param name="sender">The form that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void form1_FormClosed(object sender, FormClosedEventArgs e)
    {
        // Nothing to clean up when Start was never clicked.
        if (getFileThread == null)
        {
            return;
        }

        getFileThread.Abort();
    }
        }
    }

    form1.designer.cs
    代码
    namespace dig
    {
        
    partial class form1
        {
            
    /// <summary>
            
    /// Required designer variable. Created in InitializeComponent and disposed in Dispose.
            
    /// </summary>
            private System.ComponentModel.IContainer components = null;

            
    /// <summary>
    /// Cleans up any resources being used.
    /// </summary>
    /// <param name="disposing">true if managed resources should be disposed; otherwise, false.</param>
    protected override void Dispose(bool disposing)
    {
        if (disposing)
        {
            // The container only exists once InitializeComponent has run.
            if (components != null)
            {
                components.Dispose();
            }
        }

        base.Dispose(disposing);
    }

            
    #region Windows 窗体设计器生成的代码

            
    /// <summary>
    /// Required method for Designer support - do not modify
    /// the contents of this method with the code editor.
    /// </summary>
    /// <remarks>
    /// Creates every control, sets its layout and text, and wires the Click, Tick and
    /// FormClosed handlers. Each Point(x, y) / Size(width, height) argument pair is
    /// written out with its comma separator.
    /// NOTE(review): the coordinate pairs were reconstructed from run-together digits
    /// (e.g. "97263" read as 97, 263), chosen to fit the 442 x 313 client area -
    /// confirm against the original designer file if it is available.
    /// </remarks>
    private void InitializeComponent()
    {
        this.components = new System.ComponentModel.Container();
        this.btnStart = new System.Windows.Forms.Button();
        this.txtUrl = new System.Windows.Forms.TextBox();
        this.label1 = new System.Windows.Forms.Label();
        this.label2 = new System.Windows.Forms.Label();
        this.txtSaveUrl = new System.Windows.Forms.TextBox();
        this.timer1 = new System.Windows.Forms.Timer(this.components);
        this.label3 = new System.Windows.Forms.Label();
        this.lblTime = new System.Windows.Forms.Label();
        this.label4 = new System.Windows.Forms.Label();
        this.lblTimeElapsed = new System.Windows.Forms.Label();
        this.btnStop = new System.Windows.Forms.Button();
        this.btnPause = new System.Windows.Forms.Button();
        this.SuspendLayout();
        // 
        // btnStart
        // 
        this.btnStart.Location = new System.Drawing.Point(97, 263);
        this.btnStart.Name = "btnStart";
        this.btnStart.Size = new System.Drawing.Size(75, 23);
        this.btnStart.TabIndex = 0;
        this.btnStart.Text = "&Start";
        this.btnStart.UseVisualStyleBackColor = true;
        this.btnStart.Click += new System.EventHandler(this.btnStart_Click);
        // 
        // txtUrl
        // 
        this.txtUrl.Location = new System.Drawing.Point(95, 20);
        this.txtUrl.Name = "txtUrl";
        this.txtUrl.ReadOnly = true;
        this.txtUrl.Size = new System.Drawing.Size(326, 21);
        this.txtUrl.TabIndex = 1;
        this.txtUrl.Text = "http://www.codefans.net/jscss/code/1866.shtml";
        // 
        // label1
        // 
        this.label1.AutoSize = true;
        this.label1.Location = new System.Drawing.Point(30, 23);
        this.label1.Name = "label1";
        this.label1.Size = new System.Drawing.Size(59, 12);
        this.label1.TabIndex = 2;
        this.label1.Text = "远程文件:";
        // 
        // label2
        // 
        this.label2.AutoSize = true;
        this.label2.Location = new System.Drawing.Point(30, 82);
        this.label2.Name = "label2";
        this.label2.Size = new System.Drawing.Size(59, 12);
        this.label2.TabIndex = 3;
        this.label2.Text = "保存位置:";
        // 
        // txtSaveUrl
        // 
        this.txtSaveUrl.Location = new System.Drawing.Point(97, 79);
        this.txtSaveUrl.Name = "txtSaveUrl";
        this.txtSaveUrl.ReadOnly = true;
        this.txtSaveUrl.Size = new System.Drawing.Size(326, 21);
        this.txtSaveUrl.TabIndex = 1;
        this.txtSaveUrl.Text = "http://www.codefans.net/jscss/code/1866.shtml";
        // 
        // timer1
        // 
        this.timer1.Enabled = true;
        this.timer1.Interval = 1000;
        this.timer1.Tick += new System.EventHandler(this.timer1_Tick);
        // 
        // label3
        // 
        this.label3.AutoSize = true;
        this.label3.Location = new System.Drawing.Point(30, 141);
        this.label3.Name = "label3";
        this.label3.Size = new System.Drawing.Size(59, 12);
        this.label3.TabIndex = 4;
        this.label3.Text = "当前时间:";
        // 
        // lblTime
        // 
        this.lblTime.AutoSize = true;
        this.lblTime.Location = new System.Drawing.Point(95, 141);
        this.lblTime.Name = "lblTime";
        this.lblTime.Size = new System.Drawing.Size(35, 12);
        this.lblTime.TabIndex = 5;
        this.lblTime.Text = "12:00";
        // 
        // label4
        // 
        this.label4.AutoSize = true;
        this.label4.Location = new System.Drawing.Point(30, 200);
        this.label4.Name = "label4";
        this.label4.Size = new System.Drawing.Size(59, 12);
        this.label4.TabIndex = 4;
        this.label4.Text = "共用时间:";
        // 
        // lblTimeElapsed
        // 
        this.lblTimeElapsed.AutoSize = true;
        this.lblTimeElapsed.Location = new System.Drawing.Point(95, 201);
        this.lblTimeElapsed.Name = "lblTimeElapsed";
        this.lblTimeElapsed.Size = new System.Drawing.Size(11, 12);
        this.lblTimeElapsed.TabIndex = 5;
        this.lblTimeElapsed.Text = "0";
        // 
        // btnStop
        // 
        this.btnStop.Location = new System.Drawing.Point(290, 263);
        this.btnStop.Name = "btnStop";
        this.btnStop.Size = new System.Drawing.Size(75, 23);
        this.btnStop.TabIndex = 0;
        this.btnStop.Text = "S&top";
        this.btnStop.UseVisualStyleBackColor = true;
        this.btnStop.Click += new System.EventHandler(this.btnStop_Click);
        // 
        // btnPause
        // 
        this.btnPause.Location = new System.Drawing.Point(194, 263);
        this.btnPause.Name = "btnPause";
        this.btnPause.Size = new System.Drawing.Size(75, 23);
        this.btnPause.TabIndex = 0;
        this.btnPause.Text = "&Pause";
        this.btnPause.UseVisualStyleBackColor = true;
        this.btnPause.Click += new System.EventHandler(this.btnPause_Click);
        // 
        // form1
        // 
        this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 12F);
        this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
        this.ClientSize = new System.Drawing.Size(442, 313);
        this.Controls.Add(this.lblTimeElapsed);
        this.Controls.Add(this.lblTime);
        this.Controls.Add(this.label4);
        this.Controls.Add(this.label3);
        this.Controls.Add(this.label2);
        this.Controls.Add(this.label1);
        this.Controls.Add(this.txtSaveUrl);
        this.Controls.Add(this.txtUrl);
        this.Controls.Add(this.btnStop);
        this.Controls.Add(this.btnPause);
        this.Controls.Add(this.btnStart);
        this.Name = "form1";
        this.Text = "提取网页";
        this.FormClosed += new System.Windows.Forms.FormClosedEventHandler(this.form1_FormClosed);
        this.ResumeLayout(false);
        this.PerformLayout();

    }

            
    #endregion

            
    // Control fields; each one is instantiated and configured in InitializeComponent.
    // Start / Stop / Pause buttons are wired to btnStart_Click, btnStop_Click and btnPause_Click.
    private System.Windows.Forms.Button btnStart;
            
    // Read-only box that timer1_Tick fills with the URL currently being processed.
    private System.Windows.Forms.TextBox txtUrl;
            
    private System.Windows.Forms.Label label1;
            
    private System.Windows.Forms.Label label2;
            
    // Read-only box that timer1_Tick fills with the last save path or failure message.
    private System.Windows.Forms.TextBox txtSaveUrl;
            
    // Fires timer1_Tick every 1000 ms (Interval is set in InitializeComponent).
    private System.Windows.Forms.Timer timer1;
            
    private System.Windows.Forms.Label label3;
            
    // Shows the current time; updated by timer1_Tick.
    private System.Windows.Forms.Label lblTime;
            
    private System.Windows.Forms.Label label4;
            
    // Shows the elapsed time; updated by timer1_Tick.
    private System.Windows.Forms.Label lblTimeElapsed;
            
    private System.Windows.Forms.Button btnStop;
            
    private System.Windows.Forms.Button btnPause;
        }
    }


     源码下载

  • 相关阅读:
    sql 相关子查询
    sql 执行计划
    SQL表连接查询(inner join、full join、left join、right join)
    sql执行顺序
    sql 语句 嵌套子查询 执行顺序分析
    只有程序员才看得懂的情书
    Give Me an E
    hdu 1114 (背包变形)
    模版 并查集
    背包 讲解
  • 原文地址:https://www.cnblogs.com/barrysgy/p/1766310.html
Copyright © 2011-2022 走看看