zoukankan      html  css  js  c++  java
  • 基于深度优先搜索的蜘蛛程序


         这几天发现一个很好的图片网站,美女特多! 就打算下点图片,但是自己手动下载的话,翻来覆去的太麻烦,所以找了个蜘蛛来帮忙。
    随便在网上查了查,就下载了一个名叫WebSpider的蜘蛛程序。我仔细研究了一下,感觉作者也是写着玩的,意思意思,网页下载下来
    基本就丢了,另外结构上感觉不太满意,所以改了改。
          我大致的想法是采用双线程:一个UI线程,一个工作线程。抓取方面采用深度优先搜索,基本思路:得到当前网页,提取并下载图片,然后用正则表达式匹配网址,再递归处理!在处理过程中,使用一个集合类来收集处理过的网址,防止死循环。代码大致如下:
      1   public bool Process( WebPageState state )
      2      {
      3         state.ProcessStarted       = true;
      4         state.ProcessSuccessfull   = false;
      5
      6          if(level==1)
      7             m_baseUri = state.Uri;
      8         try
      9         {
     10            Console.WriteLine( "Process Uri: {0}", state.Uri.AbsoluteUri );
     11
     12            WebRequest  req = WebRequest.Create( state.Uri );
     13            WebResponse res = null;
     14
     15            try
     16            {
     17               res = req.GetResponse( );
     18
     19               if ( res is HttpWebResponse )
     20               {
     21                  state.StatusCode        = ((HttpWebResponse)res).StatusCode.ToString( );
     22                  state.StatusDescription = ((HttpWebResponse)res).StatusDescription;
     23               }

     24               if ( res is FileWebResponse )
     25               {
     26                  state.StatusCode        = "OK";
     27                  state.StatusDescription = "OK";
     28               }

     29
     30               if ( state.StatusCode.Equals( "OK" ) )
     31               {
     32                  StreamReader   sr    = new StreamReader( res.GetResponseStream( ) );
     33            
     34                  state.Content        = sr.ReadToEnd( );
     35
     36
     37                  MatchCollection m = RegExUtil.GetMatchRegEx(RegularExpression.SrcExtractor, state.Content, true);
     38                  string Address;
     39                   int k=0;
     40                   for (k = 0; k < m.Count;k++)
     41                   {
     42
     43
     44                       Address = m[k].Groups[1].ToString();
     45                       Uri uri = new Uri(state.Uri, m[k].Groups["url"].ToString());
     46                       // statusBar.Text = "Address: " + Address;
     47                       if (!m_pages.Contains(uri.AbsoluteUri))
     48                       {
     49                           m_pages.Add(uri.AbsoluteUri);
     50                           DownloadImage(state.Uri, Address);
     51                           if (this.ContentHandler != null)
     52                           {
     53                               state.mes.MaxProgress = m.Count;
     54                             
     55                               state.mes.Progress = k+1;
     56                               state.mes.Result = state.Uri.AbsoluteUri;
     57                               state.mes.Status = TaskStatus.Running;
     58                               state.mes.Message = "当前共有图片下载数"+m.Count+"  现在正在下载第"+state.mes.Progress.ToString()+"图片" + Address;
     59                               ContentHandler.Invoke(state);
     60                           }

     61                       }

     62
     63                     
     64                   }

     65               
     66                      int counter = 0;
     67                      Match mm= RegExUtil.GetMatchRegEx(RegularExpression.UrlExtractor, state.Content);
     68
     69                      while (mm.Success)
     70                      {
     71                          Uri uri = new Uri(state.Uri, mm.Groups["url"].ToString());
     72                          if (ValidPage(uri) && !m_pages.Contains(uri.AbsoluteUri))
     73                          {
     74                              if (level > 10)
     75                                  return true;
     76                              counter++;
     77                              level++;
     78                              WebPageState statec = new WebPageState(uri);
     79                              m_pages.Add(uri.AbsoluteUri);
     80                              Process(statec);
     81                          }

     82
     83
     84                          mm = mm.NextMatch();
     85                      }

     86               
     87               }

     88
     89               state.ProcessSuccessfull = true;
     90            }

     91            catch( Exception ex )
     92            {
     93               HandleException( ex, state );
     94            }

     95            finally
     96            {
     97               if ( res != null )
     98               {
     99                  res.Close( );
    100               }

    101            }

    102         }

    103         catch (Exception ex)
    104         {
    105            Console.WriteLine( ex.ToString( ) );
    106         }

    107         Console.WriteLine( "Successfull: {0}", state.ProcessSuccessfull );
    108
    109         return state.ProcessSuccessfull;
    110      }

    111      #endregion
    112
    113
    /// <summary>
    /// Downloads a single image (or Flash movie) referenced from a page into the
    /// relative <c>temp</c> directory as <c>img&lt;id&gt;.&lt;ext&gt;</c>.
    /// Only resources whose extension is exactly jpg, jpeg or swf are fetched.
    /// Failures are logged to the console and otherwise ignored, so one bad
    /// image never interrupts the crawl.
    /// </summary>
    /// <param name="m_bb">URI of the page the reference was found on; used as base URI.</param>
    /// <param name="imgUri">Absolute or relative address of the image.</param>
    private void DownloadImage(Uri m_bb,string imgUri)
    {
        try
        {
            Uri imageUri = new Uri(m_bb, imgUri);

            // NOTE(review): StrUtil.RightLastIndexOf presumably returns the text
            // after the last "." - confirm against StrUtil.
            string ext = StrUtil.RightLastIndexOf(imageUri.AbsoluteUri, ".").ToLowerInvariant();

            // Exact match. The previous substring test ("jpg|jpeg|swf".IndexOf(ext))
            // also accepted fragments such as "", "g", "pe" or "|".
            if (ext != "jpg" && ext != "jpeg" && ext != "swf")
            {
                return;
            }

            // Make sure the target directory exists; this is a no-op when it does.
            System.IO.Directory.CreateDirectory("temp");

            // Consume a file id only for resources that are actually downloaded.
            string outFile = "temp\\img" + (m_fileId++) + "." + ext;

            using (WebClient web = new WebClient())
            {
                web.DownloadFile(imageUri.AbsoluteUri, outFile);
            }
        }
        catch (Exception ex)
        {
            // Best effort: report the failure and carry on with the next image.
            Console.WriteLine("could not download img: " + imgUri + " (" + ex.Message + ")");
        }
    }
       现在基本可以下载图片了,不过感觉要优化的地方还比较多! 递归的层级暂时没有控制,性能也一般,代码的结构还是比较乱,后续再重构!
  • 相关阅读:
    centos6 下erlang安装
    待研究
    关键字拦截查询
    获取CNVD的cookie
    adb pull 文件夹到电脑
    Linux中查看端口占用情况
    Running Tensorflow on AMD GPU
    验证码识别相关文章
    conda和pip相关操作
    windows安装pycrypto报错
  • 原文地址:https://www.cnblogs.com/jacky0952/p/spider.html
Copyright © 2011-2022 走看看