zoukankan      html  css  js  c++  java
  • 基于深度优先搜索的蜘蛛程序


         这几天发现一个很好的图片网站,美女特多! 就打算下点图片,但是自己一张张下载的话,翻来覆去的太麻烦,所以找了个蜘蛛程序来帮忙。
    随便在网上查了查,就下载了一个名叫WebSpider的蜘蛛程序。我仔细研究了一下,感觉作者也是写着玩来着,意思意思,网页下载下来
    基本就丢了,另外结构上感觉不太满意,所以自己动手改了改。
          我大致的想法是采用双线程:一个UI线程,一个工作线程。抓取方面采用深度优先搜索,基本思路:得到当前网页,提取并下载图片,然后用正则表达式匹配网址,再递归处理!在处理过程中,使用一个集合类来收集处理过的网址,防止死循环。代码大致如下:
      1   public bool Process( WebPageState state )
      2      {
      3         state.ProcessStarted       = true;
      4         state.ProcessSuccessfull   = false;
      5
      6          if(level==1)
      7             m_baseUri = state.Uri;
      8         try
      9         {
     10            Console.WriteLine( "Process Uri: {0}", state.Uri.AbsoluteUri );
     11
     12            WebRequest  req = WebRequest.Create( state.Uri );
     13            WebResponse res = null;
     14
     15            try
     16            {
     17               res = req.GetResponse( );
     18
     19               if ( res is HttpWebResponse )
     20               {
     21                  state.StatusCode        = ((HttpWebResponse)res).StatusCode.ToString( );
     22                  state.StatusDescription = ((HttpWebResponse)res).StatusDescription;
     23               }

     24               if ( res is FileWebResponse )
     25               {
     26                  state.StatusCode        = "OK";
     27                  state.StatusDescription = "OK";
     28               }

     29
     30               if ( state.StatusCode.Equals( "OK" ) )
     31               {
     32                  StreamReader   sr    = new StreamReader( res.GetResponseStream( ) );
     33            
     34                  state.Content        = sr.ReadToEnd( );
     35
     36
     37                  MatchCollection m = RegExUtil.GetMatchRegEx(RegularExpression.SrcExtractor, state.Content, true);
     38                  string Address;
     39                   int k=0;
     40                   for (k = 0; k < m.Count;k++)
     41                   {
     42
     43
     44                       Address = m[k].Groups[1].ToString();
     45                       Uri uri = new Uri(state.Uri, m[k].Groups["url"].ToString());
     46                       // statusBar.Text = "Address: " + Address;
     47                       if (!m_pages.Contains(uri.AbsoluteUri))
     48                       {
     49                           m_pages.Add(uri.AbsoluteUri);
     50                           DownloadImage(state.Uri, Address);
     51                           if (this.ContentHandler != null)
     52                           {
     53                               state.mes.MaxProgress = m.Count;
     54                             
     55                               state.mes.Progress = k+1;
     56                               state.mes.Result = state.Uri.AbsoluteUri;
     57                               state.mes.Status = TaskStatus.Running;
     58                               state.mes.Message = "当前共有图片下载数"+m.Count+"  现在正在下载第"+state.mes.Progress.ToString()+"图片" + Address;
     59                               ContentHandler.Invoke(state);
     60                           }

     61                       }

     62
     63                     
     64                   }

     65               
     66                      int counter = 0;
     67                      Match mm= RegExUtil.GetMatchRegEx(RegularExpression.UrlExtractor, state.Content);
     68
     69                      while (mm.Success)
     70                      {
     71                          Uri uri = new Uri(state.Uri, mm.Groups["url"].ToString());
     72                          if (ValidPage(uri) && !m_pages.Contains(uri.AbsoluteUri))
     73                          {
     74                              if (level > 10)
     75                                  return true;
     76                              counter++;
     77                              level++;
     78                              WebPageState statec = new WebPageState(uri);
     79                              m_pages.Add(uri.AbsoluteUri);
     80                              Process(statec);
     81                          }

     82
     83
     84                          mm = mm.NextMatch();
     85                      }

     86               
     87               }

     88
     89               state.ProcessSuccessfull = true;
     90            }

     91            catch( Exception ex )
     92            {
     93               HandleException( ex, state );
     94            }

     95            finally
     96            {
     97               if ( res != null )
     98               {
     99                  res.Close( );
    100               }

    101            }

    102         }

    103         catch (Exception ex)
    104         {
    105            Console.WriteLine( ex.ToString( ) );
    106         }

    107         Console.WriteLine( "Successfull: {0}", state.ProcessSuccessfull );
    108
    109         return state.ProcessSuccessfull;
    110      }

    111      #endregion
    112
    113
    /// <summary>
    /// Downloads a single image (or Flash movie) referenced by a page into the local
    /// <c>temp</c> directory as <c>img&lt;id&gt;.&lt;ext&gt;</c>.
    /// </summary>
    /// <param name="m_bb">URI of the page containing the reference; used to resolve relative addresses.</param>
    /// <param name="imgUri">Address of the image as it appears in the page (absolute or relative).</param>
    /// <remarks>
    /// Best effort: only <c>jpg</c>, <c>jpeg</c> and <c>swf</c> resources are downloaded, and
    /// any failure is logged to the console rather than propagated to the caller.
    /// </remarks>
    private void DownloadImage(Uri m_bb, string imgUri)
    {
        Uri imageUri = null;

        try
        {
            imageUri = new Uri(m_bb, imgUri);

            // presumably returns the text after the last "." - TODO confirm against StrUtil
            string ext = StrUtil.RightLastIndexOf(imageUri.AbsoluteUri, ".").ToLowerInvariant();

            // Compare against each supported extension exactly. A substring test against
            // "jpg|jpeg|swf" would also accept "", "j", "pg", "|" and similar fragments.
            if (ext != "jpg" && ext != "jpeg" && ext != "swf")
            {
                return;
            }

            // DownloadFile does not create missing directories; make sure the target exists.
            Directory.CreateDirectory("temp");

            // A file id is consumed only for resources that are actually downloaded.
            string outFile = "temp\\img" + (m_fileId++) + "." + ext;

            using (WebClient web = new WebClient())
            {
                web.DownloadFile(imageUri.AbsoluteUri, outFile);
            }
        }
        catch (Exception ex)
        {
            // Keep the crawl going, but do not hide the failure.
            Console.WriteLine(
                "could not download img: {0} ({1})",
                imageUri != null ? imageUri.AbsoluteUri : imgUri,
                ex.Message);
        }
    }
       现在基本可以下载图片了,不过感觉要优化的地方还比较多! 递归的层级暂时没有控制,性能也一般,代码的结构还是比较乱,后续再重构!
  • 相关阅读:
    HDU 3951 (博弈) Coin Game
    HDU 3863 (博弈) No Gambling
    HDU 3544 (不平等博弈) Alice's Game
    POJ 3225 (线段树 区间更新) Help with Intervals
    POJ 2528 (线段树 离散化) Mayor's posters
    POJ 3468 (线段树 区间增减) A Simple Problem with Integers
    HDU 1698 (线段树 区间更新) Just a Hook
    POJ (线段树) Who Gets the Most Candies?
    POJ 2828 (线段树 单点更新) Buy Tickets
    HDU 2795 (线段树 单点更新) Billboard
  • 原文地址:https://www.cnblogs.com/jacky0952/p/spider.html
Copyright © 2011-2022 走看看