zoukankan      html  css  js  c++  java
  • perl爬虫

    爬行深度之类的我没加,加上也容易,几句话的事情。直接代码描述吧。我C写得多一些,所以perl代码的风格不怎么perl。

    #d:\perl\bin\perl.exe -w
    use warnings;
     
    use Data::Dumper;
    use threads;
    use threads::shared;
    use Thread::Queue;
    use Thread::Semaphore;
     
    use Bloom::Filter;
    use URI::URL;
    use LWP::Simple;
    use LWP::UserAgent;
    use HTTP::Cookies;
    use Web::Scraper;
     
    my $max_threads = 30;
    my $base_url = $ARGV[0] || 'http://www.icylife.net/';
    my $host = URI->new($base_url)->host;
     
    my $queue = Thread::Queue->new( );
    my $semaphore = Thread::Semaphore->new( $max_threads );
    my $filter= shared_clone(Bloom::Filter->new(capacity => 50000, error_rate => 0.001) );
     
    $queue->enqueue( $base_url );
    $filter->add( $base_url );
     
    while( 1 )
    {
     # join all threads which can be joined
     foreach ( threads->list(threads::joinable) )
     {
      $_->join( );
     }
     
     # if there are no url need process.
     my $item = $queue->pending();
     if( $item == 0 )
     {
      # there are no active thread, we finish the job
      if( threads->list(threads::running) == 0 )
      {
       print "All done!n";
       last;
      }
      # we will get some more url if there are some active threads, just wait for them
      else
      {
       sleep 1;
       next;
      }
     }
     
     # if there are some url need process
     while( $semaphore->down )
     {
      threads->create( &ProcessUrl );
     }
    }
     
    # join all threads which can be joined
    foreach ( threads->list() )
    {
     $_->join( );
    }
     
    sub ProcessUrl
    {
     my $scraper = scraper
     {
      process '//a', 'links[]' => '@href';
     };
     
     my $res;
     my $link;
     
     while( my $url = $queue->dequeue_nb() )
     {
        eval
        {
            $res = $scraper->scrape( URI->new($url) )->{'links'};
        };
        if( $@ )
        {
            warn "$@\n";
            next;
        }
        next if (! defined $res );

      #print "there are ".scalar(threads->list(threads::running))." threads, ", $queue->pending(), " urls need process.n";
     
      foreach( @{$res} )
      {
       $link = $_->as_string;
       $link = URI::URL->new($link, $url);
     
       # not http and not https?
       next if( $link->scheme ne 'http' && $link->scheme ne 'https' );
       # another domain?
       next if( $link->host ne $host );
     
       $link = $link->abs->as_string;
     
       if( $link =~ /(.*?)#(.*)/ )
       {
        $link = $1;
       }
     
       next if( $link =~ /.(jpg|png|zip|rar|iso)$/i );
     
       if( ! $filter->check($link) )
       {
        print $filter->key_count(), " ", $link, "n";
        $filter->add($link);
        $queue->enqueue($link);
       }
      }
     }
     $semaphore->up( );
    }

    摘自:http://bbs.chinaunix.net/thread-1635304-1-1.html

  • 相关阅读:
    简单布局2
    面试问题之操作系统:Linux下进程的内存结构
    面试问题之C++语言:说一说C++中四种cast转换
    面试问题之C++语言:说一下static关键字的作用
    面试问题之操作系统:动态链接库和静态链接库的区别
    面试问题之数据结构与算法:简述深度优先遍历和广度优先遍历
    面试问题之C++语言:C与C++的区别
    面试问题之C++语言:Overload、Override及Overwirte的区别
    面试问题之计算机网络:TCP三次握手四次挥手
    面试问题之计算机网络:TCP滑动窗口
  • 原文地址:https://www.cnblogs.com/djcsch2001/p/2404153.html
Copyright © 2011-2022 走看看