  • Perl crawler

    I didn't add a crawl-depth limit or anything like that; it would be easy to add, just a few lines of code. I'll let the code speak for itself. I write mostly C, so the Perl here doesn't read very Perl-ish. (A sketch of one way to add a depth limit appears after the code below.)

    #!d:\perl\bin\perl.exe -w
    use strict;
    use warnings;
     
    use Data::Dumper;
    use threads;
    use threads::shared;
    use Thread::Queue;
    use Thread::Semaphore;
     
    use Bloom::Filter;
    use URI::URL;
    use LWP::Simple;
    use LWP::UserAgent;
    use HTTP::Cookies;
    use Web::Scraper;
     
    my $max_threads = 30;
    my $base_url = $ARGV[0] || 'http://www.icylife.net/';
    my $host = URI->new($base_url)->host;
     
    my $queue = Thread::Queue->new( );
    my $semaphore = Thread::Semaphore->new( $max_threads );
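    # Assumed intent: the Bloom filter remembers URLs that have already been seen;
    # shared_clone makes the single instance visible to all worker threads.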
    my $filter= shared_clone(Bloom::Filter->new(capacity => 50000, error_rate => 0.001) );
     
    $queue->enqueue( $base_url );
    $filter->add( $base_url );
     
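    # Main loop: reap finished workers, then either wait for running workers
    # to enqueue more URLs, or spawn new workers while semaphore slots remain.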
    while( 1 )
    {
     # join all threads which can be joined
     foreach ( threads->list(threads::joinable) )
     {
      $_->join( );
     }
     
     # if there are no URLs waiting to be processed
     my $item = $queue->pending();
     if( $item == 0 )
     {
      # no active threads either, so the whole crawl is finished
      if( threads->list(threads::running) == 0 )
      {
       print "All done!n";
       last;
      }
      # running workers may still enqueue more URLs, so wait for them
      else
      {
       sleep 1;
       next;
      }
     }
     
     # there are URLs waiting: spawn worker threads while semaphore slots remain
     while( $semaphore->down )
     {
      threads->create( &ProcessUrl );
     }
    }
     
    # join all threads which can be joined
    foreach ( threads->list() )
    {
     $_->join( );
    }
     
    sub ProcessUrl
    {
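     # Worker thread body: drain the shared queue, scrape each page, and
     # enqueue new same-host http(s) links the Bloom filter has not seen yet.
     # The scraper below collects the href of every <a> element into 'links'.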
     my $scraper = scraper
     {
      process '//a', 'links[]' => '@href';
     };
     
     my $res;
     my $link;
     
     while( my $url = $queue->dequeue_nb() )
     {
        eval
        {
            $res = $scraper->scrape( URI->new($url) )->{'links'};
        };
        if( $@ )
        {
            warn "$@\n";
            next;
        }
        next if (! defined $res );

      #print "there are ".scalar(threads->list(threads::running))." threads, ", $queue->pending(), " urls need process.n";
     
      foreach( @{$res} )
      {
       $link = $_->as_string;
       $link = URI::URL->new($link, $url);
     
       # not http and not https?
       next if( $link->scheme ne 'http' && $link->scheme ne 'https' );
       # another domain?
       next if( $link->host ne $host );
     
       $link = $link->abs->as_string;
     
       if( $link =~ /(.*?)#(.*)/ )
       {
        $link = $1;
       }
     
       next if( $link =~ /\.(jpg|png|zip|rar|iso)$/i );
     
       if( ! $filter->check($link) )
       {
        print $filter->key_count(), " ", $link, "\n";
        $filter->add($link);
        $queue->enqueue($link);
       }
      }
     }
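     # no more URLs in the queue right now: release this worker's semaphore slot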
     $semaphore->up( );
    }
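
    As noted at the top, the script has no crawl-depth limit. Below is a minimal,
    untested sketch of one way to add one to the code above; $max_depth is a
    hypothetical new variable, and the depth is packed into the queued string so
    that only plain strings ever cross thread boundaries.

    # Hypothetical depth cap for the crawl.
    my $max_depth = 3;

    # Seed the queue with "url<TAB>depth" instead of a bare URL.
    $queue->enqueue( "$base_url\t0" );

    # Inside ProcessUrl, unpack the pair when dequeuing ...
    while( my $item = $queue->dequeue_nb() )
    {
     my( $url, $depth ) = split /\t/, $item;
     # ... scrape $url exactly as before ...

     # ... and when a link passes the Bloom filter, only follow it
     # if the depth limit has not been reached yet.
     if( $depth < $max_depth && ! $filter->check($link) )
     {
      $filter->add($link);
      $queue->enqueue( $link . "\t" . ($depth + 1) );
     }
    }

    Packing the depth into the string keeps the queue items simple and avoids
    having to share nested data structures between the threads.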

    Source: http://bbs.chinaunix.net/thread-1635304-1-1.html

  • Original post: https://www.cnblogs.com/djcsch2001/p/2404153.html