zoukankan      html  css  js  c++  java
  • Perl爬虫研究






    use strict;
    use warnings;
    use threads;
    use threads::shared;
    use Thread::Queue;
    use Thread::Semaphore;
    use Bloom::Filter;
    use URI::URL;
    use Web::Scraper;
    # Maximum number of worker threads allowed to run at the same time.
    my $max_threads = 15;
    # Crawl target: first command-line argument, or a default site.
    my $base_url = $ARGV[0] || 'http://www.icylife.net';
    # Host part of the start URL; workers compare each link's host against it.
    my $host = URI::URL->new($base_url)->host;
    # Work queue of URLs that still have to be fetched.
    my $queue = Thread::Queue->new( );
    # Counting semaphore that caps the number of live workers at $max_threads.
    # The main loop takes a slot before spawning; ProcessUrl gives it back.
    my $semaphore = Thread::Semaphore->new( $max_threads );
    # Binary semaphore (count 1) intended to serialize access to the filter.
    my $mutex = Thread::Semaphore->new( 1 );
    # Bloom filter used to test whether a URL has been seen already.
    my $filter = shared_clone( Bloom::Filter->new(capacity => 1000, error_rate => 0.0001) );
    # Seed the queue with the start URL ...
    $queue->enqueue( $base_url );
    # ... and record it as seen so it is not queued a second time.
    $filter->add( $base_url );

    # Scheduler loop: reap finished workers, stop when there is nothing left
    # to do, otherwise start another worker for the pending URLs.
    while ( 1 ) {
        # Join every thread that has already finished.
        for my $finished ( threads->list(threads::joinable) ) {
            $finished->join( );
        }

        # Number of URLs waiting in the queue.
        my $item = $queue->pending();
        if ( $item == 0 ) {
            my $active = threads->list(threads::running);

            # Nothing queued and nobody running: the crawl is complete.
            if ( $active == 0 ) {
                print "All done!\n";
                last;
            }

            # Running workers may still enqueue more URLs; wait for them
            # instead of spawning a worker that would find an empty queue.
            sleep 1;
            next;
        }

        # Take a worker slot (blocks while $max_threads workers are alive);
        # the slot is released by ProcessUrl when the worker finishes.
        $semaphore->down( );
        # Pass a code reference: a bare &ProcessUrl would run the sub here,
        # in the main thread, and hand its return value to create().
        threads->create( \&ProcessUrl );
    }

    # Join whatever threads are still around before exiting.
    for my $remaining ( threads->list() ) {
        $remaining->join( );
    }
    # Worker thread body.
    #
    # Takes URLs from the file-level $queue until it is empty, scrapes the
    # href of every <a> element on each page, and keeps the links that are
    # http/https, live on $host, and do not end in a known binary/static file
    # extension.  Links not yet present in $filter are printed, recorded in
    # the filter and enqueued for crawling.  Before returning, the worker
    # releases one slot on $semaphore.
    #
    # Takes no arguments and returns nothing meaningful.
    sub ProcessUrl {
        my $scraper = scraper {
            process '//a', 'links[]' => '@href';
        };

        while ( my $url = $queue->dequeue_nb() ) {
            my $res;

            # A failed fetch or parse must not kill the worker: trap it,
            # report it and move on to the next URL.
            my $ok = eval {
                $res = $scraper->scrape( URI->new($url) )->{'links'};
                1;
            };
            if ( !$ok ) {
                warn "$@\n";
                next;
            }
            next if !defined $res;

            for my $href ( @{$res} ) {
                # Resolve against the page URL so relative links have a
                # scheme and host to test.
                my $abs = URI::URL->new( "$href", $url )->abs;

                # Not http and not https?
                my $scheme = $abs->scheme;
                next if !defined $scheme;
                next if $scheme ne 'http' && $scheme ne 'https';

                # Another domain?
                my $link_host = $abs->host;
                next if !defined $link_host || $link_host ne $host;

                my $link = $abs->as_string;

                # Drop the fragment: it names a position inside the same page.
                $link =~ s/#.*//s;

                # Skip binary/static resources; the dot is matched literally.
                next if $link =~ /\.(?:jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|doc|js|css|docx|xls|xlsx)$/i;

                # Check-then-add must not interleave between workers, or the
                # same URL could be enqueued more than once.
                $mutex->down( );
                if ( !$filter->check($link) ) {
                    print $filter->key_count(), " ", $link, "\n";
                    $filter->add( $link );
                    $queue->enqueue( $link );
                }
                $mutex->up( );
            }
        }

        # Give the worker slot back.
        $semaphore->up( );
        return;
    }
  • 相关阅读:
    第四章 瞬时响应:网站的高性能架构(待续)
    第三章 大型网站核心架构要素(待续)
    Luogu P1140 相似基因 【dp】By cellur925
    矩阵快速幂/矩阵加速线性数列 By cellur925
    [POI2008]BLO-Blockade 【无向图tarjan/鸽点】By cellur925
    USACO Training3.3 A Game【区间Dp】 By cellur925
    Luogu P2858 [USACO06FEB]奶牛零食Treats for the Cows 【区间dp】By cellur925
    Luogu P2921 在农场万圣节 【tarjan in 有向图】 By cellur925
    浅谈扩展欧几里得[exgcd] By cellur925
    NOIp 2014 联合权值 By cellur925
  • 原文地址:https://www.cnblogs.com/xiaoCon/p/3346422.html
Copyright © 2011-2022 走看看