zoukankan      html  css  js  c++  java
  • Perl爬虫研究

    这几天忙着做项目和一些W3A的测试,没啥时间研究别的.

    今天趁着快放假,也给自己放放假吧.看了下云总写的Perl爬虫,发现有多处不懂.

    但是部分地方算是理解了,看来目标还是很遥远的.

    给代码加了些注释,不过太累了,准备睡觉了。只写了一部分,改天补全。

    凑合着看吧....

    #!/usr/bin/perl
    use strict;
    use warnings;
    use threads;
    use threads::shared;
    use Thread::Queue;
    use Thread::Semaphore;
     
    use Bloom::Filter;
    use URI::URL;
    use Web::Scraper;
    
    # Number of worker slots handed out by $semaphore below.
    my $max_threads = 15;

    # Crawl start URL: first command-line argument, or the built-in default.
    my $base_url = $ARGV[0] || 'http://www.icylife.net';

    # Host component of the start URL. ProcessUrl compares every discovered
    # link's host against this value and skips links on any other host.
    my $start_uri = URI::URL->new($base_url);
    my $host      = $start_uri->host;

    # Counting semaphore initialised to $max_threads. The main loop calls
    # down() before creating a worker and ProcessUrl calls up() before returning.
    my $semaphore = Thread::Semaphore->new($max_threads);

    # Semaphore initialised to 1, used as a mutex around the
    # check-then-add sequence on $filter in ProcessUrl.
    my $mutex = Thread::Semaphore->new(1);

    # Bloom filter, cloned into shared memory, used to test whether a URL has
    # already been queued.
    # NOTE(review): capacity is fixed at 1000 keys - confirm how Bloom::Filter
    # behaves once more URLs than that have been added.
    my $filter = shared_clone(
        Bloom::Filter->new( capacity => 1000, error_rate => 0.0001 ) );

    # Work queue of URLs waiting to be fetched.
    my $queue = Thread::Queue->new();

    # Seed both structures with the start URL: mark it as seen, then queue it.
    $filter->add($base_url);
    $queue->enqueue($base_url);
    
     
    # Dispatcher loop: join finished workers, stop once there is neither queued
    # work nor a running worker, otherwise start one more worker thread.
    while (1) {
        # Join every worker that has already returned.
        $_->join() for threads->list(threads::joinable);

        # Read the running-worker count BEFORE the queue length. URLs are only
        # enqueued from inside ProcessUrl, so if no worker was running when
        # $active was sampled, the queue length read afterwards cannot grow.
        # (Reading them the other way round lets a worker enqueue a URL and
        # exit between the two reads, ending the crawl with work still queued.)
        my $active  = threads->list(threads::running);    # scalar context: count
        my $pending = $queue->pending();

        if ( $pending == 0 ) {
            # Nothing queued and nobody running: the crawl is finished.
            if ( $active == 0 ) {
                print "All done!\n";
                last;
            }

            # Workers are still running and may enqueue more URLs; poll again.
            sleep 1;
            next;
        }

        # Take a worker slot; this blocks while every slot is in use.
        $semaphore->down;

        # Pass a code reference. A bare &ProcessUrl would run the whole crawl
        # synchronously in this thread and hand its return value to create().
        threads->create( \&ProcessUrl );
    }
     
    # After the dispatcher loop exits, join every thread that
    # threads->list() still reports.
    $_->join() for threads->list();
     
    # Worker thread body.
    #
    # Repeatedly takes a URL from the shared $queue without blocking, scrapes
    # the href attribute of every <a> element on that page, and enqueues each
    # http/https link on $host that $filter does not already contain. Returns
    # when dequeue_nb() yields nothing (or a false value).
    #
    # Takes no arguments; operates on the file-level $queue, $filter, $mutex,
    # $semaphore and $host. Calls $semaphore->up() once just before returning.
    sub ProcessUrl {
        my $scraper = scraper {
            process '//a', 'links[]' => '@href';
        };

        while ( my $url = $queue->dequeue_nb() ) {
            # Fetch and parse the page; a failure is reported and the URL skipped.
            my $res = eval { $scraper->scrape( URI->new($url) )->{'links'} };
            if ($@) {
                warn "$@\n";
                next;
            }
            next if !defined $res;

            foreach my $href ( @{$res} ) {
                # Resolve the href against the URL of the page it was found on.
                my $uri = URI::URL->new( $href->as_string, $url );

                # Only follow http and https links.
                my $scheme = $uri->scheme;
                next
                    unless defined $scheme
                    && ( $scheme eq 'http' || $scheme eq 'https' );

                # Stay on the host of the start URL.
                next if $uri->host ne $host;

                my $link = $uri->abs->as_string;

                # Drop the fragment so "page#a" and "page#b" count as one URL.
                $link =~ s/#.*//s;

                # Skip links whose URL ends in one of these file extensions.
                # The dot must be escaped: an unescaped "." matches any
                # character and would also reject URLs such as ".../nodejs".
                next
                    if $link =~
                    /\.(?:jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|doc|js|css|docx|xls|xlsx)$/i;

                # check() followed by add() must not interleave between threads,
                # otherwise two workers could both enqueue the same URL.
                # NOTE(review): if check()/add()/enqueue() dies here, neither
                # $mutex nor $semaphore is released - confirm whether
                # Bloom::Filter can die (e.g. when its capacity is exceeded).
                $mutex->down();
                if ( !$filter->check($link) ) {
                    print $filter->key_count(), " ", $link, "\n";
                    $filter->add($link);
                    $queue->enqueue($link);
                }
                $mutex->up();
            }
        }

        # Hand the worker slot back to the dispatcher loop.
        $semaphore->up();
        return;
    }
  • 相关阅读:
    第四章 瞬时响应:网站的高性能架构(待续)
    第三章 大型网站核心架构要素(待续)
    Luogu P1140 相似基因 【dp】By cellur925
    矩阵快速幂/矩阵加速线性数列 By cellur925
    [POI2008]BLO-Blockade 【无向图tarjan/鸽点】By cellur925
    USACO Training3.3 A Game【区间Dp】 By cellur925
    Luogu P2858 [USACO06FEB]奶牛零食Treats for the Cows 【区间dp】By cellur925
    Luogu P2921 在农场万圣节 【tarjan in 有向图】 By cellur925
    浅谈扩展欧几里得[exgcd] By cellur925
    NOIp 2014 联合权值 By cellur925
  • 原文地址:https://www.cnblogs.com/xiaoCon/p/3346422.html
Copyright © 2011-2022 走看看