zoukankan      html  css  js  c++  java
  • Perl爬虫代码

    目前在做Perl页面爬虫的模块,发现一些代码,做个详细的分析,把好的引用一下给自己用用。

      1 #!/usr/bin/perl -w
      2 
      3 use strict; 
      4  
      5 use HTTP::Request; 
      6 use HTTP::Status; 
      7 use HTML::LinkExtor; 
      8 use URI::URL; 
      9 use LWP::UserAgent; 
     10 #use Digest::MD5  qw(md5_hex); 
     11  
     12  
     13 use Compress::Zlib; 
     14  
     15 #################################################################### 
     16 # Parameters Setting 
     17 our $StartUrl = "http://xxx"; 
     18 our $bRestrict = 1; 
     19 our @restrictSite = ('cxxx','context:'); 
     20 our $bContinueBefore = 1; 
     21  
     22  
     23 #################################################################### 
     24  
     25  
     26 print __FILE__,"\n"; 
     27  
     28 our %img_seen = (); 
     29 our %url_seen = (); 
     30 our @url_queue = (); 
     31 our %url_processed = (); 
     32  
     33 our %RobotDisallow = (); 
     34 our %RobotAllow = (); 
     35 our %site_seen = (); 
     36  
     37  
     38 if($bContinueBefore){ 
     39     &LoadBefore(); 
     40 }else{ 
     41     $url_seen{$StartUrl} = 1; 
     42     push @url_queue, $StartUrl; 
     43 } 
     44  
     45 our $pageNum = 0; 
     46 our $BucketNum = 0; 
     47  
     48 &OpenOutFile(); 
     49  
     50 open(URLHASH,">>urlhash.txt") or die; 
     51 open(URLPROCESSED,">>urlprocessed.txt") or die; 
     52 open(URLREDIRECT,">>urlredirect.txt") or die; 
     53 open(PAGELIST,">>pagelist.txt") or die; 
     54 open(IMGLIST,">>imglist.txt") or die; 
     55  
     56  
     57 $| = 1, select $_ for select URLHASH; 
     58 $| = 1, select $_ for select URLPROCESSED; 
     59 $| = 1, select $_ for select URLREDIRECT; 
     60 $| = 1, select $_ for select PAGELIST; 
     61 $| = 1, select $_ for select IMGLIST; 
     62  
     63 our $urlhash_log = *URLHASH; 
     64 our $urlprocessed_log = *URLPROCESSED; 
     65 our $urlredirect_log = *URLREDIRECT; 
     66 our $pagelist_log = *PAGELIST; 
     67 our $imglist_log = *IMGLIST; 
     68  
     69  
     70 our $UA =  new LWP::UserAgent(keep_alive =>  1, 
     71                               timeout    =>  60, 
     72                               ); 
     73 $UA->agent('Mozilla/5.0'); 
     74 $UA->proxy(['ftp', 'http', 'wais', 'gopher'],'http://jpproxy:80/'); 
     75  
     76 our $linkExtor = new HTML::LinkExtor(\&linkCallback); 
     77 our @tmpLinks = (); 
     78 our @tmpImgs = (); 
     79  
     80 my $url; 
     81 while ( $url = &next_url() ) 
     82 { 
     83     print $urlprocessed_log $url,"\n"; 
     84      
     85     #sleep(1000); 
     86      
     87     my $response = &get_url( $url ); 
     88      
     89     if(!defined $response){ 
     90         next; 
     91     } 
     92      
     93     my $base = $response->base; 
     94     $base = $base->as_string; 
     95     #$base =~ tr/A-Z/a-z/; 
     96      
     97     if ( $base ne $url ) 
     98     { 
     99         if(!&ValidUrl($base)){ 
    100             next; 
    101         } 
    102          
    103         print $urlredirect_log $url,"\t",$base,"\n"; 
    104          
    105         $url_seen{$base} ++; 
    106         print $urlhash_log $base,"\n"; 
    107          
    108         if(exists($url_processed{$base})){ 
    109                next; 
    110         } 
    111     } 
    112      
    113     my $contents = $response->content; 
    114  
    115     #my $digest = md5_hex($base);     
    116      
    117     &SavePage(\$base,\$contents); 
    118     print $pagelist_log $base,"\n"; 
    119     $url_processed{$base} ++; 
    120          
    121      
    122     @tmpLinks = (); 
    123     @tmpImgs = (); 
    124     $linkExtor->parse($contents); 
    125      
    126     foreach (@tmpLinks){ 
    127         $_ = URI::URL->new($_,$base)->abs->as_string; 
    128         #$_ =~ tr/A-Z/a-z/; 
    129     } 
    130      
    131     foreach (@tmpImgs){ 
    132         $_ = URI::URL->new($_,$base)->abs->as_string; 
    133         #$_ =~ tr/A-Z/a-z/; 
    134     } 
    135      
    136     #@tmpLinks = map {$_ = URI::URL->new($_,$base)->abs->as_string;} @tmpLinks; 
    137     #@tmpImgs = map {$_ = URI::URL->new($_,$base)->abs->as_string;} @tmpImgs; 
    138      
    139     &RecordLinks(); 
    140     &RecordImgs(); 
    141      
    142 } 
    143  
    144  
    145  
    146 sub next_url 
    147 { 
    148  
    149     # We return 'undef' to signify no URLs on the list 
    150     if (@url_queue == 0 ) 
    151     { 
    152         return undef; 
    153     } 
    154      
    155     return shift @url_queue; 
    156 } 
    157  
    158 sub get_url 
    159 { 
    160     my $url   = shift; 
    161  
    162     my $request = new HTTP::Request( 'HEAD', $url ); 
    163     return undef unless $request; 
    164  
    165     my $response = $UA->request( $request ); 
    166     return undef unless defined $response; 
    167     return undef unless $response->is_success; 
    168  
    169     my $content_type = $response->content_type(); 
    170     return undef unless defined $content_type; 
    171  
    172     return undef if 'text/html' ne $content_type; 
    173      
    174     $request = new HTTP::Request( 'GET', $url ); 
    175     return undef unless $request; 
    176  
    177     $response = $UA->request( $request ); 
    178     return undef unless defined $response; 
    179     return undef unless $response->is_success; 
    180       
    181     return $response; 
    182 } 
    183  
    184 sub linkCallback 
    185 { 
    186     my($tag, %attr) = @_; 
    187     if($tag eq 'a' || $tag eq 'frame' || $tag eq 'area'){ 
    188         push(@tmpLinks,values %attr); 
    189         return; 
    190     } 
    191     if($tag eq 'img'){ 
    192         push(@tmpImgs,values %attr); 
    193         return; 
    194     } 
    195     return; 
    196 } 
    197  
    198 sub RecordLinks 
    199 { 
    200     foreach (@tmpLinks){ 
    201         if(/\/.+\.(\w{1,4})$/){ 
    202             if($1 =~ /(html|htm|asp|php|jsp)/i){ 
    203  
    204             }elsif($1 =~ /(jpg|jpeg|bmp|png|gif)/i){ 
    205                 if(/^http/i){ 
    206                      
    207                     if(exists($img_seen{$_})){ 
    208                         next; 
    209                     } 
    210                      
    211                     $img_seen{$_} = 1; 
    212                     print $imglist_log $_,"\n"; 
    213                      
    214                 } 
    215                 next; 
    216  
    217             }else{ 
    218                 next; 
    219             } 
    220         } 
    221              
    222         #if(/\.(gif|jpg|jpeg|png|xbm|au|wav|mpg|pdf|ps|mp3|mp2|rm|zip|rar|gz|zip)$/i){ 
    223         #    next;             
    224         #} 
    225  
    226         if(/^http/i){ 
    227              
    228             if(!&ValidUrl($_)){ 
    229                 next; 
    230             } 
    231              
    232             s/#.*//;             
    233              
    234             if(exists($url_seen{$_})){ 
    235                 next; 
    236             } 
    237              
    238             $url_seen{$_} = 1; 
    239             push @url_queue,$_; 
    240             print $urlhash_log $_,"\n"; 
    241         } 
    242     } 
    243 } 
    244  
    245 sub RecordImgs 
    246 { 
    247     foreach (@tmpImgs){ 
    248         if(/^http/i){ 
    249             if(!&ValidImage($_)){ 
    250                 next; 
    251             } 
    252              
    253             if(exists($img_seen{$_})){ 
    254                 next; 
    255             } 
    256              
    257             $img_seen{$_} = 1; 
    258             print $imglist_log $_,"\n"; 
    259      
    260         } 
    261     } 
    262 } 
    263  
    264  
    265 sub LoadBefore 
    266 { 
    267     open(FILE, "urlprocessed.txt") or die; 
    268     while(<FILE>){ 
    269         chomp; 
    270         $url_processed{$_}++; 
    271     } 
    272      
    273     open(FILE, "pagelist.txt") or die; 
    274     while(<FILE>){ 
    275         if(/(\S+)\s/){ 
    276             $url_processed{$1}++; 
    277         } 
    278     } 
    279  
    280     open(FILE, "urlhash.txt") or die; 
    281     while(<FILE>){ 
    282         chomp; 
    283         $url_seen{$_}++; 
    284         if(!exists($url_processed{$_})){ 
    285             push @url_queue,$_; 
    286         } 
    287     } 
    288  
    289     open(FILE, "imglist.txt") or die; 
    290     while(<FILE>){ 
    291         chomp; 
    292         $img_seen{$_}++; 
    293     } 
    294      
    295 } 
    296  
    297  
    298 sub ValidUrl 
    299 { 
    300     my($url) = shift; 
    301     if($bRestrict){ 
    302         foreach (@restrictSite){ 
    303             if($url =~ /$_/){ 
    304                 return 1; 
    305             } 
    306         } 
    307         return 0; 
    308     }else{ 
    309         return 1; 
    310     } 
    311 } 
    312  
    313 sub ValidImage 
    314 { 
    315     my($url) = shift; 
    316     if($url =~ /#/){ 
    317         return 0; 
    318     } 
    319          
    320     if(/spacer\.gif/){ 
    321         return 0; 
    322     } 
    323  
    324     return 1; 
    325 } 
    326  
    327  
    328 sub get_robotstxt 
    329 { 
    330     my $url   = shift; 
    331     $url .= "/robots.txt"; 
    332      
    333     my $request = new HTTP::Request( 'HEAD', $url ); 
    334     return undef unless $request; 
    335      
    336     my $response = $UA->request( $request ); 
    337     return undef unless defined $response; 
    338     return undef unless $response->is_success; 
    339      
    340     my $content_type = $response->content_type(); 
    341     return undef unless defined $content_type; 
    342      
    343     return undef if 'text/plain' ne $content_type; 
    344      
    345     $request = new HTTP::Request( 'GET', $url ); 
    346     return undef unless $request; 
    347      
    348     $response = $UA->request( $request ); 
    349     return undef unless defined $response; 
    350     return undef unless $response->is_success; 
    351       
    352     return $response; 
    353 } 
    354  
    355 sub OpenOutFile 
    356 { 
    357     $BucketNum ++; 
    358     my $fname = sprintf("PageBucket.%05d",$BucketNum); 
    359     open(PAGEBUCKET,">>$fname") or die; 
    360     binmode(PAGEBUCKET); 
    361     $| = 1, select $_ for select PAGEBUCKET; 
    362 } 
    363  
    364 sub SavePage 
    365 { 
    366     my($urlR,$contR) = @_; 
    367     my $data = compress($$contR); 
    368     my $len = pack('I',length($$urlR)); 
    369     print PAGEBUCKET $len; 
    370     print PAGEBUCKET $$urlR; 
    371     $len = pack('I',length($data)); 
    372     print PAGEBUCKET $len; 
    373     print PAGEBUCKET $data; 
    374  
    375     $pageNum++;
    376     if($pageNum % 1000 == 0){ 
    377         print "$pageNum pages have been crawled!\n"; 
    378     } 
    379     if($pageNum % 100000 == 0){ 
    380         &OpenOutFile; 
    381     } 
    382 }
    点开查看代码
  • 相关阅读:
    绝对定位和相对定位的内幕
    水平居中和垂直居中
    玩转html5<canvas>画图
    基本排序算法
    很好用的canvas
    IE浏览器存在的setAttribute bug
    js 高程 函数节流 throttle() 分析与优化
    js apply()、call() 使用参考
    js 高程 22.1.4 函数绑定 bind() 封装分析
    事件处理程序中 this 的指向
  • 原文地址:https://www.cnblogs.com/xiaoCon/p/3080006.html
Copyright © 2011-2022 走看看