zoukankan      html  css  js  c++  java
  • perl 爬取csdn

    <pre name="code" class="python">use  LWP::UserAgent;
    use POSIX;
    use HTML::TreeBuilder::XPath;
    use DBI;  
    use Encode; 
    use utf8;
    use HTML::TreeBuilder;
    use strict;
    use warnings;

    # Mirror the CSDN blog "zhaoyangjian724" to local disk.
    #
    # Flow:
    #   1. Fetch the blog home page; collect every link whose href contains
    #      "category" and the link texts of the ul.panel_body side panels.
    #   2. For each category create "$base_dir/<category name>", read the page
    #      count from the pager of the category's first page and walk every
    #      listing page "<category href>/<page number>".
    #   3. On each listing page pair the article titles with the article links
    #      and save every article as "<title>.html" in the category directory.
    #
    # Snapshots of the parsed pages are still written as csdn.html (current
    # directory) and fh1.html / fh2.html (category directory), but the pages
    # are parsed from the decoded string in memory, so parsing no longer
    # depends on a file handle having been flushed and closed first.
    #
    # All text is handled as decoded character strings; it is encoded to UTF-8
    # only at the boundary (file names, file contents, STDOUT/STDERR).

    my $base_dir   = '/root/lwp';               # root directory of the mirror
    my $site_root  = 'http://blog.csdn.net';    # prefix for site-relative hrefs
    my $throttle   = 5;                         # seconds to wait before each listing request
    my $article_re = qr{zhaoyangjian724/article/details/([0-9]+)$};

    binmode STDOUT, ':encoding(UTF-8)';
    binmode STDERR, ':encoding(UTF-8)';

    # Encode a character-string path into the UTF-8 bytes handed to the OS.
    sub os_path {
        my ($path) = @_;
        return encode('UTF-8', $path);
    }

    # Make $name usable as a single path component ("/" and NUL become "_").
    sub safe_name {
        my ($name) = @_;
        $name =~ s{[/\0]}{_}g;
        return $name;
    }

    # Turn a site-relative href into a full URL; absolute URLs pass through.
    sub absolute_url {
        my ($href) = @_;
        return $href =~ m{\Ahttps?://}i ? $href : "$site_root$href";
    }

    # Write $text to $path as UTF-8. Dies when the file cannot be written.
    sub save_text {
        my ($path, $text) = @_;
        open my $fh, '>:encoding(UTF-8)', os_path($path)
            or die "open $path failed: $!\n";
        print {$fh} $text or die "write $path failed: $!\n";
        close $fh         or die "close $path failed: $!\n";
        return;
    }

    # GET $url with $ua. Returns the decoded body, or nothing (after a
    # warning) when the request fails or the body cannot be decoded.
    sub fetch_html {
        my ($ua, $url) = @_;
        my $response = $ua->get($url);
        unless ($response->is_success) {
            warn "GET $url failed: " . $response->status_line . "\n";
            return;
        }
        my $html = $response->decoded_content;
        warn "GET $url: body could not be decoded\n" unless defined $html;
        return $html;
    }

    # Build an XPath-capable tree from an HTML string. The caller must call
    # ->delete on the returned tree when finished with it.
    sub parse_html {
        my ($html) = @_;
        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse_content($html);
        return $tree;
    }

    # Return, in document order, the href of every <a> in $tree whose href
    # matches $pattern. Anchors without an href are ignored; duplicates are kept.
    sub matching_hrefs {
        my ($tree, $pattern) = @_;
        my @hrefs;
        for my $anchor ($tree->find_by_tag_name('a')) {
            my $href = $anchor->attr('href');
            push @hrefs, $href if defined $href && $href =~ $pattern;
        }
        return @hrefs;
    }

    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->env_proxy;
    $ua->agent("Mozilla/8.0");

    # ---- home page: category links and category names ----------------------
    my $response = $ua->get('http://blog.csdn.net/zhaoyangjian724');
    unless ($response->is_success) {
        # Report the failure, show whatever body came back, and exit non-zero.
        warn 'GET home page failed: ' . $response->status_line . "\n";
        my $body = $response->decoded_content;
        print $body if defined $body;
        exit 1;
    }

    my $home_html = $response->decoded_content;
    die "home page body could not be decoded\n" unless defined $home_html;
    save_text('csdn.html', $home_html);

    my $home_tree      = parse_html($home_html);
    my @category_hrefs = matching_hrefs($home_tree, qr/category/);
    my @panel_names    = $home_tree->findvalues('/html/body//ul[@class="panel_body"]/li/a');
    $home_tree->delete;

    print "\$href is $_\n" for @category_hrefs;
    print "\@href is @category_hrefs\n";
    print "\@type is @panel_names\n";

    # The panel lists hold more entries than there are categories; only the
    # first scalar(@category_hrefs) names are paired with the category links.
    # NOTE(review): this assumes the category panel is the first ul.panel_body
    # on the page and lists categories in link order -- confirm against the
    # live markup.
    my $category_count =
        @category_hrefs < @panel_names ? scalar @category_hrefs : scalar @panel_names;

    # ---- one iteration per category ----------------------------------------
    CATEGORY:
    for my $i (0 .. $category_count - 1) {
        my $category_href = $category_hrefs[$i];
        my $category_name = $panel_names[$i];
        print "$category_href===$category_name\n";

        # An empty name would make the directory collapse onto $base_dir.
        next CATEGORY unless $category_name =~ /\S/;

        my $category_dir = "$base_dir/" . safe_name($category_name);
        unless (-d os_path($category_dir) || mkdir(os_path($category_dir))) {
            warn "mkdir $category_dir failed: $!\n";
            next CATEGORY;
        }
        print "$category_dir\n";

        # First page of the category: only used to read the pager.
        print "\$href[\$i] is $category_href\n";
        sleep $throttle;
        my $first_html = fetch_html($ua, absolute_url($category_href));
        next CATEGORY unless defined $first_html;
        save_text("$category_dir/fh1.html", $first_html);

        my $first_tree  = parse_html($first_html);
        my @pager_texts = $first_tree->findvalues('/html/body//div[@id="papelist"]/span');
        $first_tree->delete;
        print "\@pageString is @pager_texts\n";

        # The page count is the first number after the last whitespace in the
        # pager text; a missing pager or a zero count means one page.
        my $page_count = 1;
        if (@pager_texts && $pager_texts[0] =~ /.*\s+.*?([0-9]+).*/) {
            $page_count = $1 || 1;
        }
        print "\$pageString is $page_count\n";

        # ---- one iteration per listing page --------------------------------
        for my $page (1 .. $page_count) {
            my $page_url = absolute_url("$category_href/$page");
            print "\$url is $page_url\n";

            sleep $throttle;
            my $page_html = fetch_html($ua, $page_url);
            next unless defined $page_html;
            save_text("$category_dir/fh2.html", $page_html);

            my $page_tree = parse_html($page_html);
            my @titles    = $page_tree->findvalues('/html/body//span[@class="link_title"]');
            my %seen;
            my @article_hrefs =
                grep { !$seen{$_}++ } matching_hrefs($page_tree, $article_re);
            $page_tree->delete;

            print "\@pageTitles is @titles\n";
            print "\@urlall is @article_hrefs\n";

            # The link list also contains articles from other parts of the
            # page (e.g. the reading-rank list), so only as many links as
            # there are titles are used, in document order.
            my $pair_count =
                @titles < @article_hrefs ? scalar @titles : scalar @article_hrefs;

            for my $n (0 .. $pair_count - 1) {
                (my $title = $titles[$n]) =~ s/\s+//g;
                my $article_href = $article_hrefs[$n];
                my ($article_id) = $article_href =~ $article_re;

                # Fall back to the numeric article id when the title is empty.
                my $file_stem = length($title) ? safe_name($title) : $article_id;

                print "===========================\n";
                print "$title======$article_href\n";
                print "===========================\n";

                my $article_html = fetch_html($ua, absolute_url($article_href));
                next unless defined $article_html;

                # One unwritable article (e.g. over-long name) must not stop
                # the whole crawl.
                eval { save_text("$category_dir/$file_stem.html", $article_html); 1 }
                    or warn "skipping $article_href: $@";
            }
        }
    }
    


    
       
    
    
  • 相关阅读:
    linux 安装mysql及配置
    django restframework的应用
    python uuid的连接及简单应用
    Flink开发-Flink的计算模型和接口
    数据仓库-基本框架和内容
    数据仓库-需求沟通和开发示例
    Spark开发-开发总览
    Hive 高阶应用开发示例(二)
    Hive 高阶应用开发示例(一)
    Spark开发-关联分析
  • 原文地址:https://www.cnblogs.com/zhaoyangjian724/p/6200382.html
Copyright © 2011-2022 走看看