zoukankan      html  css  js  c++  java
  • windows版爬取csdn

    use  LWP::UserAgent;
    use POSIX;
    use HTML::TreeBuilder::XPath; 
    use Encode; 
    use HTML::TreeBuilder;
    
    use strict;
    use warnings;

    # ----------------------------------------------------------------------
    # Mirror every article of one CSDN blog to disk (Windows edition).
    #
    # Steps:
    #   1. Fetch the blog home page and save it as csdn.html (current dir).
    #   2. Collect the category links (<a href> containing "category") and
    #      the category names (ul.panel_body > li > a).
    #   3. For each category create "$base_dir/<category name>", fetch the
    #      category index (saved as fh1.html) and read the page count.
    #   4. For each listing page (saved as fh2.html) collect the article
    #      titles and article links, then download each article to
    #      "<title>.html" inside the category directory.
    #
    # Console output and file names are GBK-encoded, which is what a
    # Chinese-locale Windows console and file system expect.
    # Any failure to create a directory or write a file is fatal; a failed
    # HTTP request for a category, page or article is reported and skipped.
    # ----------------------------------------------------------------------

    my $site        = 'http://blog.csdn.net';
    my $blog_user   = 'zhaoyangjian724';
    # Single quotes on purpose: inside double quotes "\p" is treated as an
    # escape sequence and the backslash is silently dropped.
    my $base_dir    = 'F:\pa';
    # Pauses (seconds) between steps, kept from the original script.
    my $short_pause = 5;
    my $long_pause  = 10;

    # Write $octets to $path byte-for-byte; dies if any step fails.
    # The handle is closed before returning so that a later parse_file()
    # on the same path sees the complete content.
    sub _save_octets {
        my ($path, $octets) = @_;
        open my $fh, '>:raw', $path or die "open $path failed: $!";
        print {$fh} $octets         or die "write $path failed: $!";
        close $fh                   or die "close $path failed: $!";
        return;
    }

    # GET $url, store the raw body in $path and return the parsed tree.
    # Returns nothing (after a warning) when the request fails.
    # The caller owns the tree and must call ->delete on it.
    sub _fetch_tree {
        my ($agent, $url, $path) = @_;
        my $response = $agent->get($url);
        unless ($response->is_success) {
            warn "GET $url failed: ", $response->status_line, "\n";
            return;
        }
        _save_octets($path, $response->content);
        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse_file($path);
        return $tree;
    }

    # Turn a decoded (character) string into a GBK file-system name.
    # Characters that Windows forbids in names are replaced with "_"
    # BEFORE encoding: doing it on the GBK bytes would corrupt multi-byte
    # characters whose trail byte happens to be "\" or "|".
    # Characters GBK cannot represent also become "_" (the default "?"
    # substitute is itself illegal in a Windows file name).
    sub _fs_name {
        my ($text) = @_;
        $text =~ s/\A\s+|\s+\z//g;
        $text =~ s{[\\/:*?"<>|\x00-\x1f]}{_}g;
        return encode('gbk', $text, sub { '_' });
    }

    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->env_proxy;
    $ua->agent("Mozilla/8.0");

    # --- 1. blog home page -------------------------------------------------
    my $home = _fetch_tree($ua, "$site/$blog_user", 'csdn.html');
    die "cannot fetch $site/$blog_user\n" unless defined $home;

    # --- 2. categories -----------------------------------------------------
    # Category URLs: every <a> whose href mentions "category", e.g.
    #   <a href="/zhaoyangjian724/article/category/1756569" ...>
    my @href;
    for my $anchor ($home->find_by_tag_name('a')) {
        my $href = $anchor->attr('href');
        next unless defined $href && $href =~ /category/;
        print "\$href is $href\n";
        push @href, $href;
    }
    print "\@href is @href\n";

    # Category names: text of ul.panel_body > li > a.  parse_file() hands
    # back undecoded UTF-8 octets, so decode once and keep the character
    # form for building directory names; @type is the GBK form for output.
    my @type_text = map { decode('utf8', "$_") }
        $home->findvalues('/html/body//ul[@class="panel_body"]/li/a');
    my @type = map { encode('gbk', $_) } @type_text;
    print "\@type is @type\n";
    $home->delete;

    # URLs and names are paired by position; only walk the indexes that
    # have both, otherwise a missing href would request the bare site root.
    my $pairs = @href < @type ? scalar @href : scalar @type;
    for my $i (0 .. $pairs - 1) {
        print "$href[$i]===$type[$i]\n";
    }

    # --- 3. one directory per category ---------------------------------------
    CATEGORY:
    for my $i (0 .. $pairs - 1) {
        print "\$type is $type[$i]\n";

        my $dir_name = _fs_name($type_text[$i]);
        if ($dir_name eq '') {
            warn "category $href[$i] has no usable name, skipped\n";
            next CATEGORY;
        }
        my $category_dir = "$base_dir/$dir_name";
        for my $dir ($base_dir, $category_dir) {
            next if -d $dir;
            mkdir $dir or die "mkdir $dir failed: $!";
        }

        # First page of the category (kept on disk as fh1.html).
        my $category_url = "$site$href[$i]";
        print "\$href[$i] is $href[$i]\n";
        my $index = _fetch_tree($ua, $category_url, "$category_dir/fh1.html");
        next CATEGORY unless defined $index;

        # The pager text looks like "150条数据 共8页"; the number after the
        # last run of whitespace is the page count.  No pager means 1 page.
        my @pageString =
            $index->findvalues('/html/body//div[@id="papelist"]/span');
        $index->delete;
        my $page_count = 1;
        if (@pageString && $pageString[0] =~ /.*\s+.*?(\d+).*/ && $1) {
            $page_count = $1;
        }
        print "\@pageString is @pageString\n";
        sleep $short_pause;
        print "\$pageString is $page_count\n";
        sleep $short_pause;

        # --- 4. every listing page of the category ---------------------------
        PAGE:
        for my $page (1 .. $page_count) {
            my $url = "$category_url/$page";
            print "\$url is $url\n";
            my $listing = _fetch_tree($ua, $url, "$category_dir/fh2.html");
            next PAGE unless defined $listing;

            # Article titles shown on this page.
            my @title_text = map { decode('utf8', "$_") }
                $listing->findvalues('/html/body//span[@class="link_title"]');
            my @pageTitles = map { encode('gbk', $_) } @title_text;
            print "\$pageTitles[0] is  $pageTitles[0]\n" if @pageTitles;
            print "\@pageTitles is @pageTitles\n";
            sleep $long_pause;

            # Article links, de-duplicated in document order.  The page also
            # links the "most read" articles, which come after the listing.
            my (@urlall, %seen);
            for my $anchor ($listing->find_by_tag_name('a')) {
                my $link = $anchor->attr('href');
                next unless defined $link
                    && $link =~ m{\Q$blog_user\E/article/details/\d+$};
                next if $seen{$link}++;
                print "\$_=========$link\n";
                push @urlall, $link;
            }
            $listing->delete;
            print "\@urlall is @urlall\n";
            sleep $long_pause;

            # Keep only as many links as there are titles: that drops the
            # trailing "most read" links, and never indexes past either list.
            my $count = @urlall < @pageTitles
                ? scalar @urlall
                : scalar @pageTitles;
            for my $k (0 .. $count - 1) {
                print "\$urlall[$k] is $urlall[$k]\n";
            }
            print "\$---urlall[0] is  $urlall[0]\n" if $count;
            sleep $long_pause;

            # Download each article into "<title>.html".
            ARTICLE:
            for my $m (0 .. $count - 1) {
                (my $title = $title_text[$m]) =~ s/\s+//g;
                my $stem = _fs_name($title);
                if ($stem eq '') {
                    # No usable title: fall back to the numeric article id.
                    ($stem) = $urlall[$m] =~ /(\d+)$/;
                }

                print "===========================\n";
                print encode('gbk', $title), "======$urlall[$m]\n";
                print "===========================\n";

                my $article_url = "$site$urlall[$m]";
                my $response    = $ua->get($article_url);
                print "--------------------------------\n";
                print "$urlall[$m]\n";
                unless ($response->is_success) {
                    warn "GET $article_url failed: ",
                        $response->status_line, "\n";
                    next ARTICLE;
                }
                _save_octets("$category_dir/$stem.html", $response->content);
            }
        }
    }

  • 相关阅读:
    日志记录的作用和方法 java
    log4j自动加载原理
    java反射中的动态代理机制(有实例)
    万能的ctrl+shift+F(Element 'beans' cannot have character [children], because the type's content type is element-only.错误)
    Spring的@ModelAttribute注解
    Spring MVC 的@RequestParam注解和request.getParameter("XXX")
    SpringMVC 示例实战教程
    ecshop教程:重置后台密码MD5+salt
    github常见操作和常见错误!错误提示:fatal: remote origin already exists.
    ECShop函数列表大全
  • 原文地址:https://www.cnblogs.com/zhaoyangjian724/p/6199029.html
Copyright © 2011-2022 走看看