zoukankan      html  css  js  c++  java
  • perl 爬取上市公司业绩预告

    <pre name="code" class="perl">use  LWP::UserAgent;
    use utf8;
    use DBI;  
    use POSIX;
    use Data::Dumper;
    use HTML::TreeBuilder;
      use HTML::TreeBuilder::XPath;
    use strict;
    use warnings;
    use File::Copy qw(copy);
    use List::Util qw(max);

    # Crawl the 10jqka "yjyg" (earnings-forecast) listing for a fixed set of
    # report dates.  For every date the first page is fetched to discover how
    # many result pages exist; then every page is fetched, each <tr> is turned
    # into a '|'-separated record of its first eight cells, and the records are
    # appended to "<date>-<page>.txt".
    #
    # Side effects: writes data.html / ths.html (snapshot of the last page
    # fetched) and the per-page .txt files in the current directory, and prints
    # progress information to STDOUT.

    # Cell text is handled as decoded characters, so encode it on output.
    binmode STDOUT, ':encoding(UTF-8)';

    # Build the AJAX listing URL for one report date and one page number.
    sub _page_url {
        my ($report_date, $page) = @_;
        return "http://data.10jqka.com.cn/financial/yjyg/date/$report_date"
             . "/board/ALL/field/enddate/order/desc/page/$page/ajax/1/";
    }

    # Fetch $url, keep a snapshot of the response in data.html / ths.html and
    # return the parsed HTML::TreeBuilder::XPath tree.
    #
    # Returns nothing (after a warning) when the request fails or the body
    # cannot be decoded, so callers never re-parse a stale snapshot left over
    # from an earlier page.  Dies when the snapshot files cannot be written.
    # The caller is responsible for calling ->delete on the returned tree.
    sub _fetch_tree {
        my ($ua, $url) = @_;

        my $response = $ua->get($url);
        unless ($response->is_success) {
            warn "GET $url failed: " . $response->status_line . "\n";
            return;
        }

        my $body = $response->decoded_content;
        unless (defined $body) {
            warn "GET $url: response body could not be decoded\n";
            return;
        }

        # The response is a fragment; wrap it so it forms a document.
        my $html = "<html>\n" . $body . "</html>\n";

        # 'or' (not '||') so that the check applies to open() itself, and an
        # explicit close so the snapshot is fully flushed before it is copied.
        open my $snapshot, '>:encoding(UTF-8)', 'data.html'
            or die "open data file failed:$!";
        print {$snapshot} $html;
        close $snapshot or die "close data file failed:$!";
        copy('data.html', 'ths.html') or die "copy data.html to ths.html failed:$!";

        # Parse the in-memory characters rather than the bytes on disk, so
        # that as_text() yields properly decoded text.
        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($html);
        $tree->eof;
        return $tree;
    }

    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->env_proxy;
    $ua->agent("Mozilla/8.0");

    # Report dates to crawl.
    # NOTE(review): 2016-09-30, 2016-06-30 and 2014-06-30 are absent from this
    # list -- confirm whether that is intentional.
    my @array = (
        '2016-12-31', '2016-03-31', '2015-12-31', '2015-09-30', '2015-06-30',
        '2015-03-31', '2014-12-31', '2014-09-30', '2014-03-31',
    );

    foreach my $report_date (@array) {
        print "\$_ is $report_date\n";
        my $url = _page_url($report_date, 1);
        print "\$url is $url\n";

        # Page 1 is fetched first only to read the pagination links.
        my $first_page = _fetch_tree($ua, $url) or next;

        my $title = $report_date;
        print "\$title is $title\n";

        my @pages = $first_page->find_by_tag_name('a');
        print "\@pages is @pages\n";

        # Pagination anchors carry the page number in a "page" attribute; the
        # largest one is the page count.  Default to a single page.
        my @page_numbers = grep { defined($_) && /\A\d+\z/ }
                           map  { $_->attr('page') } @pages;
        my $max = max(1, @page_numbers);
        print "\$max is $max\n";

        $first_page->delete;    # release the parsed tree
        sleep(5);

        for my $m (1 .. $max) {
            my $tree = _fetch_tree($ua, _page_url($report_date, $m)) or next;

            my @records;
            for my $row ($tree->find_by_tag_name('tr')) {
                # Only element children are cells; skip rows without any.
                my @cells = grep { ref $_ } $row->content_list;
                next unless @cells;

                # First eight cells; short rows are padded with '' instead of
                # dying on an undefined cell.
                my @fields = map { defined $cells[$_] ? $cells[$_]->as_text : '' } 0 .. 7;
                print @fields, "\n";
                push @records, join('|', @fields);
            }
            $tree->delete;
            next unless @records;

            # One open per page (append mode, as before) instead of per row.
            my $out_file = "$title-$m.txt";
            open my $out, '>>:encoding(UTF-8)', $out_file
                or die "open $out_file failed:$!";
            print {$out} "$_\n" for @records;
            close $out or die "close $out_file failed:$!";
        }
    }


    
                                        
    
  • 相关阅读:
    PAT:循环-12. 打印九九口诀表(15) AC
    PAT:循环-07. 爬动的蠕虫(15) 错两个
    PAT:循环-01. 求整数段和(15) AC
    PAT:分支-16. 计算分段函数(10) AC
    PAT:分支-10. 计算个人所得税(10) AC
    PAT:分支-08. 高速公路超速处罚(15) AC
    UIToolBar
    iOS 代码实现获得应用的版本号(Version/Build)
    IOS开发之 ---- iOS8中提示框的使用UIAlertController(UIAlertView和UIActionSheet二合一)
    IOS开发中的CGFloat、CGPoint、CGSize和CGRect
  • 原文地址:https://www.cnblogs.com/hzcya1995/p/13350887.html
Copyright © 2011-2022 走看看