zoukankan      html  css  js  c++  java
  • perl 爬取同花顺数据

    use  LWP::UserAgent;
    use utf8;
    use DBI;  
    # Open the MySQL connection and switch the session to UTF-8.
    # NOTE(review): host and credentials are hard-coded here; consider moving
    # them to a config file or environment variables.
    my $user   = "root";
    my $passwd = 'xxx';

    # $DBI::errstr holds the driver's error text when connect() returns false.
    # (The previous `DBI-errstr` was parsed as bareword arithmetic and never
    # showed the real failure reason.)
    my $dbh = DBI->connect(
        "dbi:mysql:database=zjzc_vote;host=14.5.5.57;port=3306",
        $user,
        $passwd,
    ) or die "can't connect to  database " . $DBI::errstr;

    $dbh->do("SET NAMES utf8");
    use POSIX;
    use Data::Dumper;
    use HTML::TreeBuilder;
    use HTML::TreeBuilder::XPath;

    # Download the listing page, save a copy to data.html, and parse that copy
    # into $tree, which the rest of the script queries.
    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->env_proxy;
    $ua->agent("Mozilla/8.0");

    my $response = $ua->get('http://data.10jqka.com.cn/financial/yjyg/');

    # Stop here on an HTTP failure; otherwise $tree would stay undefined and the
    # script would die later with an unrelated "can't call method" error.
    die "fetch failed: " . $response->status_line
        unless $response->is_success;

    # Low-precedence `or` so the check applies to open() itself; with `||` it
    # bound to the file name string and the failure branch could never run.
    open(my $data_fh, '>', 'data.html') or die "open data file failed:$!";

    # decoded_content returns characters, so give the handle an explicit
    # encoding instead of relying on Perl's implicit wide-character fallback.
    binmode($data_fh, ':encoding(UTF-8)');
    print {$data_fh} $response->decoded_content;

    # Close (and thereby flush) before re-reading the file; parsing while the
    # write buffer is still pending can see a truncated or empty document.
    close($data_fh) or die "close data file failed:$!";

    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse_file("data.html") or die "parse data file failed:$!";
    
    # Text of the span(s) with class "text-value" under <body>; it is echoed
    # here and used further down as the prefix of the output file names.
    my $title = $tree->findvalue('/html/body//span[@class="text-value"]');

    # Escape the sigil in the label so the line reads "$title is <value>"
    # rather than printing the value twice, and end with a plain newline.
    print "\$title is $title\n";
    
    # Determine the highest page number advertised by the pager: scan every
    # <a> element and keep the largest numeric value of its "page" attribute.
    # Start at 1 so a listing with no pager links still counts as one page
    # (previously $max stayed undefined and the export loop never ran).
    my $max = 1;
    for my $link ($tree->find_by_tag_name('a')) {
        my $page = $link->attr('page');

        # Ignore links without a "page" attribute or with a non-numeric value.
        next unless defined $page && $page =~ /\A\d+\z/;

        $max = $page if $page > $max;
    }

    # Escape the sigil in the label so the line reads "$max is <value>".
    print "\$max is $max\n";
    
    # Write the first eight cells of every table row to "<title>-<page>.txt",
    # one pipe-delimited record per line, echoing each record to STDOUT.
    #
    # NOTE(review): only the initially downloaded document in $tree is read, so
    # every page file receives the same rows. Pages 2..$max are never fetched;
    # the pagination request is not visible in this script and still needs to
    # be added.
    my @rows = $tree->find_by_tag_name("tr");
    shift @rows;    # discard the first <tr> (presumably the header row)

    for my $m (1 .. ($max || 0)) {
        my $out_file = "$title-$m.txt";

        # Open once per page, in append mode as before, and check the result.
        open(my $out, ">>", $out_file) or die "open $out_file failed: $!";

        for my $row (@rows) {
            my @cells = $row->content_list;

            # Rows with fewer than eight cells are not data rows; skipping them
            # avoids calling a method on an undefined value.
            next if @cells < 8;

            # content_list may return plain strings for text children as well
            # as element objects, so only call as_text on real elements.
            my @fields = map { ref $_ ? $_->as_text : $_ } @cells[0 .. 7];

            print @fields, "\n";
            print {$out} join("|", @fields), "\n";
        }

        close($out) or die "close $out_file failed: $!";
    }

  • 相关阅读:
    HDFS架构原理
    Hadoop集群搭建
    解决8080端口号占用问题
    基于SSM的Maven项目(Redis和Mysql)配置文件整合
    maven 集成SSM项目配置文件模版
    初识Spring笔记
    初识Mybatis一些总结
    将对数据库的增删改查封装为方法
    10分钟安装Elasticsearch
    ThreadLocal详解
  • 原文地址:https://www.cnblogs.com/zhaoyangjian724/p/6200219.html
Copyright © 2011-2022 走看看