zoukankan      html  css  js  c++  java
  • Perl6 必应抓取(2):最终版

    use HTTP::UserAgent;
    use URI::Encode;
    
    # Shared state read by every sub in this script.
    #
    # HTTP::UserAgent's constructor argument is named `useragent` and expects a
    # single string. The former `:user-agent<...>` was an unrecognised named
    # argument holding a word list, so the Firefox header was never sent.
    my $ua = HTTP::UserAgent.new(
        :useragent('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0')
    );
    my $bing_url = 'http://cn.bing.com/search?q=';
    # Captures the text between one <cite> ... </cite> pair (the value to collect).
    my $choose = rx/'<cite>'(.*?)'</cite>'/;
    # Output file is named after the current timestamp, with every ':' replaced by '-'.
    my $filename = ~now.DateTime~'.txt';
    $filename = do given $filename {S:g/':'/-/};
    my $fp = open $filename, :w;
    # 1-based index of the next result. It starts at 1 so the first progress
    # line does not interpolate an undefined value.
    my $hv = 1;
    # Total number of results requested; shown in the progress output.
    my $all_data;
    
    # Entry point.
    #
    # Prompts for a search string, fetches $page_number Bing result pages
    # (10 results requested per page) through Bing_search, and reports timing.
    #
    # $page_number - number of result pages to fetch.
    sub MAIN (Int $page_number) {
        # Close the output handle however MAIN exits, so buffered lines are
        # flushed to disk even when a request throws part-way through.
        LEAVE $fp.close;

        my $rule = '=======================================================================';

        say '+';
        say $rule;
        say '                  By: FireC@t';
        say $rule;
        my $strings = prompt 'Input String You Want: ';
        say 'Search : '~$strings;
        $all_data = 10*$page_number;
        say 'Data count(10*'~$page_number~') : '~$all_data; # number of results requested
        my $start_time = now.DateTime;
        say 'Start Time : '~$start_time;
        say $rule;
        $strings = uri_encode($strings);
        my $count = 0;
        for 1..$page_number {
            # One request per result page; `first` is the result offset.
            my $url_end = '&first='~$count;
            my $targeturl = $bing_url~$strings~$url_end;
            Bing_search($targeturl);
            $count += 10;
        }
        my $end_time = now.DateTime;
        say $rule;
        say 'Finish Time : '~$end_time;
        say 'Time Use : '~($end_time-$start_time);
        say $rule;
        say 'Data save to : '~$filename;
        say $rule;
    }
    
    # Fetch one result page and record every <cite> entry found in it.
    #
    # $url - full search URL for a single result page.
    #
    # Each captured entry has its <strong> / </strong> tags removed, is echoed
    # with a progress prefix, and is appended to the output file.
    sub Bing_search($url) {
        my $html = $ua.get($url).content; # page body
        # Consume the page front to back: after each hit, continue with the
        # text that follows the match.
        while $html ~~ $choose {
            my $cite = ~$0;
            $html = $/.postmatch;
            $cite = $cite.subst('<strong>', '', :g).subst('</strong>', '', :g);
            say '('~$hv~':'~$all_data~')'~$cite;
            $hv++;
            $fp.say($cite);
        }
    }

    说明, 在dos下输入中文, 因为终端编码问题, 程序会报错。

    在 linux 下运行正常, 或在 dos 下将终端编码设置为 utf8。

    用法:

    > perl6 bing_s.p6 10
    
    这里的参数 10为页数, 可随意更改。

    BUG:

      如果bing中的结果只有 100 条, 而我们向他取 1000 条, 这时我们会取到相同的数据。

    修复:

      在运行前, 将 bing 返回的结果总数与用户请求的数目进行对比。 如果用户请求的数目超出 bing 现有的结果数, 则用 bing 的结果总数代替用户请求的数目。

    update: 2017/08/25

    修复后代码:

    use HTTP::UserAgent;
    use URI::Encode;
    
    =begin pod
    Queries the China edition of Bing (cn.bing.com).
    # by FireC@t
    # 2017/08/25
    =end pod

    # Shared state read by every sub in this script.
    #
    # HTTP::UserAgent's constructor argument is named `useragent` and expects a
    # single string. The former `:user-agent<...>` was an unrecognised named
    # argument holding a word list, so the Firefox header was never sent.
    my $ua = HTTP::UserAgent.new(
        :useragent('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0')
    );
    my $bing_url = 'http://cn.bing.com/search?q=';
    # Captures the text between one <cite> ... </cite> pair (the value to collect).
    my $choose = rx/'<cite>'(.*?)'</cite>'/;
    # Output file is named after the current timestamp, with every ':' replaced by '-'.
    my $filename = ~now.DateTime~'.txt';
    $filename = do given $filename {S:g/':'/-/};
    my $fp = open $filename, :w;
    # 1-based index of the next result; MAIN reports $hv-1 as the line count.
    my $hv=1;
    # Total number of results that will be fetched; shown in the progress output.
    my $all_data;
    
    # Entry point.
    #
    # Prompts for a search string, caps the requested page count at what
    # User_data_chang reports as available, fetches each page through
    # Bing_search, and prints a timing and line-count summary.
    #
    # $page_number - number of result pages the user asks for.
    sub MAIN (Int $page_number) {
        # Close the output handle however MAIN exits, so buffered lines are
        # flushed to disk even when a request throws part-way through.
        LEAVE $fp.close;

        my $rule = '=======================================================================';

        say '+';
        say $rule;
        say '                  By: FireC@t';
        say $rule;
        my $strings = prompt 'Input String You Want: ';
        say 'Search : '~$strings;
        say 'User Data  Get Page : '~$page_number; # number of pages requested
        my $start_time = now.DateTime;

        $strings = uri_encode($strings);

        # Must be set before User_data_chang runs: it reads $all_data as the
        # number of results the user asked for.
        $all_data = 10*$page_number;
        my $page_number_swap = User_data_chang($strings);
        if $page_number_swap < $page_number {
            say 'Not enough Data for bing('~$page_number_swap~' pages), page_number change: '~$page_number~' to '~$page_number_swap;
            # Progress output should show the reduced total.
            $all_data = 10*$page_number_swap;
        }

        my $count = 0;
        say 'Start Time : '~$start_time;
        say $rule;
        for 1..$page_number_swap {
            # One request per result page; `first` is the result offset.
            my $url_end = '&first='~$count;
            my $targeturl = $bing_url~$strings~$url_end;
            say '>> '~$targeturl~'';
            Bing_search($targeturl);
            $count += 10;
        }
        my $end_time = now.DateTime;
        say $rule;
        say 'Finish Time : '~$end_time;
        say 'Time Use : '~($end_time-$start_time);
        say $rule;
        # $hv is the index of the NEXT result, hence the -1.
        say 'Data('~($hv-1)~' lines) save to : '~$filename;
        say $rule;
    }
    
    # Fetch one result page and record every <cite> entry found in it.
    #
    # $url - full search URL for a single result page.
    #
    # Each captured entry has its <strong> / </strong> tags removed, is echoed
    # with a "(index:total)" progress prefix, and is appended to the output
    # file. A non-successful HTTP response is reported on STDERR and skipped
    # rather than being parsed as if it were a results page.
    sub Bing_search($url) {
        my $response = $ua.get($url);
        unless $response.is-success {
            note 'Request failed ('~$response.status-line~'): '~$url;
            return;
        }

        my $html = $response.content;
        # Consume the page front to back: after each hit, continue with the
        # text that follows the match.
        while $html ~~ $choose {
            my $cite = ~$0;
            $html = $/.postmatch;
            $cite = $cite.subst('<strong>', '', :g).subst('</strong>', '', :g);
            say '('~$hv~':'~$all_data~') '~$cite;
            $hv++; # count of recorded entries
            $fp.say($cite);
        }
    }
    
    # Work out how many result pages can actually be fetched.
    #
    # $strings - the URI-encoded search string.
    #
    # Reads the file-level $all_data (number of results the user asked for),
    # looks up the total result count Bing reports for the query, and returns
    # the page count for whichever of the two is smaller. Returns 0 when no
    # result count can be found on the page.
    sub User_data_chang($strings){
        # Match against the response body only, not the stringified response.
        my $html = $ua.get($bing_url~$strings).content;

        # Presumably the count element reads like `sb_count">123,456 results</span>`
        # (verify against the live page). Capture only the digit/comma run; the
        # previous pattern used a literal `s` as the terminator, which captured
        # "123,456 re" from such text and broke the numeric conversion.
        return 0 unless $html ~~ /'sb_count">' <-[\<]>*? (<[0..9]> <[0..9,]>*)/;

        # Strip thousands separators to get the total number of results.
        my $bing_all_data = (~$0).subst(',', '', :g).Int;

        # Never ask for more results than Bing has.
        return User_page(min($all_data, $bing_all_data));
    }
    
    
    # Convert a result count into a page count at 10 results per page.
    #
    # $data_number - number of results.
    #
    # Returns the number of pages as an Int, rounding any partial page up.
    # (The previous string-splitting version returned a Str when the count was
    # an exact multiple of 10 and an Int otherwise.)
    sub User_page($data_number) {
        return ceiling($data_number / 10);
    }
  • 相关阅读:
    跨域解决方法
    css之line-height
    untiy项目中使用MD5加密
    unity给子物体添加Shader
    unity中UI坐标转3d世界坐标
    unity项目字符串转为Vector3和Quaternion
    unity中使用Highlighting System v4.0插件给物体添加高亮
    加载AssetBundle方法
    Lua面向对象----类、继承、多继承、单例的实现
    Lua学习笔记(一)-----C#和lua的交互
  • 原文地址:https://www.cnblogs.com/perl6/p/7426189.html
Copyright © 2011-2022 走看看