  • Perl keyword search bot

    I found this code online. It looked pretty good, so I decided to grab it and work through it.

    Reading other people's code is another way to learn. Looking at plenty of other people's code while you study improves your own understanding.

    That is especially true for modules you have never used yourself: examples like this show you how they are actually used.

    Of course, you can also work through the official documentation on your own, but for me the fastest way is to read code examples that other people have written.

    Maybe it's a little different for everyone.

    #!/usr/bin/perl
     # siteindexingbot.pl
     use warnings;
     use strict;
    
     use LWP::Simple;
     use LWP::RobotUA;
     use WWW::RobotRules;
     use HTML::Parse;
     use HTML::HeadParser;
     use URI::URL;
    
     my ($response, $tree, $link, %scanned);
    
     # the arrays and hashes used to store page data
     my (@pages, %titles, %keywords);
    
     my $url = $ARGV[0] or die "Usage: siteindexingbot [url]\n";
     my $base_url = &globalize_url('/', $url);
     my $robots_txt = $base_url . 'robots.txt';   # $base_url already ends in '/'
    
     # the rules object answers allowed() queries for this user-agent string
     my $robot_rules = WWW::RobotRules->new(
        "indexifier/1.0 (libwww-perl-$LWP::VERSION)"
     );
    
     # look for and parse the robots.txt file
     if (head($robots_txt)) {
        print "robots.txt file found OK.\n";
        $robot_rules->parse($robots_txt, get($robots_txt));
     } else {
        print "robots.txt file not found.\n";
     }
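     # with no robots.txt parsed, $robot_rules->allowed() permits every URL,
     # so the crawl simply proceeds unrestricted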
    
     # build the robot user agent; LWP::RobotUA obeys the robots.txt rules it is given
     my $ua = LWP::RobotUA->new(
        agent => "indexifier/1.0 (libwww-perl-$LWP::VERSION)",
        from  => 'me@here.com',
        rules => $robot_rules,
     );
    
     #$ua->proxy('http' => 'http://proxy.mylan.com/' );
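     # a 30-second timeout, a 100 KB cap on responses, and parse_head() so LWP
     # runs HTML::HeadParser on every response it fetches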
     $ua->delay(1/60);           # LWP::RobotUA's request delay is in minutes; the default is a full minute
     $ua->timeout(30);
     $ua->max_size(1024 * 100);
     $ua->parse_head(1);
    
     &scan($base_url);
    
     # write the results as tab-separated lines: URL, title, keywords
     open (my $out, '>', 'indexed.txt') or die "Opening indexed.txt: $!";
     foreach my $page (@pages) {
        print $out join("\t", $page, $titles{$page}, $keywords{$page}), "\n";
     }
     close ($out);
    
     exit;
    
     sub scan {
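        # index this page, then recurse into every link that robots.txt allows,
        # stays on the start site, and serves HTML; %scanned prevents revisits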
        my $url = shift;
        print "Scanning '$url':\n";
        if ($scanned{$url}) {
           return;
        } else {
           &get_info($url);   # this is the extra subroutine
           $scanned{$url} = 'TRUE';
           my @links = &get_links($url);
           foreach $link(@links) {
              if ($robot_rules->allowed($link)) {
                 if ($link =~ /^\Q$base_url\E/i) {   # \Q..\E so dots in the URL match literally
                    my $request = HTTP::Request->new ('HEAD' => $link);
                    my $response = $ua->request($request);
                    my $content_type = $response->content_type;   # media type only, charset parameters stripped
                    if ($response->is_error) {
                       print "Dead link to $link found on $url\n";
                    } else {
                       print "$url links to $link\n";
                       if ($content_type eq 'text/html') {
                          &scan($link);
                       } else {
                          print "$link is not HTML\n";
                       }
                    }
                 } else {
                    print "$link is not local to $base_url\n";
                 }
              } else {
                 print "Access to $link is not allowed by robots.txt\n";
              }
           }
        }
     return;
     }
    
     sub globalize_url {
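        # turn a (possibly relative) link into an absolute URL using the page it
        # was found on as the base, and strip any #fragment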
        my ($link, $referring_url) = @_;
        my $url_obj = new URI::URL($link, $referring_url);
        my $absolute_url = $url_obj->abs->as_string;
        $absolute_url =~ s/#.*$//;   # drop any fragment
        return $absolute_url;
     }
    
     sub get_links {
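        # fetch a page and return absolute URLs for every link found in its
        # <a>, <frame> and <iframe> tags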
        my $url = shift;
        my $request = HTTP::Request->new ('GET' => $url);
        $request->header('Accept' => 'text/html');
        my $response = $ua->request($request);
        my $tree = HTML::Parse::parse_html($response->content);
        my $links_ref = $tree->extract_links('a', 'frame', 'iframe');
        my @links;
        foreach $link(sort @$links_ref) {
           push(@links, &globalize_url(${$link}[0], $url));
        }
     return @links;
     }
    
     sub get_info {
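        # fetch a page and record its URL, <title> and meta keywords in
        # @pages, %titles and %keywords for the final report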
        my $url = shift;
        my $request = HTTP::Request->new('GET' => $url);
        $request->header('Accept' => 'text/html');
        my $response = $ua->request($request);
        my $html = $response->content;
        my ($title, $keywords, $type);
        my $parser = HTML::HeadParser->new;
        $parser->parse($html);
        $title = $parser->header('title') || 'Untitled Document';
        # HTML::HeadParser exposes <meta name="keywords"> as the X-Meta-Keywords header
        $keywords = $parser->header('X-Meta-Keywords') || 'none';
        push (@pages, $url);
        $titles{$url} = $title;
        $keywords{$url} = $keywords;
        return;
     }
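
    To try it out, run the script with a start URL, for example: perl siteindexingbot.pl http://www.example.com/ (the address is only a placeholder). It writes indexed.txt with one tab-separated line per page: URL, title, keywords. The little companion script below is not part of the original code, just a minimal sketch that assumes that indexed.txt format and greps it for a keyword:

     #!/usr/bin/perl
     # readindex.pl - look up a keyword in the indexed.txt written by the bot above
     use strict;
     use warnings;

     my $term = shift @ARGV or die "Usage: readindex.pl [keyword]\n";

     open (my $in, '<', 'indexed.txt') or die "Opening indexed.txt: $!";
     while (my $line = <$in>) {
        chomp $line;
        # each line is: URL <tab> title <tab> keywords
        my ($url, $title, $keywords) = split /\t/, $line, 3;
        next unless defined $keywords;
        # report any page whose title or meta keywords mention the term
        if ("$title $keywords" =~ /\Q$term\E/i) {
           print "$title\n   $url\n";
        }
     }
     close ($in);

    Keeping the crawl and the lookup in separate scripts means the slow network part only has to run once, and you can search the saved index as often as you like.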
    
  • Original post: https://www.cnblogs.com/xiaoCon/p/2934512.html