I found this code online. It looked quite good, so I decided to pull it apart and analyze it.
Reading other people's code is another way to learn. Looking at plenty of it while you study deepens your own understanding.
That is especially true for modules you have never used yourself: a working example shows you how they are meant to be used.
Of course, you can also dig through the official documentation on your own, but for me the fastest route is reading code examples that other people have written.
Maybe that is a little different for everyone.
#!/usr/bin/perl
# siteindexingbot.pl

use warnings;
use strict;

use LWP::Simple;
use LWP::RobotUA;
use WWW::RobotRules;
use HTML::Parse;
use HTML::HeadParser;
use URI::URL;

my ($response, $tree, $link, %scanned);

# the arrays and hashes used to store page data
my (@pages, %titles, %keywords);

my $url        = $ARGV[0] or die "Usage: siteindexingbot [url]\n";
my $base_url   = &globalize_url('/', $url);
my $robots_txt = $base_url . 'robots.txt';    # $base_url already ends in '/'

my $robot_rules = WWW::RobotRules->new(
    "indexifier/1.0 (libwww-perl-$LWP::VERSION)"
);

# look for and parse the robots.txt file
if (head($robots_txt)) {
    print "robots.txt file found OK.\n";
    $robot_rules->parse($robots_txt, get($robots_txt));
}
else {
    print "robots.txt file not found.\n";
}

# build the robot user agent; LWP::RobotUA takes (agent name, e-mail, rules)
my $ua = LWP::RobotUA->new(
    "indexifier/1.0 (libwww-perl-$LWP::VERSION)",
    'me@here.com',
    $robot_rules
);
#$ua->proxy('http' => 'http://proxy.mylan.com/');
#$ua->delay(1/6);             # optional: shorten the default 1-minute wait between requests
$ua->timeout(30);
$ua->max_size(1024 * 100);    # never download more than 100 KB per page
$ua->parse_head(1);           # pull <meta> tags into the response headers

&scan($base_url);

# dump what we collected: one tab-separated line per indexed page
open(FILE, ">indexed.txt") or die "Opening indexed.txt: $!";
foreach my $page (@pages) {
    print FILE join("\t", ($page, $titles{$page}, $keywords{$page})), "\n";
}
close(FILE);

exit;

# recursively walk every local, robots-approved HTML link
sub scan {
    my $url = shift;
    print "Scanning '$url':\n";

    if ($scanned{$url}) {
        return;    # already visited this page
    }
    else {
        &get_info($url);    # this is the extra subroutine
        $scanned{$url} = 'TRUE';
        my @links = &get_links($url);

        foreach $link (@links) {
            if ($robot_rules->allowed($link)) {
                if ($link =~ /^$base_url/i) {
                    my $request      = HTTP::Request->new('HEAD' => $link);
                    my $response     = $ua->request($request);
                    my $content_type = $response->header('Content-type');

                    if ($response->is_error) {
                        print "Dead link to $link found on $url\n";
                    }
                    else {
                        print "$url links to $link\n";
                        # match "text/html; charset=..." as well as plain "text/html"
                        if ($content_type =~ m{^text/html}i) {
                            &scan($link);
                        }
                        else {
                            print "$link is not HTML\n";
                        }
                    }
                }
                else {
                    print "$link is not local to $base_url\n";
                }
            }
            else {
                print "Access to $link is not allowed by robots.txt\n";
            }
        }
    }
    return;
}

# turn a (possibly relative) link into an absolute URL, minus any #fragment
sub globalize_url {
    my ($link, $referring_url) = @_;
    my $url_obj      = URI::URL->new($link, $referring_url);
    my $absolute_url = $url_obj->abs->as_string;
    $absolute_url =~ s/^(.+?)#(.+?)$/$1/ig;
    return $absolute_url;
}

# fetch a page and return every <a>, <frame> and <iframe> target as an absolute URL
sub get_links {
    my $url     = shift;
    my $request = HTTP::Request->new('GET' => $url);
    $request->header('Accept' => 'text/html');
    my $response = $ua->request($request);

    my $tree      = HTML::Parse::parse_html($response->content);
    my $links_ref = $tree->extract_links('a', 'frame', 'iframe');

    my @links;
    foreach $link (@$links_ref) {
        push(@links, &globalize_url(${$link}[0], $url));
    }
    return @links;
}

# record the page's URL, <title> and meta description for the index
sub get_info {
    my $url     = shift;
    my $request = HTTP::Request->new('GET' => $url);
    $request->header('Accept' => 'text/html');
    my $response = $ua->request($request);
    my $html     = $response->content;

    my ($title, $keywords);
    my $parser = HTML::HeadParser->new;
    $parser->parse($html);

    $title    = $parser->header('title') || 'Untitled Document';
    # parse_head() exposes <meta name="description"> as this response header
    $keywords = $response->header('X-Meta-description') || 'none';

    push(@pages, $url);
    $titles{$url}   = $title;
    $keywords{$url} = $keywords;
    return;
}
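As a quick way to check the result, here is a minimal sketch of my own (read_index.pl is a hypothetical helper, not part of the original script) that assumes the crawler above has already been run in the current directory. It just reads the tab-separated indexed.txt back in and prints each entry:

#!/usr/bin/perl
# read_index.pl - hypothetical helper for inspecting the crawler's output
use warnings;
use strict;

# indexed.txt holds one "URL <tab> title <tab> keywords" line per page
open(my $fh, '<', 'indexed.txt') or die "Opening indexed.txt: $!";
while (my $line = <$fh>) {
    chomp $line;
    my ($url, $title, $keywords) = split /\t/, $line, 3;
    print "$title\n    $url\n    keywords/description: $keywords\n";
}
close($fh);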