zoukankan      html  css  js  c++  java
  • 基于Perl的网络爬虫


    use strict;
    use warnings;

    use Mojo::UserAgent;
    use Mojo::URL;
    use Mojo::IOLoop;
    use Bloom::Filter;
    use Smart::Comments;
    use DBI;

    # Depth-limited, single-site web crawler.
    #
    # Starting from the URL given as the first command-line argument (or a
    # built-in default), the script follows <a href> links that stay on the
    # start URL's domain, de-duplicates them with a Bloom filter, and stores
    # every newly seen link found at depth 3, 4 or 5 in a MySQL table.
    #
    # NOTE: Smart::Comments is a source filter that turns comments starting
    # with three or more '#' characters into debug output, so ordinary
    # comments in this file must start with a single '#'.

    # --- Database connection -------------------------------------------------
    my $dbname   = "bbs_url";
    my $location = "localhost";
    my $port     = "3306";
    my $database = "DBI:mysql:$dbname:$location:$port";
    my $db_user  = "root";
    my $db_pass  = "toor";

    # Errors are checked explicitly below, so DBI's automatic reporting is off.
    my $dbh = DBI->connect($database, $db_user, $db_pass,
        { PrintError => 0, RaiseError => 0 })
        or die "cannot connect to $database: $DBI::errstr";

    # --- Crawl configuration -------------------------------------------------
    # Pages deeper than this are not parsed for further links.
    my $dept_level = 4;
    my $baseUrl    = Mojo::URL->new($ARGV[0] || 'http://bbs.xxxxx.cn/');

    # Site domain = host of the start URL without a leading "www.". Taking it
    # from the parsed URL works for http and https alike and ignores any port.
    my $domain = lc($baseUrl->host // '');
    $domain =~ s/\Awww\.//;
    die "start URL '$baseUrl' has no host\n" if $domain eq '';

    # A link is on-site when its host is the domain itself or a subdomain.
    # \Q..\E keeps the dots in the domain literal; the anchors stop look-alike
    # hosts such as "<domain>.example.com" from matching.
    my $domain_re = qr/(?:\A|\.)\Q$domain\E\z/i;

    # Paths ending in one of these extensions are never queued.
    my $skip_ext_re = qr/\.(?:jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf)\z/i;

    my $filter = Bloom::Filter->new(capacity => 100000, error_rate => 0.0001);
    my $ua     = Mojo::UserAgent->new(max_redirects => 5);

    # --- Result table --------------------------------------------------------
    my $name  = "xxxxx";
    my $query = "CREATE TABLE $name("
        . " `No` int(100) NOT NULL auto_increment,"
        . " `depth` int(10) NOT NULL,"
        . " `Url` text NOT NULL, PRIMARY KEY (`No`) "
        . ") ENGINE=MyISAM DEFAULT CHARSET=utf8;";
    my $sth = $dbh->prepare($query)
        or die "create table student error: " . $dbh->errstr;
    $sth->execute() or die "create table student error: " . $sth->errstr();
    $sth->finish();

    # Prepared once and reused. The placeholders keep scraped URLs (which are
    # untrusted input and may contain quotes) out of the SQL text.
    my $insert_sth = $dbh->prepare("insert into $name(depth,Url) values(?,?)")
        or die "cannot prepare insert: " . $dbh->errstr;

    # --- Crawler -------------------------------------------------------------
    # Declared ahead of its definition so the page handler can schedule fetches.
    my $crawl;

    # Handle one finished transaction.
    #   $tx   - the Mojo transaction passed to the user-agent callback
    #   $dept - depth of the page that was fetched (the start page is 1)
    # Links found on the page get depth $dept + 1.
    my $callback = sub {
        my ($tx, $dept) = @_;

        # A true error value covers connection failures as well as 4xx/5xx
        # responses.
        return if $tx->error;
        return if $dept > $dept_level;

        my $link_dept = $dept + 1;

        # After redirects this is the URL the content actually came from, so
        # it is the correct base for resolving relative links.
        my $page_url = $tx->req->url;

        $tx->res->dom->find("a[href]")->each(sub {
            my $link = shift;

            # to_abs resolves relative paths, protocol-relative links and the
            # port against the page URL.
            my $newUrl = Mojo::URL->new($link->{href} // '')->to_abs($page_url);
            $newUrl->fragment(undef);

            # "return" (not "next"): this is a callback sub, not a loop body.
            my $scheme = lc($newUrl->scheme // '');
            return if $scheme ne 'http' && $scheme ne 'https';
            return if ($newUrl->host // '') !~ $domain_re;
            return if $newUrl->path =~ $skip_ext_re;

            my $url_str = $newUrl->to_string;
            return if $filter->check($url_str);

            # Progress output each time the number of known URLs is a multiple
            # of 1000.
            # NOTE(review): the trailing separator is a single space in the
            # source; presumably a newline was intended - confirm.
            if ($filter->key_count() % 1000 == 0) {
                print $filter->key_count(), " $link_dept ", $url_str, " ";
            }

            # Only links first seen at depth 3, 4 or 5 are stored.
            if ($link_dept >= 3 && $link_dept <= 5) {
                $insert_sth->execute($link_dept, $url_str)
                    or warn "insert failed for $url_str: "
                    . $insert_sth->errstr . "\n";
            }

            $filter->add($url_str);

            # A page beyond the depth limit would be discarded unparsed by the
            # handler, so it is not requested at all.
            $crawl->($newUrl, $link_dept) if $link_dept <= $dept_level;
        });

        return;
    };

    # Queue a non-blocking GET for $url. The depth travels in the closure
    # instead of a request header, so it is not sent to the remote server and
    # does not depend on headers being kept across redirects.
    $crawl = sub {
        my ($url, $dept) = @_;
        $ua->get($url => sub {
            my ($agent, $tx) = @_;
            $callback->($tx, $dept);
        });
        return;
    };

    $crawl->($baseUrl, 1);
    Mojo::IOLoop->start unless Mojo::IOLoop->is_running;

    # The event loop has stopped; release the database resources.
    $insert_sth->finish();
    $dbh->disconnect();
  • 相关阅读:
    Java实现 蓝桥杯VIP 算法训练 校门外的树
    Java实现 蓝桥杯VIP 算法训练 统计单词个数
    Java实现 蓝桥杯VIP 算法训练 统计单词个数
    Java实现 蓝桥杯VIP 算法训练 开心的金明
    Java实现 蓝桥杯VIP 算法训练 开心的金明
    Java实现 蓝桥杯 算法训练 纪念品分组
    Java实现 蓝桥杯 算法训练 纪念品分组
    Java实现 蓝桥杯VIP 算法训练 校门外的树
    Java实现 蓝桥杯VIP 算法训练 统计单词个数
    Java实现 蓝桥杯VIP 算法训练 开心的金明
  • 原文地址:https://www.cnblogs.com/huangxiaohen/p/3223923.html
Copyright © 2011-2022 走看看