PHP 抓取页面中的所有链接,筛选出需要的链接并下载其内容,保存到数据库
代码
<?PHP
// GET all links from URL
/**
 * array_walk() callback: replaces an element with its tag-free, trimmed text.
 *
 * @param string $item Element value; rewritten in place through the reference.
 * @param mixed  $key  Element key; required by the callback signature, unused.
 */
function remove_html_tags(&$item, $key)
{
    $plainText = strip_tags($item);
    $item = trim($plainText);
}
/**
 * Returns the part of a URL that follows the optional scheme and the host.
 *
 * The scheme may be "http://" or "https://" (case-insensitive) or absent; the
 * host is everything up to the first "/". The result is used as the request
 * target of an HTTP request line, so it is never empty: a URL with nothing
 * after the host yields "/".
 *
 * @param string $url Absolute or scheme-less URL.
 * @return string Path plus query string; the input itself when no host can be
 *                recognised (e.g. it already starts with "/").
 */
function get_path($url)
{
    $url = (string) $url;
    if (!preg_match("/^(https?:\/\/)?([^\/]+)/i", $url, $matches)) {
        // No host found: treat the whole input as the path.
        return $url === '' ? '/' : $url;
    }
    $path = substr($url, strlen($matches[0]));
    // substr() gives '' (false before PHP 8) when nothing follows the host.
    if ($path === false || $path === '') {
        return '/';
    }
    return $path;
}
/**
 * Extracts the host part of a URL.
 *
 * The scheme may be "http://" or "https://" (case-insensitive) or absent; the
 * host is everything after it up to the first "/". A port, if present, is
 * returned as part of the host.
 *
 * @param string $url Absolute or scheme-less URL.
 * @return string The host, or an empty string when none can be recognised.
 */
function get_host($url)
{
    if (preg_match("/^(https?:\/\/)?([^\/]+)/i", (string) $url, $matches)) {
        return $matches[2];
    }
    return '';
}
/**
 * Fetches a URL over a plain socket on port 80 and returns the raw response.
 *
 * Everything the server sends is returned unparsed: status line, headers and
 * body. On connection failure the socket error is echoed and an empty string
 * is returned.
 *
 * @param string $url URL to fetch; host and path are derived with get_host()
 *                    and get_path().
 * @return string Raw HTTP response, or '' when nothing could be read.
 */
function get_all_html($url)
{
    $htmlString = '';
    $host = get_host($url);
    $path = get_path($url);
    if ($path === false || $path === '') {
        // An empty request target would make the request line invalid.
        $path = '/';
    }

    $fp = fsockopen($host, 80, $errno, $errstr, 30);
    if (! $fp) {
        echo "$errstr ($errno)<br />\n";
        return $htmlString;
    }

    // HTTP/1.0 on purpose: this reader does not decode chunked transfer
    // coding, and a server must not send it in reply to a 1.0 request.
    $out = "GET $path HTTP/1.0\r\n";
    $out .= "Host: $host\r\n";
    $out .= "Connection: Close\r\n\r\n";

    if (fwrite($fp, $out) !== false) {
        while (! feof($fp)) {
            $line = fgets($fp);
            if ($line === false) {
                // Read error or end of stream: stop instead of appending ''.
                break;
            }
            $htmlString .= $line;
        }
    }
    fclose($fp);

    return $htmlString;
}
/**
 * Collects the anchors found in the response fetched from $url.
 *
 * @param string $url Page to download with get_all_html().
 * @return array Map of captured href value => link text, where the text has
 *               had its tags stripped and surrounding whitespace trimmed.
 */
function get_links($url)
{
    // Group 1 captures the href value, group 2 the content of the anchor.
    $pattern = "/a[\s]+[^>]*?href[\s]?=[\s\"\']+(.*?)[\"\']+.*?>"
        . "([^<]+|.*?)?<\/a>/i";

    preg_match_all(trim($pattern), get_all_html($url), $found, PREG_PATTERN_ORDER);

    $hrefs = $found[1];
    $labels = $found[2];
    array_walk($labels, 'remove_html_tags');

    return array_combine($hrefs, $labels);
}
// Keyword filter used with array_filter(): keeps only the wanted links.
/**
 * Tells whether a link text contains the keyword of interest.
 *
 * @param string $var Text to inspect.
 * @return bool true when the keyword occurs anywhere in $var, false otherwise.
 */
function filter_string($var)
{
    // Only links whose text contains this keyword are wanted.
    return strpos($var, '中日研修') !== false;
}
// Save to the database.
/**
 * Inserts one row (title, content) into the save_info table of database "test".
 *
 * Uses a mysqli prepared statement so that quotes or SQL fragments in the
 * scraped text cannot break or alter the INSERT. The connection is opened and
 * closed inside the call. Terminates the script when the server cannot be
 * reached.
 *
 * @param string $title Value for the title column.
 * @param string $html  Value for the content column.
 * @return bool true when the row was inserted, false otherwise.
 */
function save_html($title, $html)
{
    // Report failures through return values (not exceptions) so the function
    // keeps returning false on a failed insert, whatever the PHP default is.
    mysqli_report(MYSQLI_REPORT_OFF);

    $link = mysqli_connect('localhost', 'root', '', '', 3306);
    if (! $link) {
        die(mysqli_connect_error());
    }
    mysqli_select_db($link, 'test');

    $result = false;
    $stmt = mysqli_prepare($link, 'insert into save_info(title, content) values (?, ?)');
    if ($stmt) {
        mysqli_stmt_bind_param($stmt, 'ss', $title, $html);
        $result = mysqli_stmt_execute($stmt);
        mysqli_stmt_close($stmt);
    }
    mysqli_close($link);

    return $result;
}
// Listing page whose links are harvested.
$url = "http://www.zggjlww.cn/share/?mods=news&action=list&id=15";
$arr = get_links($url);
// Keep only the links accepted by filter_string(); keys are hrefs, values are titles.
$newMatches = array_filter($arr, "filter_string");

// Pieces needed to turn an href into a full URL.
$origin = "http://" . get_host($url);
$listPath = get_path($url);
$queryAt = strpos($listPath, '?');
if ($queryAt !== false) {
    // Drop the query string of the listing page.
    $listPath = substr($listPath, 0, $queryAt);
}
if ($listPath === false || $listPath === '') {
    $listPath = '/';
}
$slashAt = strrpos($listPath, '/');
$listDir = ($slashAt === false) ? '/' : substr($listPath, 0, $slashAt + 1);

// Download every selected page and store it together with its title.
// foreach replaces the each() loop, which no longer exists in PHP 8.
$total = count($newMatches);
$count = 1;
foreach ($newMatches as $key => $val) {
    $key = (string) $key;
    if (preg_match('/^https?:\/\//i', $key)) {
        // Already an absolute URL: use it unchanged.
    } elseif (substr($key, 0, 1) === '/') {
        // Root-relative href.
        $key = $origin . $key;
    } elseif (substr($key, 0, 1) === '?') {
        // Query-only href: same path as the listing page.
        $key = $origin . $listPath . $key;
    } else {
        // Relative href: resolve against the listing page's directory.
        $key = $origin . $listDir . $key;
    }

    // Original author's note: the fetched pages were too large, so only a
    // slice is kept in order to store it in MySQL.
    $html = substr(htmlentities(get_all_html($key)), 2000, 10000);
    $saved = save_html($val, $html);

    $rest = $total - $count;
    $count++;
    // Report the real outcome instead of always claiming success.
    $status = $saved ? '保存成功' : '保存失败';
    echo "$val = > $key <br />$status... 还有 $rest 张页面要处理...<p></p>";
}
?>
// GET all links from URL
/**
 * array_walk() callback: replaces an element with its tag-free, trimmed text.
 *
 * @param string $item Element value; rewritten in place through the reference.
 * @param mixed  $key  Element key; required by the callback signature, unused.
 */
function remove_html_tags(&$item, $key)
{
    $plainText = strip_tags($item);
    $item = trim($plainText);
}
/**
 * Returns the part of a URL that follows the optional scheme and the host.
 *
 * The scheme may be "http://" or "https://" (case-insensitive) or absent; the
 * host is everything up to the first "/". The result is used as the request
 * target of an HTTP request line, so it is never empty: a URL with nothing
 * after the host yields "/".
 *
 * @param string $url Absolute or scheme-less URL.
 * @return string Path plus query string; the input itself when no host can be
 *                recognised (e.g. it already starts with "/").
 */
function get_path($url)
{
    $url = (string) $url;
    if (!preg_match("/^(https?:\/\/)?([^\/]+)/i", $url, $matches)) {
        // No host found: treat the whole input as the path.
        return $url === '' ? '/' : $url;
    }
    $path = substr($url, strlen($matches[0]));
    // substr() gives '' (false before PHP 8) when nothing follows the host.
    if ($path === false || $path === '') {
        return '/';
    }
    return $path;
}
/**
 * Extracts the host part of a URL.
 *
 * The scheme may be "http://" or "https://" (case-insensitive) or absent; the
 * host is everything after it up to the first "/". A port, if present, is
 * returned as part of the host.
 *
 * @param string $url Absolute or scheme-less URL.
 * @return string The host, or an empty string when none can be recognised.
 */
function get_host($url)
{
    if (preg_match("/^(https?:\/\/)?([^\/]+)/i", (string) $url, $matches)) {
        return $matches[2];
    }
    return '';
}
/**
 * Fetches a URL over a plain socket on port 80 and returns the raw response.
 *
 * Everything the server sends is returned unparsed: status line, headers and
 * body. On connection failure the socket error is echoed and an empty string
 * is returned.
 *
 * @param string $url URL to fetch; host and path are derived with get_host()
 *                    and get_path().
 * @return string Raw HTTP response, or '' when nothing could be read.
 */
function get_all_html($url)
{
    $htmlString = '';
    $host = get_host($url);
    $path = get_path($url);
    if ($path === false || $path === '') {
        // An empty request target would make the request line invalid.
        $path = '/';
    }

    $fp = fsockopen($host, 80, $errno, $errstr, 30);
    if (! $fp) {
        echo "$errstr ($errno)<br />\n";
        return $htmlString;
    }

    // HTTP/1.0 on purpose: this reader does not decode chunked transfer
    // coding, and a server must not send it in reply to a 1.0 request.
    $out = "GET $path HTTP/1.0\r\n";
    $out .= "Host: $host\r\n";
    $out .= "Connection: Close\r\n\r\n";

    if (fwrite($fp, $out) !== false) {
        while (! feof($fp)) {
            $line = fgets($fp);
            if ($line === false) {
                // Read error or end of stream: stop instead of appending ''.
                break;
            }
            $htmlString .= $line;
        }
    }
    fclose($fp);

    return $htmlString;
}
/**
 * Collects the anchors found in the response fetched from $url.
 *
 * @param string $url Page to download with get_all_html().
 * @return array Map of captured href value => link text, where the text has
 *               had its tags stripped and surrounding whitespace trimmed.
 */
function get_links($url)
{
    // Group 1 captures the href value, group 2 the content of the anchor.
    $pattern = "/a[\s]+[^>]*?href[\s]?=[\s\"\']+(.*?)[\"\']+.*?>"
        . "([^<]+|.*?)?<\/a>/i";

    preg_match_all(trim($pattern), get_all_html($url), $found, PREG_PATTERN_ORDER);

    $hrefs = $found[1];
    $labels = $found[2];
    array_walk($labels, 'remove_html_tags');

    return array_combine($hrefs, $labels);
}
// Keyword filter used with array_filter(): keeps only the wanted links.
/**
 * Tells whether a link text contains the keyword of interest.
 *
 * @param string $var Text to inspect.
 * @return bool true when the keyword occurs anywhere in $var, false otherwise.
 */
function filter_string($var)
{
    // Only links whose text contains this keyword are wanted.
    return strpos($var, '中日研修') !== false;
}
// Save to the database.
/**
 * Inserts one row (title, content) into the save_info table of database "test".
 *
 * Uses a mysqli prepared statement so that quotes or SQL fragments in the
 * scraped text cannot break or alter the INSERT. The connection is opened and
 * closed inside the call. Terminates the script when the server cannot be
 * reached.
 *
 * @param string $title Value for the title column.
 * @param string $html  Value for the content column.
 * @return bool true when the row was inserted, false otherwise.
 */
function save_html($title, $html)
{
    // Report failures through return values (not exceptions) so the function
    // keeps returning false on a failed insert, whatever the PHP default is.
    mysqli_report(MYSQLI_REPORT_OFF);

    $link = mysqli_connect('localhost', 'root', '', '', 3306);
    if (! $link) {
        die(mysqli_connect_error());
    }
    mysqli_select_db($link, 'test');

    $result = false;
    $stmt = mysqli_prepare($link, 'insert into save_info(title, content) values (?, ?)');
    if ($stmt) {
        mysqli_stmt_bind_param($stmt, 'ss', $title, $html);
        $result = mysqli_stmt_execute($stmt);
        mysqli_stmt_close($stmt);
    }
    mysqli_close($link);

    return $result;
}
// Listing page whose links are harvested.
$url = "http://www.zggjlww.cn/share/?mods=news&action=list&id=15";
$arr = get_links($url);
// Keep only the links accepted by filter_string(); keys are hrefs, values are titles.
$newMatches = array_filter($arr, "filter_string");

// Pieces needed to turn an href into a full URL.
$origin = "http://" . get_host($url);
$listPath = get_path($url);
$queryAt = strpos($listPath, '?');
if ($queryAt !== false) {
    // Drop the query string of the listing page.
    $listPath = substr($listPath, 0, $queryAt);
}
if ($listPath === false || $listPath === '') {
    $listPath = '/';
}
$slashAt = strrpos($listPath, '/');
$listDir = ($slashAt === false) ? '/' : substr($listPath, 0, $slashAt + 1);

// Download every selected page and store it together with its title.
// foreach replaces the each() loop, which no longer exists in PHP 8.
$total = count($newMatches);
$count = 1;
foreach ($newMatches as $key => $val) {
    $key = (string) $key;
    if (preg_match('/^https?:\/\//i', $key)) {
        // Already an absolute URL: use it unchanged.
    } elseif (substr($key, 0, 1) === '/') {
        // Root-relative href.
        $key = $origin . $key;
    } elseif (substr($key, 0, 1) === '?') {
        // Query-only href: same path as the listing page.
        $key = $origin . $listPath . $key;
    } else {
        // Relative href: resolve against the listing page's directory.
        $key = $origin . $listDir . $key;
    }

    // Original author's note: the fetched pages were too large, so only a
    // slice is kept in order to store it in MySQL.
    $html = substr(htmlentities(get_all_html($key)), 2000, 10000);
    $saved = save_html($val, $html);

    $rest = $total - $count;
    $count++;
    // Report the real outcome instead of always claiming success.
    $status = $saved ? '保存成功' : '保存失败';
    echo "$val = > $key <br />$status... 还有 $rest 张页面要处理...<p></p>";
}
?>
MySQL 数据库建表脚本
-- Table that receives the rows written by the script's INSERT
-- (columns title and content); id is generated automatically.
CREATE TABLE save_info
(
    id      INT NOT NULL AUTO_INCREMENT,
    title   TEXT,
    content LONGTEXT,
    PRIMARY KEY (id)
);