php简单的抓取

zoukankan html css js c++ java

php简单的抓取

用 PHP 抓取指定页面中的所有链接，筛选出想要的链接并下载对应页面，最后保存到数据库。
代码
<?PHP
 // GET all links from URL

 /**
  * array_walk() callback: replaces an element with its tag-free, trimmed text.
  *
  * @param string $item Element value, modified in place.
  * @param mixed  $key  Element key; required by the array_walk() signature, not used.
  */
 function remove_html_tags(&$item, $key)
 {
     $withoutTags = strip_tags($item);
     $item = trim($withoutTags);
 }
 /**
  * Return the part of a URL that follows the scheme and host
  * (path, query string and fragment).
  *
  * The scheme is optional; both "http://" and "https://" are recognised.
  *
  * @param string $url URL, with or without a scheme.
  * @return string Text after the host; an empty string when nothing follows the
  *                host; the input unchanged when no host can be matched
  *                (for example when it starts with "/").
  */
 function get_path($url)
 {
     // "https?" matters: with a plain "http" the optional group is skipped for
     // https URLs, "https:" is taken as the host and the path starts with "//".
     if (!preg_match("/^(https?:\/\/)?([^\/]+)/i", $url, $matches)) {
         return $url;
     }
     // Cast guards against old PHP versions where substr() may return false.
     return (string) substr($url, strlen($matches[0]));
 }
 /**
  * Return the host part of a URL (everything between the optional scheme
  * and the first "/").
  *
  * The scheme is optional; both "http://" and "https://" are recognised.
  *
  * @param string $url URL, with or without a scheme.
  * @return string The host as written in the URL, or an empty string when
  *                no host can be matched.
  */
 function get_host($url)
 {
     // "https?" matters: with a plain "http" an https URL yields the host "https:".
     if (!preg_match("/^(https?:\/\/)?([^\/]+)/i", $url, $matches)) {
         return '';
     }
     return $matches[2];
 }
 /**
  * Send a GET request for $url over a plain socket on port 80 and return
  * everything read back from the connection.
  *
  * The result is the unprocessed text read from the socket, not just a
  * document body. On connection failure the error is echoed and an empty
  * string is returned.
  *
  * NOTE(review): the request is sent as HTTP/1.1 and the response is not
  * decoded in any way; a server that answers with chunked transfer-encoding
  * would leave chunk markers in the returned text - confirm whether that matters.
  *
  * @param string $url URL to fetch; host and path are taken from it with
  *                    get_host() and get_path().
  * @return string Text read from the socket, or '' when nothing could be read.
  */
 function get_all_html($url)
 {
     $htmlString = '';
     $host = get_host($url);
     $path = get_path($url);
     if ($path === '') {
         // A URL without a path would otherwise produce the invalid
         // request line "GET  HTTP/1.1".
         $path = '/';
     }

     $fp = fsockopen($host, 80, $errno, $errstr, 30);
     if (! $fp) {
         echo "$errstr ($errno) \n";
         return $htmlString;
     }

     $out = "GET $path HTTP/1.1\r\n";
     $out .= "Host: $host\r\n";
     $out .= "Connection: Close\r\n\r\n";

     if (fwrite($fp, $out) !== false) {
         while (! feof($fp)) {
             $line = fgets($fp);
             if ($line === false) {
                 // Read error or timeout: feof() can stay false in that case,
                 // so stop instead of looping without making progress.
                 break;
             }
             $htmlString .= $line;
         }
     }
     fclose($fp);

     return $htmlString;
 }
 /**
  * Fetch $url and collect the links of its <a> elements.
  *
  * @param string $url Page to download (via get_all_html()).
  * @return array Map of href value => link text with tags stripped and
  *               whitespace trimmed. When the same href occurs more than once
  *               the last occurrence wins. Empty array when nothing matches.
  */
 function get_links($url)
 {
     // Anchored on "<a" so the match cannot start inside another tag whose
     // name merely ends in "a" (e.g. <area href="...">).
     $pattern =
         "/<a[\s]+[^>]*?href[\s]?=[\s\"\']+(.*?)[\"\']+.*?>"
         . "([^<]+|.*?)?<\/a>/i";

     $html = get_all_html($url);
     // preg_match_all() yields 0 for "no match" and false on error; in both
     // cases there is nothing to combine.
     if (!preg_match_all($pattern, $html, $found, PREG_PATTERN_ORDER)) {
         return array();
     }

     $hrefs = $found[1];   // capture group 1: href value
     $titles = $found[2];  // capture group 2: markup between <a ...> and </a>
     array_walk($titles, 'remove_html_tags');

     return array_combine($hrefs, $titles);
 }
 /**
  * Keyword filter used to pick the links worth downloading.
  *
  * @param string $var Text to inspect.
  * @return bool True when $var contains the keyword "中日研修", false otherwise.
  */
 function filter_string($var)
 {
     // Only entries containing this keyword are wanted.
     return strpos($var, '中日研修') !== false;
 }
 /**
  * Insert one row (title, content) into the save_info table of the "test"
  * database on localhost:3306.
  *
  * Uses mysqli instead of the mysql_* functions, which were removed in PHP 7,
  * and a prepared statement so that quotes in the title or page content can
  * neither break the INSERT nor inject SQL.
  *
  * @param string $title Value for the title column.
  * @param string $html  Value for the content column.
  * @return bool True when the row was inserted, false when selecting the
  *              database, preparing or executing the statement failed.
  *              The script terminates (die) when the connection itself fails.
  */
 function save_html($title, $html)
 {
     // Report failures through return values rather than exceptions, so the
     // checks below work the same on every PHP version.
     mysqli_report(MYSQLI_REPORT_OFF);

     $link = mysqli_connect("localhost", "root", "", "", 3306);
     if (!$link) {
         die(mysqli_connect_error());
     }

     $result = false;
     if (mysqli_select_db($link, "test")) {
         $stmt = mysqli_prepare($link, "insert into save_info(title, content) values (?, ?)");
         if ($stmt) {
             mysqli_stmt_bind_param($stmt, "ss", $title, $html);
             $result = mysqli_stmt_execute($stmt);
             mysqli_stmt_close($stmt);
         }
     }

     // One connection is opened per call, so release it before returning.
     mysqli_close($link);

     return $result;
 }
 // Driver: collect the links of the list page, keep those whose link text
 // passes filter_string(), download each linked page and store it.
 $url = "http://www.zggjlww.cn/share/?mods=news&action=list&id=15";
 $arr = get_links($url);
 $newMatches = array_filter($arr, "filter_string");

 // $newMatches maps href => link text. foreach replaces the former
 // reset()/each() loop: each() was removed in PHP 8.
 $total = count($newMatches);
 $count = 1;
 foreach ($newMatches as $key => $val)
 {
     // Prefix the list page's host unless the href is already an absolute
     // URL; otherwise the result would be "http://hosthttp://...".
     // NOTE(review): hrefs are assumed to be root-relative ("/...") when not
     // absolute - confirm against the target site.
     if (!preg_match("/^https?:\/\//i", $key)) {
         $key = "http://" . get_host($url) . $key;
     }

     // The fetched pages are large; only a 10000-byte slice starting at
     // offset 2000 of the entity-encoded response is stored so it fits in MySQL.
     $html = substr(htmlentities(get_all_html($key)), 2000, 10000);
     save_html($val, $html);

     $rest = $total - $count;
     $count++;
     echo "$val = > $key 保存成功... 还有 $rest 张页面要处理...";
 }
?>

MySQL 数据库建表脚本
-- Table written to by save_html(): one row per stored page.
CREATE TABLE save_info (
    id      INT      NOT NULL AUTO_INCREMENT,  -- row identifier
    title   TEXT,                              -- title passed to save_html()
    content LONGTEXT,                          -- page content passed to save_html()
    PRIMARY KEY (id)
);

查看全文

相关阅读:
log4j 日志分级处理
 http接口调用，传递json格式带双引号问题
 测试输出方法执行时间
 关于 propertychange 兼容性问题
 表结构的修改
 固定table表头
 tomcat 的log4j配置问题
 ie 导出不行，不兼容问题，或只出现后缀文件无法识别
 Spring3.x错误----java.lang.ClassNotFoundException:org.aspectj.weaver.reflect.ReflectionWorld$ReflectionWorldException
Spring3.x错误----java.lang.ClassNotFoundException：org.aopalliance.inter.MethodInterceptor

原文地址：https://www.cnblogs.com/barrysgy/p/1766328.html