一共涉及以下几个文件:
data_article_gather.php —— 显示可供采集的来源网页列表(选择要获取哪个网页的文章列表)

<?php
/**
 * data_article_gather.php
 *
 * Admin page that lists the source sites whose article lists can be
 * gathered. Each row links to data_article_gather_num.php?id=N, which
 * fetches and displays the article list of the chosen source.
 */
require_once '../include/adminfunction.php';
date_default_timezone_set('PRC');
// Verify that the administrator is logged in.
// (The original note says this helper lives in sysfunction.php.)
checkadmin();
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>data_article_manage</title>
<link rel="stylesheet" href="css/admin_center.css" type="text/css" />
<script>
var highlightcolor = '#eafcd5';
// The original compared style.backgroundColor against these literals, which
// only worked where the browser echoed the value back unchanged (its note said
// clickcolor had to be a Windows system colour code). The colour applied to a
// row is now remembered in a data attribute, so any CSS colour works.
var clickcolor = '#51b2f6';
// Row most recently entered by the mouse; changeback() clears this row.
var source = null;

// Climb from any node to its enclosing <td>, then return that cell's row.
function findRow(node) {
    while (node && node.tagName != "TD") {
        node = node.parentElement;
    }
    return node ? node.parentElement : null;
}

// Colour applied to the row by paintRow(), or "" when none.
function rowColor(row) {
    return row.getAttribute("data-rowcolor") || "";
}

// Set the background of every cell of the row and remember the colour.
function paintRow(row, color) {
    var cells = row.children;
    for (var i = 0; i < cells.length; i++) {
        cells[i].style.backgroundColor = color;
    }
    row.setAttribute("data-rowcolor", color);
}

// mouseover handler: highlight the row under the pointer.
function changeto(e) {
    e = e || window.event;
    var target = e.target || e.srcElement;
    if (!target || target.tagName == "TR" || target.tagName == "TABLE") return;
    var row = findRow(target);
    if (!row) return;
    source = row;
    // Rows with id "nc" are never coloured; clicked rows keep the click colour.
    if (row.id != "nc" && rowColor(row) != highlightcolor && rowColor(row) != clickcolor) {
        paintRow(row, highlightcolor);
    }
}

// mouseout handler: remove the hover colour once the pointer leaves the row.
function changeback(e) {
    e = e || window.event;
    var from = e.target || e.srcElement;      // element being left
    var to = e.relatedTarget || e.toElement;  // element being entered (may be null)
    if (!source || !from) return;
    if (from.contains(to) || source.contains(to) || source.id == "nc") return;
    if (to != source && rowColor(source) != clickcolor) {
        paintRow(source, "");
    }
}

// click handler (not wired to any element on this page): toggle the click colour.
function clickto(e) {
    e = e || window.event;
    var target = e.target || e.srcElement;
    if (!target || target.tagName == "TR" || target.tagName == "TABLE") return;
    var row = findRow(target);
    if (!row) return;
    source = row;
    if (rowColor(row) != clickcolor && row.id != "nc") {
        paintRow(row, clickcolor);
    } else {
        paintRow(row, "");
    }
}
</script>
</head>
<body>
<table width="100%" border="0" align="center" cellpadding="0" cellspacing="0">
  <tr>
    <td height="30">
      <table width="100%" border="0" cellspacing="0" cellpadding="0">
        <tr>
          <td width="15" height="30"><img src="images/main_01.gif" width="15" height="30" /></td>
          <td width="1101" background="images/main_02.gif"><img src="images/center_ico01.gif" width="16" height="16" /> <span class="STYLE1">文章管理</span></td>
          <td width="281" background="images/main_02.gif">
            <table border="0" align="right" cellpadding="0" cellspacing="0"> </table>
          </td>
          <td width="14"><img src="images/main_03.gif" width="14" height="30" /></td>
        </tr>
      </table>
    </td>
  </tr>
  <!-- A stray "</table> </td> </tr>" used to close the outer layout table here. -->
  <tr>
    <td>
      <table width="100%" border="0" cellspacing="0" cellpadding="0">
        <tr>
          <td width="9" background="images/main_04.gif"> </td>
          <td bgcolor="#f3ffe3">
            <table width="99%" border="0" align="center" cellpadding="0" cellspacing="1" bgcolor="#c0de98" onmouseover="changeto(event)" onmouseout="changeback(event)">
              <tr>
                <td width="50" height="26" background="images/main_05.gif" class="STYLE2">编号</td>
                <td width="450" height="26" background="images/main_05.gif" class="STYLE2">取文章链接的地址</td>
                <td width="200" background="images/main_05.gif" class="STYLE2">网站说明</td>
                <td height="26" background="images/main_05.gif" class="STYLE2">操作</td>
              </tr>
              <tr>
                <td width="50" height="30" bgcolor="#FFFFFF" class="STYLE2"> 1 </td>
                <td width="450" height="30" bgcolor="#FFFFFF" class="STYLE2"> <a href="http://news.ef360.com/lady/">http://news.ef360.com/lady/</a> </td>
                <td width="200" height="30" bgcolor="#FFFFFF" class="STYLE2"> 华衣网 女装资讯 </td>
                <td height="30" bgcolor="#FFFFFF" class="STYLE5"> <a href="data_article_gather_num.php?id=1" target="centerFrame">[开始获取内容]</a> </td>
              </tr>
              <tr>
                <td width="50" height="30" bgcolor="#FFFFFF" class="STYLE2"> 2 </td>
                <td width="450" height="30" bgcolor="#FFFFFF" class="STYLE2"> <a href="http://www.chaoliu1.net/fushi/nvshi/">http://www.chaoliu1.net/fushi/nvshi/</a> </td>
                <td width="200" height="30" bgcolor="#FFFFFF" class="STYLE2"> 第一潮流网 潮流服饰 女式服装 </td>
                <td height="30" bgcolor="#FFFFFF" class="STYLE5"> <a href="data_article_gather_num.php?id=2" target="centerFrame">[开始获取内容]</a> </td>
              </tr>
              <tr>
                <td width="50" height="30" bgcolor="#FFFFFF" class="STYLE2"> 3 </td>
                <td width="450" height="30" bgcolor="#FFFFFF" class="STYLE2"> <a href="http://www.nz86.com/popular/">http://www.nz86.com/popular/</a> </td>
                <td width="200" height="30" bgcolor="#FFFFFF" class="STYLE2"> 中国女装网 时尚快递 潮流搭配 </td>
                <td height="30" bgcolor="#FFFFFF" class="STYLE5"> <a href="data_article_gather_num.php?id=3" target="centerFrame">[开始获取内容]</a> </td>
              </tr>
            </table>
          </td>
          <td width="9" background="images/main_06.gif"></td>
        </tr>
      </table>
    </td>
  </tr>
  <tr>
    <td height="29">
      <table width="100%" border="0" cellspacing="0" cellpadding="0">
        <tr>
          <td width="15" height="29"><img src="images/main_07.gif" width="15" height="29" /></td>
          <td width="100%" background="images/main_08.gif" style="padding-left:150px;"> </td>
        </tr>
      </table>
    </td>
  </tr>
</table>
</body>
</html>
data_article_gather_num.php —— 获取所选网页链接上的全部文章列表并显示出来

<?php
/**
 * data_article_gather_num.php
 *
 * Fetches the article list page of the source selected by ?id=N, extracts
 * article URLs and titles with canshujiequ(), and renders them as a
 * checkbox list that is posted to data_article_gather_born.php.
 */
require_once '../include/adminfunction.php';
date_default_timezone_set('PRC');
// Verify that the administrator is logged in.
// (The original note says this helper lives in sysfunction.php.)
checkadmin();
set_time_limit(0);

/**
 * Extract every occurrence of a pattern from a text and format each one
 * through a template.
 *
 * The pattern is split on the literal '[参数]'. Each non-empty segment must
 * occur in the text in order; the text between the end of one segment's
 * match and the start of the next is the capture, and replaces '[参数k]'
 * (k = 1, 2, ...) in a copy of the template. Inside a segment, '(*)' splits
 * it into pieces that must occur in order with anything in between.
 * Scanning repeats from where the previous match ended until a segment can
 * no longer be found.
 *
 * @param string $yuanma    Text to scan; '' (or a failed fetch) yields array().
 * @param string $canshustr Pattern; needs literal text before the first '[参数]'.
 * @param string $mubiao    Template containing '[参数1]'.
 * @return array List of formatted matches, in order of appearance.
 */
function canshujiequ($yuanma, $canshustr, $mubiao)
{
    if ($yuanma == '') {
        return array();
    }
    // The original tested strpos(...) == false, which also treats offset 0 as
    // "not found". For the pattern that rejection is kept on purpose: with
    // nothing before '[参数]' there is no start boundary for the capture.
    // For the template, '[参数1]' at offset 0 is valid and is now accepted.
    $canshupos = strpos($canshustr, '[参数]');
    if ($canshupos === false || $canshupos === 0 || strpos($mubiao, '[参数1]') === false) {
        echo '参数或组合字符串格式不对';
        return array();
    }

    $canshuarr = explode('[参数]', $canshustr);
    $len1 = count($canshuarr);
    $yuanmalen = strlen($yuanma);
    $pipeiarr = array();
    $chaxunwz = 0; // current scan offset into $yuanma
    $qianjs = 0;   // end offset of the previous segment's match
    $nowks = 0;    // start offset of the current segment's match
    $nowjs = 0;    // end offset of the current segment's match
    $end = 0;

    while (($end == 0) && ($chaxunwz < $yuanmalen)) {
        $mubiaofuben = $mubiao;
        $feikong = 0; // number of non-empty segments seen in this pass
        for ($i = 0; ($end == 0) && ($i < $len1); $i++) {
            if ($canshuarr[$i] == '') {
                continue;
            }
            $feikong++;
            $tpfarr = explode('(*)', $canshuarr[$i]);
            $len2 = count($tpfarr);
            $feikongnum = 0; // number of non-empty pieces seen in this segment
            for ($j = 0; ($j < $len2) && ($end == 0); $j++) {
                if ($tpfarr[$j] == '') {
                    continue;
                }
                $feikongnum++;
                if ($chaxunwz >= $yuanmalen) {
                    $end = 1;
                    break;
                }
                $pipeiwz = strpos($yuanma, $tpfarr[$j], $chaxunwz);
                if ($pipeiwz === false) {
                    $end = 1;
                    break;
                }
                $chaxunwz = $pipeiwz + strlen($tpfarr[$j]);
                if ($feikongnum == 1) {
                    $nowks = $pipeiwz;
                }
                $nowjs = $chaxunwz;
            }
            if ($end != 0) {
                break;
            }
            if ($feikong > 1) {
                // Capture = text between the previous segment and this one.
                $str = substr($yuanma, $qianjs, $nowks - $qianjs);
                $mubiaofuben = str_replace('[参数' . ($feikong - 1) . ']', $str, $mubiaofuben);
            }
            $qianjs = $nowjs;
        }
        if ($end == 0) {
            $pipeiarr[] = $mubiaofuben;
        }
    }
    return $pipeiarr;
}

$list_href = array();
$list_name = array();
// Normalise the id once; it is echoed into the form action below, so it
// must not be raw request data.
$id = (isset($_GET["id"]) && is_scalar($_GET["id"])) ? (int) $_GET["id"] : 0;

if ($id === 1) {
    $source = file_get_contents("http://news.ef360.com/lady/"); // data source URL
    $a = '<ul class="ul_text_1 f14 arr1" style="padding:15px 0;">[参数]</ul>';
    $b = " [参数1] ";
    $jieguo1 = canshujiequ($source, $a, $b);
    foreach ($jieguo1 as $kuai) {
        // Each <ul> block is converted to UTF-8, then scanned for links and titles.
        $kuai = iconv("GB2312", "UTF-8//IGNORE", $kuai);
        $a = 'http://news.ef360.com/Articles/[参数].html';
        $b = "http://news.ef360.com/Articles/[参数1].html";
        foreach (canshujiequ($kuai, $a, $b) as $lianjie) {
            $list_href[] = $lianjie;
        }
        $c = '_blank">[参数]</a></li>';
        $d = " [参数1] ";
        foreach (canshujiequ($kuai, $c, $d) as $mingcheng) {
            $list_name[] = $mingcheng;
        }
    }
} elseif ($id === 2) {
    $source = file_get_contents("http://www.chaoliu1.net/fushi/nvshi/"); // data source URL
    $source = iconv("GB2312", "UTF-8//IGNORE", $source);
    $a = 'href="http://www.chaoliu1.net/fushi/nvshi/[参数].html';
    $b = "http://www.chaoliu1.net/fushi/nvshi/[参数1].html";
    $list_href = canshujiequ($source, $a, $b);
    $a = '" class="title">[参数]</A>';
    $b = " [参数1] ";
    $list_name = canshujiequ($source, $a, $b);
} elseif ($id === 3) {
    // NOTE(review): this rule fetches eladies.sina.com.cn while the link
    // pattern targets www.nz86.com with one hard-coded title, so it looks
    // half-migrated and probably matches nothing - confirm the intended
    // source. The text is also not converted to UTF-8 here; the page
    // encoding is not visible from this file.
    $source = file_get_contents("http://eladies.sina.com.cn/fa/zhuangban/"); // data source URL
    $a = '<span class="l"> <a href="http://www.nz86.com/article/[参数]/" target="_blank" title="小外套度暖春 轻松显瘦">小外套度暖春 轻松显瘦</a> </span>';
    $b = "http://eladies.sina.com.cn/fa/2013/[参数1].shtml";
    $c = '.shtml" target="_blank" title="[参数]"><img src="';
    $d = " [参数1] ";
    // The original stored these in $jieguo1/$jieguo2, which were never
    // rendered, so this source always showed an empty list.
    $list_href = canshujiequ($source, $a, $b);
    $list_name = canshujiequ($source, $c, $d);
} else {
    echo "未发现有此链接";
}
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>data_article_manage</title>
<link rel="stylesheet" href="css/admin_center.css" type="text/css" />
<script>
var highlightcolor = '#eafcd5';
// The original compared style.backgroundColor against these literals, which
// only worked where the browser echoed the value back unchanged (its note said
// clickcolor had to be a Windows system colour code). The colour applied to a
// row is now remembered in a data attribute, so any CSS colour works.
var clickcolor = '#51b2f6';
// Row most recently entered by the mouse; changeback() clears this row.
var source = null;

// Climb from any node to its enclosing <td>, then return that cell's row.
function findRow(node) {
    while (node && node.tagName != "TD") {
        node = node.parentElement;
    }
    return node ? node.parentElement : null;
}

// Colour applied to the row by paintRow(), or "" when none.
function rowColor(row) {
    return row.getAttribute("data-rowcolor") || "";
}

// Set the background of every cell of the row and remember the colour.
function paintRow(row, color) {
    var cells = row.children;
    for (var i = 0; i < cells.length; i++) {
        cells[i].style.backgroundColor = color;
    }
    row.setAttribute("data-rowcolor", color);
}

// mouseover handler: highlight the row under the pointer.
function changeto(e) {
    e = e || window.event;
    var target = e.target || e.srcElement;
    if (!target || target.tagName == "TR" || target.tagName == "TABLE") return;
    var row = findRow(target);
    if (!row) return;
    source = row;
    // Rows with id "nc" are never coloured; clicked rows keep the click colour.
    if (row.id != "nc" && rowColor(row) != highlightcolor && rowColor(row) != clickcolor) {
        paintRow(row, highlightcolor);
    }
}

// mouseout handler: remove the hover colour once the pointer leaves the row.
function changeback(e) {
    e = e || window.event;
    var from = e.target || e.srcElement;      // element being left
    var to = e.relatedTarget || e.toElement;  // element being entered (may be null)
    if (!source || !from) return;
    if (from.contains(to) || source.contains(to) || source.id == "nc") return;
    if (to != source && rowColor(source) != clickcolor) {
        paintRow(source, "");
    }
}

// click handler (not wired to any element on this page): toggle the click colour.
function clickto(e) {
    e = e || window.event;
    var target = e.target || e.srcElement;
    if (!target || target.tagName == "TR" || target.tagName == "TABLE") return;
    var row = findRow(target);
    if (!row) return;
    source = row;
    if (rowColor(row) != clickcolor && row.id != "nc") {
        paintRow(row, clickcolor);
    } else {
        paintRow(row, "");
    }
}
</script>
<script>
// Mirror the state of the header checkbox onto every article checkbox.
function SelectAll() {
    var checkboxs = document.getElementsByName("checkboxid[]");
    var checkboxs_l = document.getElementsByName("checkboxid_a");
    var a = checkboxs_l[0];
    for (var i = 0; i < checkboxs.length; i++) {
        checkboxs[i].checked = a.checked;
    }
}
</script>
</head>
<body>
<table width="100%" border="0" align="center" cellpadding="0" cellspacing="0">
  <tr>
    <td height="30">
      <table width="100%" border="0" cellspacing="0" cellpadding="0">
        <tr>
          <td width="15" height="30"><img src="images/main_01.gif" width="15" height="30" /></td>
          <td width="1101" background="images/main_02.gif"><img src="images/center_ico01.gif" width="16" height="16" /> <span class="STYLE1">文章管理</span></td>
          <td width="281" background="images/main_02.gif">
            <table border="0" align="right" cellpadding="0" cellspacing="0"> </table>
          </td>
          <td width="14"><img src="images/main_03.gif" width="14" height="30" /></td>
        </tr>
      </table>
    </td>
  </tr>
  <!-- A stray "</table> </td> </tr>" used to close the outer layout table here. -->
  <tr>
    <td>
      <table width="100%" border="0" cellspacing="0" cellpadding="0">
        <tr>
          <td width="9" background="images/main_04.gif"> </td>
          <td bgcolor="#f3ffe3">
            <!-- The form now wraps the table; a <form> directly between table rows is invalid markup. -->
            <form action="data_article_gather_born.php?id=<?php echo $id; ?>" name="form1" method="post" style="margin:0;">
            <table width="99%" border="0" align="center" cellpadding="0" cellspacing="1" bgcolor="#c0de98" onmouseover="changeto(event)" onmouseout="changeback(event)">
              <tr>
                <td width="50" height="26" background="images/main_05.gif" class="STYLE2"> <input name="checkboxid_a" type="checkbox" value="" onclick="SelectAll()" /> </td>
                <td width="450" height="26" background="images/main_05.gif" class="STYLE2">取文章链接的地址</td>
                <td width="463" background="images/main_05.gif" class="STYLE2">网站说明</td>
                <td width="159" height="26" background="images/main_05.gif" class="STYLE2">操作</td>
              </tr>
<?php
for ($n = 0; $n < count($list_href); $n++) {
    // Scraped text is untrusted: escape it for HTML. double_encode is off so
    // entities already present in the scraped markup are not encoded twice.
    $lianjie = htmlspecialchars($list_href[$n], ENT_QUOTES, 'UTF-8', false);
    // The two lists are extracted independently and may differ in length.
    $mingcheng = isset($list_name[$n]) ? htmlspecialchars($list_name[$n], ENT_QUOTES, 'UTF-8', false) : '';
?>
              <tr>
                <td width="50" height="30" bgcolor="#FFFFFF" class="STYLE2"> <input type="checkbox" name="checkboxid[]" id="<?php echo $n; ?>" value="<?php echo $lianjie; ?>" /> </td>
                <td width="450" height="30" bgcolor="#FFFFFF" class="STYLE2"> <a href="<?php echo $lianjie; ?>"><?php echo $lianjie; ?></a> </td>
                <td width="463" height="30" bgcolor="#FFFFFF" class="STYLE2"> <?php echo $mingcheng; ?> </td>
                <td height="30" bgcolor="#FFFFFF" class="STYLE5"> <a href="#">[生成网站数据]</a> </td>
              </tr>
<?php } ?>
            </table>
            </form>
          </td>
          <td width="9" background="images/main_06.gif"></td>
        </tr>
      </table>
    </td>
  </tr>
  <tr>
    <td height="29">
      <table width="100%" border="0" cellspacing="0" cellpadding="0">
        <tr>
          <td width="15" height="29"><img src="images/main_07.gif" width="15" height="29" /></td>
          <td width="100%" background="images/main_08.gif" style="padding-left:150px;"> <input name="" type="button" value="开始生成" onclick="document.forms['form1'].submit()"/> </td>
        </tr>
      </table>
    </td>
  </tr>
</table>
</body>
</html>
data_article_gather_born.php —— 抓取所选文章的内容并写入数据库

<?php
/**
 * data_article_gather_born.php
 *
 * Receives the article URLs ticked on data_article_gather_num.php
 * (POST checkboxid[]), downloads each article, extracts title, source line
 * and body with canshujiequ(), and inserts a row into `women`.`article`.
 * The database connection is assumed to be opened by the included
 * adminfunction.php - it is not visible from this file.
 */
require_once '../include/adminfunction.php';
date_default_timezone_set('PRC');
// Verify that the administrator is logged in.
// (The original note says this helper lives in sysfunction.php.)
checkadmin();

/**
 * Extract every occurrence of a pattern from a text and format each one
 * through a template.
 *
 * The pattern is split on the literal '[参数]'. Each non-empty segment must
 * occur in the text in order; the text between the end of one segment's
 * match and the start of the next is the capture, and replaces '[参数k]'
 * (k = 1, 2, ...) in a copy of the template. Inside a segment, '(*)' splits
 * it into pieces that must occur in order with anything in between.
 * Scanning repeats from where the previous match ended until a segment can
 * no longer be found.
 *
 * @param string $yuanma    Text to scan; '' (or a failed fetch) yields array().
 * @param string $canshustr Pattern; needs literal text before the first '[参数]'.
 * @param string $mubiao    Template containing '[参数1]'.
 * @return array List of formatted matches, in order of appearance.
 */
function canshujiequ($yuanma, $canshustr, $mubiao)
{
    if ($yuanma == '') {
        return array();
    }
    // The original tested strpos(...) == false, which also treats offset 0 as
    // "not found". For the pattern that rejection is kept on purpose: with
    // nothing before '[参数]' there is no start boundary for the capture.
    // For the template, '[参数1]' at offset 0 is valid and is now accepted.
    $canshupos = strpos($canshustr, '[参数]');
    if ($canshupos === false || $canshupos === 0 || strpos($mubiao, '[参数1]') === false) {
        echo '参数或组合字符串格式不对';
        return array();
    }

    $canshuarr = explode('[参数]', $canshustr);
    $len1 = count($canshuarr);
    $yuanmalen = strlen($yuanma);
    $pipeiarr = array();
    $chaxunwz = 0; // current scan offset into $yuanma
    $qianjs = 0;   // end offset of the previous segment's match
    $nowks = 0;    // start offset of the current segment's match
    $nowjs = 0;    // end offset of the current segment's match
    $end = 0;

    while (($end == 0) && ($chaxunwz < $yuanmalen)) {
        $mubiaofuben = $mubiao;
        $feikong = 0; // number of non-empty segments seen in this pass
        for ($i = 0; ($end == 0) && ($i < $len1); $i++) {
            if ($canshuarr[$i] == '') {
                continue;
            }
            $feikong++;
            $tpfarr = explode('(*)', $canshuarr[$i]);
            $len2 = count($tpfarr);
            $feikongnum = 0; // number of non-empty pieces seen in this segment
            for ($j = 0; ($j < $len2) && ($end == 0); $j++) {
                if ($tpfarr[$j] == '') {
                    continue;
                }
                $feikongnum++;
                if ($chaxunwz >= $yuanmalen) {
                    $end = 1;
                    break;
                }
                $pipeiwz = strpos($yuanma, $tpfarr[$j], $chaxunwz);
                if ($pipeiwz === false) {
                    $end = 1;
                    break;
                }
                $chaxunwz = $pipeiwz + strlen($tpfarr[$j]);
                if ($feikongnum == 1) {
                    $nowks = $pipeiwz;
                }
                $nowjs = $chaxunwz;
            }
            if ($end != 0) {
                break;
            }
            if ($feikong > 1) {
                // Capture = text between the previous segment and this one.
                $str = substr($yuanma, $qianjs, $nowks - $qianjs);
                $mubiaofuben = str_replace('[参数' . ($feikong - 1) . ']', $str, $mubiaofuben);
            }
            $qianjs = $nowjs;
        }
        if ($end == 0) {
            $pipeiarr[] = $mubiaofuben;
        }
    }
    return $pipeiarr;
}

/**
 * Download an image and save it to a local file.
 *
 * @param string $url      Image URL; must end in .gif, .jpg or .bmp (any case).
 * @param string $filename Target path; defaults to "<unix timestamp><ext>".
 * @return string|false The file name written, or false when the URL is empty,
 *                      the extension is unsupported, or the download or the
 *                      write fails.
 */
function GrabImage($url, $filename = "")
{
    // An empty URL returns false.
    if ($url == "") {
        return false;
    }
    $ext = strrchr($url, "."); // extension of the image, including the dot
    $xiaoxie = is_string($ext) ? strtolower($ext) : '';
    if ($xiaoxie != ".gif" && $xiaoxie != ".jpg" && $xiaoxie != ".bmp") {
        echo "格式不支持!";
        return false;
    }
    if ($filename == "") {
        $filename = time() . "$ext"; // name the file after the current timestamp
    }

    $img = file_get_contents($url);
    if ($img === false) {
        return false;
    }
    // "wb" rather than the original "a": appending to an existing file (for
    // example two downloads in the same second) produced a corrupt image.
    $fp2 = fopen($filename, "wb");
    if ($fp2 === false) {
        return false;
    }
    $xieru = fwrite($fp2, $img);
    fclose($fp2);
    return ($xieru === false) ? false : $filename;
}

// Per-source extraction rules.
//   prefix  - only URLs starting with this are fetched; the list is posted
//             by the browser and must not be able to reach arbitrary files
//             or hosts.
//   charset - encoding of the remote page, converted to UTF-8.
//   replace - literal search => replacement applied after conversion.
//   title / source / content - canshujiequ() patterns.
// NOTE(review): whitespace inside the content patterns may originally have
// been line breaks or tabs - verify against the live pages.
$caijiguize = array(
    1 => array(
        'prefix'  => 'http://news.ef360.com/Articles/',
        'charset' => 'GBK',
        'replace' => array("/EditManager/File/News/" => "http://news.ef360.com/EditManager/File/News/"),
        'title'   => '<h1 class="news_title">[参数]</h1>',
        'source'  => '<span id="btn_message" class="btn_message"></span>[参数]</div>',
        'content' => '<div class="content">[参数]</div> <div class="tagbar"> ',
    ),
    2 => array(
        'prefix'  => 'http://www.chaoliu1.net/fushi/nvshi/',
        'charset' => 'GB2312',
        'replace' => array(),
        'title'   => '<H2><STRONG>[参数]</STRONG></H2>',
        'source'  => '<P class=title-bt>[参数] <SPAN>-</SPAN>点击',
        'content' => '<script src="http://cpro.baidustatic.com/cpro/ui/c.js" type="text/javascript"></script> [参数]<DIV class=pg>',
    ),
);

$id = (isset($_GET["id"]) && is_scalar($_GET["id"])) ? (int) $_GET["id"] : 0;
$url_list = (isset($_POST["checkboxid"]) && is_array($_POST["checkboxid"])) ? $_POST["checkboxid"] : array();

// HTML fragments printed inside <body>. The original echoed raw scraped
// markup and alerts before the DOCTYPE.
$shuchu = array();

if (isset($caijiguize[$id])) {
    $guize = $caijiguize[$id];
    foreach ($url_list as $url) {
        if (!is_string($url) || strpos($url, $guize['prefix']) !== 0) {
            $shuchu[] = '<p>' . htmlspecialchars(is_string($url) ? $url : '', ENT_QUOTES, 'UTF-8') . ' : 链接不属于该采集源,已跳过</p>';
            continue;
        }
        $biaoshi = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');

        $contents = file_get_contents($url);
        if ($contents === false) {
            $shuchu[] = '<p>' . $biaoshi . ' : 无法获取文章页面</p>';
            continue;
        }
        $contents = iconv($guize['charset'], "UTF-8//IGNORE", $contents);
        foreach ($guize['replace'] as $chazhao => $tihuan) {
            $contents = str_replace($chazhao, $tihuan, $contents);
        }

        // Columns: Article_id, Article_name, Last_time, Article_source,
        // Article_desc, Article_content, image, Image_url, Category_id.
        $d = " [参数1] ";
        $title = canshujiequ($contents, $guize['title'], $d);
        $source = canshujiequ($contents, $guize['source'], $d);
        $content = canshujiequ($contents, $guize['content'], $d);

        // The original read $title[0] / $content[0] unconditionally and
        // inserted empty rows when the page layout did not match.
        if (!isset($title[0]) || !isset($content[0])) {
            $shuchu[] = '<p>' . $biaoshi . ' : 未能提取标题或正文,已跳过</p>';
            continue;
        }
        $laiyuan = isset($source[0]) ? $source[0] : '';
        $shijian = time();

        // Scraped text is escaped for SQL. The original concatenated it raw
        // (source 1) or turned every ' into " (source 2), which allowed SQL
        // injection and broke on quotes or backslashes in the article.
        $sql = "INSERT INTO `women`.`article` (`Article_id` ,`Article_name` ,`Last_time` ,`Article_source` ,`Article_desc` ,`Article_content` ,`image` ,`Image_url` ,`Category_id`) VALUES (NULL , '"
            . mysql_real_escape_string($title[0]) . "', '"
            . $shijian . "', '"
            . mysql_real_escape_string($laiyuan) . "', '0', '"
            . mysql_real_escape_string($content[0]) . "', '', '', '3')";

        if (mysql_query($sql)) {
            $shuchu[] = '<p>' . $biaoshi . ' : ok</p>'
                . "<script>alert('数据生成成功,您可以在文章资讯列表中查看');</script>";
        } else {
            $shuchu[] = '<p>' . $biaoshi . '</p>'
                . "<script>alert('数据生成失败,可能是此篇文章已经存在');</script>";
        }
    }
}
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>无标题文档</title>
</head>
<body>
<?php echo implode("\n", $shuchu); ?>
</body>
</html>
文章表数据结构:
