PHP 多线程控制(说明:本文代码在查询百度排名时并不好使,仅供批评)
<?php
/**
 * Batch Baidu keyword-rank lookup (single-file page).
 *
 * Renders a form; on submit it fetches the Baidu result page of every
 * keyword concurrently through curl_multi and reports the position at which
 * the given site first appears among the result-URL spans.
 */

/**
 * Return the current Unix time in seconds with microsecond precision.
 *
 * @return float
 */
function microtime_float(){
    return microtime(true);
}

/**
 * Concurrent HTTP fetcher plus the Baidu-specific helpers used by this page.
 */
class CoreHttpCurl{
    protected $keywords = array();      // keywords being looked up
    protected $findurl = null;          // site whose rank is being looked up
    protected $urls = array();          // keyword => request URL

    protected $http_data = array();     // keyword => array(code, data, header, time)
    protected $multi_exec_num = 10;     // max simultaneous transfers, 0 = unlimited
    static protected $connecttimeout_ms = 3000;   // connect timeout in milliseconds

    function __construct(){
    }

    /**
     * Split the submitted textarea content into a list of keywords.
     *
     * Accepts \r\n, \r and \n line endings, trims every line and drops blank
     * lines so that a trailing newline does not produce an empty keyword.
     *
     * @param string $keyword raw textarea content, one keyword per line
     * @return array list of non-empty keywords
     **/
    public function get_keywords($keyword){
        $this->keywords = array();
        foreach(preg_split('/\r\n|\r|\n/', (string)$keyword) as $line){
            $line = trim($line);
            if($line !== ''){
                $this->keywords[] = $line;
            }
        }
        return $this->keywords;
    }

    /**
     * Build the Baidu search URL for every keyword.
     *
     * @param array $keywords
     * @return array keyword => search URL (accumulated on the instance)
     **/
    public function get_urls($keywords){
        foreach($keywords as $word){
            // The keyword goes into a query string, so it must be URL-encoded
            // (spaces, '&', '#' and non-ASCII text would otherwise break it).
            $this->urls[$word] = "http://www.baidu.com/s?wd=".urlencode($word)."&cl=3&pn=0&rn=50";
        }
        return $this->urls;
    }

    /**
     * Create a configured cURL handle for one URL.
     *
     * @param string $url     request URL
     * @param int    $timeout total transfer timeout in seconds
     * @return resource|CurlHandle
     **/
    protected function create_curl($url,$timeout){
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, true);          // headers are split off again in get_data()
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS, self::$connecttimeout_ms);

        return $ch;
    }

    /**
     * Fetch several URLs concurrently.
     *
     * At most $multi_exec_num transfers run at once; the rest wait in a queue
     * and are started as earlier transfers finish. Handles are created only
     * when a transfer is actually started.
     *
     * @param array $urls    keyword => URL
     * @param int   $timeout per-transfer timeout in seconds
     * @return array keyword => body string, or false when the HTTP status is not 200
     **/
    function request_urls($urls,$timeout){
        $urls = array_unique($urls);
        if(!$urls) return array();

        $result = array();
        foreach($urls as $kw=>$url){
            $result[$kw] = null;
            $this->http_data[$kw] = null;
        }

        $mh = curl_multi_init();
        $listener_list = array();   // keyword => handle currently in flight
        $pending = $urls;           // keyword => URL not started yet
        $running = null;

        do{
            // Top up the in-flight set from the queue. reset()/key()/unset()
            // is used instead of array_shift() because array_shift()
            // renumbers numeric keys, which would corrupt numeric keywords.
            while($pending && ($this->multi_exec_num <= 0 || count($listener_list) < $this->multi_exec_num)){
                reset($pending);
                $kw = key($pending);
                $ch = $this->create_curl($pending[$kw], $timeout);
                unset($pending[$kw]);
                curl_multi_add_handle($mh, $ch);
                $listener_list[$kw] = $ch;
            }

            do{
                $execrun = curl_multi_exec($mh, $running);
            }while($execrun == CURLM_CALL_MULTI_PERFORM);
            if($execrun != CURLM_OK) break;

            $finished = 0;
            while(($done = curl_multi_info_read($mh)) !== false){
                $handle = $done['handle'];
                $done_kw = array_search($handle, $listener_list, true);
                if($done_kw !== false){
                    $this->http_data[$done_kw] = $this->get_data(curl_multi_getcontent($handle), $handle);
                    $result[$done_kw] = ($this->http_data[$done_kw]['code'] != 200)
                        ? false
                        : $this->http_data[$done_kw]['data'];
                    unset($listener_list[$done_kw]);
                }
                // Detach from the multi handle before closing the easy handle.
                curl_multi_remove_handle($mh, $handle);
                curl_close($handle);
                $finished++;
            }

            if(!$pending && !$listener_list) break;

            // Nothing completed this round: block until there is activity
            // instead of spinning. Some libcurl builds return -1 at once.
            if($finished === 0 && curl_multi_select($mh, 1.0) === -1){
                usleep(1000);
            }
        }while(true);

        // Only non-empty when the loop was left because of a multi error.
        foreach($listener_list as $ch){
            curl_multi_remove_handle($mh, $ch);
            curl_close($ch);
        }
        curl_multi_close($mh);
        return $result;
    }

    /**
     * Fetch the given search URLs and compute the rank for each keyword.
     *
     * $findurl now has a default so that the signature no longer places a
     * required parameter after an optional one.
     *
     * @param array  $urls    keyword => URL
     * @param int    $timeout per-transfer timeout in seconds
     * @param string $findurl site to look for, e.g. 39.net (without http://)
     * @return array keyword => rank (0 = not found)
     **/
    public function get($urls, $timeout=10, $findurl=null){
        $data = $this->request_urls($urls, $timeout);
        return $this->baid_rank($data, $findurl);
    }

    /**
     * Split a raw response (headers + body) and collect transfer info.
     *
     * @param string $data raw response as returned by curl_multi_getcontent()
     * @param resource|CurlHandle $ch the finished handle
     * @return array with keys code, data, header, time
     */
    protected function get_data($data,$ch){
        $data = (string)$data;
        $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);

        $result = array();
        $result['code'] = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $result['data'] = substr($data, $header_size);
        $result['header'] = explode("\r\n", substr($data, 0, $header_size));
        $result['time'] = curl_getinfo($ch, CURLINFO_TOTAL_TIME);
        return $result;
    }

    /**
     * Determine the rank of $findurl in each result page.
     *
     * The rank is the 1-based index of the first <span class="g"> element
     * whose text contains $findurl; 0 means not found (or the page could not
     * be fetched). Every keyword always gets an entry.
     *
     * @param array  $serp    keyword => page source (false/null when the fetch failed)
     * @param string $findurl site to look for, e.g. 39.net (without http://)
     * @return array keyword => rank
     **/
    protected function baid_rank(array $serp, $findurl){
        $findurl = (string)$findurl;
        $pattern = "#<span class=\"g\">.*<\/span>#U";
        $ranks = array();
        foreach($serp as $keyword=>$source){
            $ranks[$keyword] = 0;
            if($findurl === '' || !is_string($source) || $source === ''){
                continue;
            }
            if(!preg_match_all($pattern, $source, $m)){
                continue;
            }
            foreach($m[0] as $k=>$v){
                if(strpos($v, $findurl) !== false){
                    $ranks[$keyword] = $k+1;
                    break;
                }
            }
        }
        return $ranks;
    }
}

// Handle the form submission before any markup is sent so the result can be
// rendered inside <body>.
$result_html = '';
if(isset($_POST['sub'], $_POST['keyword'], $_POST['url'])
    && is_string($_POST['keyword']) && is_string($_POST['url'])){
    $start_time = microtime_float();
    $findurl = trim($_POST['url']);
    $httpcurl = new CoreHttpCurl();
    $keywords = $httpcurl->get_keywords($_POST['keyword']);   // list of keywords
    $urls = $httpcurl->get_urls($keywords);                   // keyword => search URL
    $ranks = $httpcurl->get($urls, 10, $findurl);             // keyword => rank

    $result_html = "<table border='1' bordercolor='green' cellspacing='0'><tr><th>关键词</th><th>排名</th></tr>";
    foreach($ranks as $keyword=>$rank){
        // Keywords are user input and must be escaped before being echoed.
        $safe_keyword = htmlspecialchars((string)$keyword, ENT_QUOTES, 'UTF-8');
        $result_html .= "<tr><td>{$safe_keyword}</td><td>{$rank}</td></tr>";
    }
    $result_html .= "</table>";

    $con_time = microtime_float() - $start_time;
    $result_html .= "查询耗时:".$con_time;
}
?>
<html>
<head>
<meta http-equiv="content-type" content="text/html;charset=utf-8" />
<title>百度关键词排名批量查询</title>
</head>
<body>
<h3>百度关键词排名批量查询</h3>
<form action="/baidu/index.php" method="post">
    输入关键词(每行一个)<br />
    <textarea name="keyword" style="resize:none;width:243px;height:70px;"></textarea>
    <br />输入网址:
    <input type="text" name="url" size="20" value="39.net" />(如:39.net 勿加http://)<br />
    <input type="submit" name="sub" value="查询" />
</form>
<hr />
<?php echo $result_html; ?>
</body>
</html>
上面似乎不好用,下面稍微好一点:
<?php
/**
 * Wget cURL driver core.
 *
 * @author     jonwang(jonwang@myqee.com)
 * @category   MyQEE
 * @package    System
 * @subpackage Core
 * @copyright  Copyright (c) 2008-2012 myqee.com
 * @license    http://www.myqee.com/license.html
 */

set_time_limit(0);

/**
 * Return the current Unix time in seconds with microsecond precision.
 *
 * @return float
 */
function microtime_float(){
    return microtime(true);
}

/**
 * cURL based HTTP client with concurrent (curl_multi) fetching, plus the
 * Baidu keyword-rank helpers used by the page at the bottom of this file.
 */
class Core_HttpClient_Driver_Curl{
    protected $http_data = array();     // key => array(code, data, header, time)
    protected $agent;
    protected $cookies;
    protected $referer;
    protected $ip;
    protected $header = array();
    protected $_option = array();
    protected $_post_data = array();    // URL => POST payload

    protected $keywords = array();      // submitted keywords
    protected $urls = array();          // keyword => Baidu search URL

    /**
     * Maximum number of simultaneous transfers, 0 = unlimited.
     * When scraping Baidu a large value gets the client blocked (spoofing
     * the referer and IP does not seem to help); a small value is slow.
     *
     * @var int
     */
    protected $multi_exec_num = 3;

    /**
     * Default connect timeout in milliseconds.
     *
     * @var int
     */
    protected static $connecttimeout_ms = 3000;

    const ERROR_HOST = '请求的URL错误';
    const ERROR_GET = 'GET请求错误';
    const ERROR_POST = 'POST请求错误';

    function __construct(){
    }

    /**
     * Set the User-Agent sent with requests.
     *
     * @param string $agent
     * @return Core_HttpClient_Driver_Curl
     */
    public function set_agent($agent)
    {
        $this->agent = $agent;
        return $this;
    }

    /**
     * Set the cookies sent with requests.
     *
     * @param string|array $cookies raw Cookie header value, or name => value pairs
     * @return Core_HttpClient_Driver_Curl
     */
    public function set_cookies($cookies)
    {
        $this->cookies = $cookies;
        return $this;
    }

    /**
     * Set the Referer sent with requests.
     *
     * @param string $referer
     * @return Core_HttpClient_Driver_Curl
     */
    public function set_referer($referer)
    {
        $this->referer = $referer;
        return $this;
    }

    /**
     * Set the IP (only stored; the code that uses it in _create() is disabled).
     *
     * @param string $ip
     * @return Core_HttpClient_Driver_Curl
     */
    public function set_ip($ip)
    {
        $this->ip = $ip;
        return $this;
    }

    /**
     * Set a cURL option. CURLOPT_HTTPHEADER values are merged into the
     * header list instead of replacing it.
     *
     * @param int   $key   CURLOPT_* constant
     * @param mixed $value
     * @return Core_HttpClient_Driver_Curl
     */
    public function set_option($key, $value)
    {
        if ( $key===CURLOPT_HTTPHEADER )
        {
            $this->header = array_merge($this->header, (array)$value);
        }
        else
        {
            $this->_option[$key] = $value;
        }
        return $this;
    }

    /**
     * Set the maximum number of simultaneous transfers (0 = unlimited).
     *
     * @param int $num
     * @return Core_HttpClient_Driver_Curl
     */
    public function set_multi_max_num($num=0)
    {
        $this->multi_exec_num = (int)$num;
        return $this;
    }

    /**
     * Submit with POST; supports several URLs at once.
     *
     *     $urls = array
     *     (
     *         'http://www.baidu.com/',
     *         'http://mytest.com/url',
     *         'http://www.abc.com/post',
     *     );
     *     $data = array
     *     (
     *         array('k1'=>'v1','k2'=>'v2'),
     *         array('a'=>1,'b'=>2),
     *         'aa=1&bb=3&cc=3',
     *     );
     *     HttpClient::factory()->post($urls,$data);
     *
     * @param string|array $url     one URL or a list of URLs
     * @param string|array $vars    POST data; for a URL list, one entry per URL with the same key
     * @param int          $timeout timeout in seconds, default 60
     * @return array result of get(): key => body string, or false on failure
     */
    public function post($url, $vars, $timeout = 60)
    {
        # POST mode
        $this->set_option( CURLOPT_HTTPHEADER, array('Expect:') );
        $this->set_option( CURLOPT_POST, true );

        $myvars = array();
        if (is_array($url))
        {
            // A separate loop variable is essential: reusing $url here would
            // leave only the last URL to be passed on to get().
            foreach ($url as $k=>$one_url)
            {
                if (isset($vars[$k]))
                {
                    $myvars[$one_url] = is_array($vars[$k]) ? http_build_query($vars[$k]) : $vars[$k];
                }
            }
        }
        else
        {
            $myvars[$url] = $vars;
        }
        $this->_post_data = $myvars;

        return $this->get($url,$timeout);
    }

    /**
     * Fetch data with GET; supports several URLs at once.
     *
     * Per-request settings (options, headers, ip, cookies, referer, POST
     * data) are cleared afterwards.
     *
     * @param string|array $url     one URL or key => URL
     * @param int          $timeout per-transfer timeout in seconds
     * @return array key => body string, or false when the HTTP status is not 200
     */
    public function get($url, $timeout = 10)
    {
        $urls = is_array($url) ? $url : array($url);
        $data = $this->request_urls($urls, $timeout);
        $this->clear_set();
        return $data;
    }

    /**
     * Create a configured cURL handle.
     *
     * @param string $url     URL; one without a scheme is resolved against $_SERVER['SCRIPT_URI'] when available
     * @param int    $timeout total transfer timeout in seconds
     * @return resource|CurlHandle
     */
    protected function _create($url,$timeout)
    {
        if ( false===strpos($url, '://') )
        {
            $base = '';
            if ( isset($_SERVER['SCRIPT_URI']) && preg_match('#^(http(?:s)?\://[^/]+/)#', $_SERVER['SCRIPT_URI'], $m) )
            {
                $base = $m[1];
            }
            $the_url = $base.ltrim($url,'/');
        }
        else
        {
            $the_url = $url;
        }

        /*
        Disabled: when an IP is set, put it into the URL in place of the host
        name and send the original host name in a Host header.

        if ($this->ip)
        {
            if ( preg_match('#^(http(?:s)?)\://([^/\:]+)(\:[0-9]+)?/#', $the_url.'/',$m) )
            {
                $this->header[] = 'Host: '.$m[2];
                $the_url = $m[1].'://'.$this->ip.$m[3].'/'.substr($the_url,strlen($m[0]));
            }
            $this->header['Client-IP'] = $this->ip;
        }
        */

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $the_url);
        curl_setopt($ch, CURLOPT_HEADER, true);          // headers are split off again in get_data()
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS, self::$connecttimeout_ms);

        if ( preg_match('#^https://#i', $the_url) )
        {
            // NOTE(review): certificate and host verification are switched
            // off for https URLs, which allows man-in-the-middle attacks.
            // Kept for compatibility; enable verification when possible.
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        }

        if ( $this->cookies )
        {
            // set_cookies() accepts a ready-made string as well as an array.
            $cookie = is_array($this->cookies)
                ? http_build_query($this->cookies, '', ';')
                : (string)$this->cookies;
            curl_setopt($ch, CURLOPT_COOKIE, $cookie);
        }

        // An explicitly configured referer wins; otherwise a random fake one
        // is sent (previously it was written to an undeclared property and
        // therefore never sent).
        $referer = $this->referer ? $this->referer : "http://www.bd".mt_rand(1,9999).".com/";
        curl_setopt($ch, CURLOPT_REFERER, $referer);

        // Same for the user agent: set_agent() is honoured, otherwise a
        // randomised one is used.
        $agent = $this->agent
            ? $this->agent
            : "Mozilla/".mt_rand(1,100)." (Windows NT 6.1; rv:18.0) Gecko/20100101 Firefox/18.0";
        curl_setopt($ch, CURLOPT_USERAGENT, $agent);

        foreach ( $this->_option as $k => $v )
        {
            curl_setopt($ch, $k, $v);
        }

        /*
        Disabled: send a random client IP in request headers.

        $this->ip = mt_rand(10,100).".".mt_rand(10,200).".".mt_rand(10,200).".".mt_rand(1,200);
        $this->header['CLIENT-IP'] = $this->ip;
        $this->header['X-FORWARDER-FOR'] = $this->ip;
        */

        if ( $this->header )
        {
            $header = array();
            foreach ($this->header as $item)
            {
                # Keep only the last value of each header name
                if (preg_match('#(^[^:]*):.*$#', $item,$m))
                {
                    $header[$m[1]] = $item;
                }
            }
            curl_setopt($ch, CURLOPT_HTTPHEADER, array_values($header));
        }

        # POST payload for this URL, if any
        if (isset($this->_post_data[$url]))
        {
            curl_setopt($ch , CURLOPT_POSTFIELDS , $this->_post_data[$url]);
        }

        return $ch;
    }

    /**
     * Fetch several URLs concurrently.
     *
     * At most $multi_exec_num transfers run at once; the rest wait in a queue
     * and are started as earlier transfers finish. Handles are created only
     * when a transfer is actually started.
     *
     * @see http://cn.php.net/manual/en/function.curl-multi-exec.php#88453
     * @param array $urls    key => URL
     * @param int   $timeout per-transfer timeout in seconds
     * @return array key => body string, or false when the HTTP status is not 200
     */
    protected function request_urls($urls, $timeout = 10)
    {
        # De-duplicate
        $urls = array_unique($urls);
        if (!$urls)return array();

        $result = array();
        foreach ( $urls as $kw=>$url )
        {
            $result[$kw] = null;
            $this->http_data[$kw] = null;
        }

        $mh = curl_multi_init();
        $listener_list = array();   # key => handle currently in flight
        $pending = $urls;           # key => URL not started yet
        $running = null;

        do
        {
            # Top up the in-flight set from the queue. reset()/key()/unset()
            # is used instead of array_shift() because array_shift()
            # renumbers numeric keys, which would file the result of a
            # numeric keyword under the wrong key.
            while ( $pending && ($this->multi_exec_num<=0 || count($listener_list)<$this->multi_exec_num) )
            {
                reset($pending);
                $kw = key($pending);
                $current = $this->_create($pending[$kw], $timeout);
                unset($pending[$kw]);
                curl_multi_add_handle($mh, $current);
                $listener_list[$kw] = $current;
            }

            do
            {
                $execrun = curl_multi_exec($mh, $running);
            }
            while ( $execrun == CURLM_CALL_MULTI_PERFORM );
            if ( $execrun != CURLM_OK ) break;

            $finished = 0;
            while ( false !== ($done = curl_multi_info_read($mh)) )
            {
                $handle = $done['handle'];
                $done_kw = array_search($handle, $listener_list, true);
                if ( $done_kw !== false )
                {
                    $this->http_data[$done_kw] = $this->get_data(curl_multi_getcontent($handle), $handle);
                    $result[$done_kw] = ( $this->http_data[$done_kw]['code'] != 200 )
                        ? false
                        : $this->http_data[$done_kw]['data'];
                    unset($listener_list[$done_kw]);
                }
                # Detach from the multi handle before closing the easy handle
                curl_multi_remove_handle($mh, $handle);
                curl_close($handle);
                $finished++;
            }

            if ( !$pending && !$listener_list ) break;

            # Nothing completed this round: block until there is activity
            # instead of spinning. Some libcurl builds return -1 at once.
            if ( $finished === 0 && curl_multi_select($mh, 1.0) === -1 )
            {
                usleep(1000);
            }
        }
        while (true);

        # Only non-empty when the loop was left because of a multi error
        foreach ( $listener_list as $current )
        {
            curl_multi_remove_handle($mh, $current);
            curl_close($current);
        }
        curl_multi_close($mh);

        return $result;
    }

    /**
     * Return the per-request details collected so far.
     *
     * @return array key => array(code, data, header, time)
     */
    public function get_result_data()
    {
        return $this->http_data;
    }

    /**
     * Misspelled original name of get_result_data(), kept for existing callers.
     *
     * @return array
     */
    public function get_resut_data()
    {
        return $this->get_result_data();
    }

    /**
     * Split a raw response (headers + body) and collect transfer info.
     *
     * @param string $data raw response as returned by curl_multi_getcontent()
     * @param resource|CurlHandle $ch the finished handle
     * @return array with keys code, data, header, time
     */
    protected function get_data($data, $ch)
    {
        $data = (string)$data;
        $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);

        $result = array();
        $result['code'] = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $result['data'] = substr($data, $header_size);
        $result['header'] = explode("\r\n", substr($data, 0, $header_size));
        $result['time'] = curl_getinfo($ch, CURLINFO_TOTAL_TIME);
        return $result;
    }

    /**
     * Reset the per-request settings.
     */
    protected function clear_set()
    {
        $this->_option = array();
        $this->header = array();
        $this->ip = null;
        $this->cookies = null;
        $this->referer = null;
        $this->_post_data = array();
    }

    /**
     * Split the submitted textarea content into a list of keywords.
     *
     * Accepts \r\n, \r and \n line endings, trims every line and drops blank
     * lines so that a trailing newline does not produce an empty keyword.
     *
     * @param string $keyword raw textarea content, one keyword per line
     * @return array list of non-empty keywords
     **/
    public function get_keywords($keyword){
        $this->keywords = array();
        foreach(preg_split('/\r\n|\r|\n/', (string)$keyword) as $line){
            $line = trim($line);
            if($line !== ''){
                $this->keywords[] = $line;
            }
        }
        return $this->keywords;
    }

    /**
     * Build the Baidu search URL for every keyword.
     *
     * @param array $keywords
     * @return array keyword => search URL (accumulated on the instance)
     **/
    public function get_urls($keywords){
        foreach($keywords as $word){
            // The keyword goes into a query string, so it must be URL-encoded
            // (spaces, '&', '#' and non-ASCII text would otherwise break it).
            $this->urls[$word] = "http://www.baidu.com/s?wd=".urlencode($word)."&cl=3&pn=0&rn=50";
        }
        return $this->urls;
    }

    /**
     * Determine the rank of $findurl in each result page.
     *
     * The rank is the 1-based index of the first <span class="g"> element
     * whose text contains $findurl; 0 means not found (or the page could not
     * be fetched). Every keyword always gets an entry.
     *
     * @param array  $serp    keyword => page source (false/null when the fetch failed)
     * @param string $findurl site to look for, e.g. 39.net (without http://)
     * @return array keyword => rank
     **/
    public function get_rank(array $serp, $findurl){
        $findurl = (string)$findurl;
        $pattern = "#<span class=\"g\">.*<\/span>#U";
        $ranks = array();
        foreach($serp as $keyword=>$source){
            $ranks[$keyword] = 0;
            if($findurl === '' || !is_string($source) || $source === ''){
                continue;
            }
            if(!preg_match_all($pattern, $source, $m)){
                continue;
            }
            foreach($m[0] as $k=>$v){
                if(strpos($v, $findurl) !== false){
                    $ranks[$keyword] = $k+1;
                    break;
                }
            }
        }
        return $ranks;
    }
}

// Handle the form submission before any markup is sent so the result can be
// rendered inside <body>.
$result_html = '';
if(isset($_POST['sub'], $_POST['keyword'], $_POST['url'])
    && is_string($_POST['keyword']) && is_string($_POST['url'])){
    $start_time = microtime_float();
    $findurl = trim($_POST['url']);
    $httpcurl = new Core_HttpClient_Driver_Curl();
    $keywords = $httpcurl->get_keywords($_POST['keyword']);   // list of keywords
    $urls = $httpcurl->get_urls($keywords);                   // keyword => search URL
    $data = $httpcurl->get($urls);                            // keyword => page source
    $ranks = $httpcurl->get_rank($data, $findurl);            // keyword => rank

    $con_time = microtime_float() - $start_time;
    $result_html = "查询耗时:".$con_time;

    $result_html .= "<table border='1' bordercolor='green' cellspacing='0'><tr><th>序号</th><th>关键词</th><th>排名</th></tr>";
    $i = 1;
    foreach($ranks as $keyword=>$rank){
        // Keywords are user input and must be escaped before being echoed.
        $safe_keyword = htmlspecialchars((string)$keyword, ENT_QUOTES, 'UTF-8');
        $result_html .= "<tr><td>{$i}</td><td>{$safe_keyword}</td><td>{$rank}</td></tr>";
        $i++;
    }
    $result_html .= "</table>";
}
?>
<html>
<head>
<meta http-equiv="content-type" content="text/html;charset=utf-8" />
<title>百度关键词排名批量查询</title>
</head>
<body>
<h3>百度关键词排名批量查询</h3>
<form action="test5.php" method="post">
    输入关键词(每行一个)<br />
    <textarea name="keyword" style="resize:none;width:243px;height:70px;"></textarea>
    <br />输入网址:
    <input type="text" name="url" size="20" value="39.net" />(如:39.net 勿加http://)<br />
    <input type="submit" name="sub" value="查询" />
</form>
<hr />
<?php echo $result_html; ?>
</body>
</html>
防止被百度封 IP 的另一个思路:把搜索请求 URL 中的域名换成 IP,例如把 http://www.baidu.com/ 改成 http://115.239.210.26/。百度的 IP 很多,可以全部整理出来轮换使用;下回再尝试这样是否可行(上面的 curl 类中刚好有一段把域名换成 IP 的代码,目前处于注释状态)。