项目下载地址:http://code.google.com/p/phpquery/
获取内容的方法:
第一种:newDocumentFile
phpQuery::newDocumentFile($url);
第二种:newDocumentHTML(先自行获取内容,再交给phpQuery解析)
$content = file_get_contents($url); $htmlObj = phpQuery::newDocumentHTML($content);
获取网页内容:
第一种:获取节点的html内容
pq('title')->html()
第二种:获取script内容(返回数组,每个匹配到的script标签对应一个元素)
pq("script")->getString();
突破防爬虫
/**
 * Fetch a URL with cURL while presenting the request as if it came from
 * the Baidu crawler (spoofed forwarding headers, User-Agent and Referer).
 *
 * Note: the X-FORWARDED-FOR / CLIENT-IP headers are only hints that some
 * servers choose to trust; they do not change the real source address of
 * the connection.
 *
 * @param string $url Address of the page to download.
 *
 * @return string|bool The response body on success, or false when the
 *                     cURL handle cannot be created or the transfer fails.
 */
function _get_fake_apider($url)
{
    $ch = curl_init();
    if ($ch === false) {
        // Could not create a handle; report failure the same way curl_exec() does.
        return false;
    }

    // IP address the original author attributes to the Baidu spider.
    $ip = '115.239.211.112';
    // Used for both the connect timeout and the overall transfer timeout (seconds).
    $timeout = 15;

    curl_setopt_array($ch, array(
        CURLOPT_URL            => $url,
        CURLOPT_TIMEOUT        => $timeout,
        CURLOPT_CONNECTTIMEOUT => $timeout,
        // Pretend the request was forwarded on behalf of the Baidu spider IP.
        CURLOPT_HTTPHEADER     => array('X-FORWARDED-FOR:' . $ip, 'CLIENT-IP:' . $ip),
        // Present the Baidu spider User-Agent string.
        CURLOPT_USERAGENT      => "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        // Return the body from curl_exec() instead of printing it.
        CURLOPT_RETURNTRANSFER => 1,
        // Do not include the response headers in the returned body.
        CURLOPT_HEADER         => 0,
        // Fake the referring page (stray trailing space from the original removed).
        CURLOPT_REFERER        => "http://www.baidu.com/",
    ));

    return curl_exec($ch);
}