zoukankan      html  css  js  c++  java
  • php利用curl获取网页title内容

    /*
     * Usage example:
     *   $html  = curl_get_file_contents($url);
     *   $title = get_title_contents($html);
     *   var_dump($title);
     */
    /**
     * Fetch a URL with cURL and return the response body, following HTTP
     * redirects manually (up to three hops per call).
     *
     * Redirects are followed by hand rather than with CURLOPT_FOLLOWLOCATION,
     * so the hop limit is enforced here. The redirect counter is local to each
     * call; it is not shared between calls.
     *
     * NOTE(security): TLS peer and host verification are disabled, as in the
     * original implementation, so pages behind self-signed or broken
     * certificates can still be fetched. Do not use this for anything where
     * the authenticity of the remote host matters.
     *
     * @param string $url     Absolute http(s) URL to fetch.
     * @param string $referer Optional Referer header; sent on every hop.
     * @return string|false   Response body of the final response; the body of
     *                        the redirect response itself when its Location
     *                        header is missing or unusable; false when the
     *                        transfer fails or the redirect limit is exceeded.
     */
    function curl_get_file_contents($url, $referer = '') {
        $max_redirects = 3;
        $useragent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36";

        for ($redirects = 0; ; $redirects++) {
            $ch = curl_init();
            if ($ch === false) {
                return false;
            }
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_HEADER, true);           // headers are needed to read Location
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);  // certificate is not verified (see NOTE above)
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);      // host name is not verified (see NOTE above)
            curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_REFERER, $referer);

            $response    = curl_exec($ch);
            $http_code   = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $header_size = (int) curl_getinfo($ch, CURLINFO_HEADER_SIZE);
            curl_close($ch);

            if ($response === false) {
                return false;
            }

            // Split on the header size reported by cURL instead of searching
            // for a blank line: the response may contain several header
            // blocks (e.g. "100 Continue" or a proxy reply) before the body.
            $header = (string) substr($response, 0, $header_size);
            $body   = (string) substr($response, $header_size);

            if (!in_array($http_code, array(301, 302, 303, 307, 308), true)) {
                return $body;
            }

            // Header names are case-insensitive; take the last Location seen
            // so that it belongs to the final header block.
            if (!preg_match_all('/^Location:\s*(.*?)\s*$/mi', $header, $matches) || !count($matches[1])) {
                return $body;
            }
            $next_url = _curl_resolve_redirect_url($url, end($matches[1]));
            if ($next_url === false) {
                return $body;
            }

            if ($redirects >= $max_redirects) {
                return false;
            }
            $url = $next_url;
        }
    }

    /**
     * Turn a Location header value into an absolute http(s) URL.
     *
     * @param string $base_url URL of the request that produced the redirect.
     * @param string $location Raw Location header value (absolute or relative).
     * @return string|false    Absolute URL, or false when it cannot be resolved
     *                         or does not use the http/https scheme.
     */
    function _curl_resolve_redirect_url($base_url, $location) {
        $location = trim((string) $location);
        if ($location === '') {
            return false;
        }

        $target = parse_url($location);
        if ($target === false) {
            return false;
        }
        if (isset($target['scheme'])) {
            // Absolute target: only follow web URLs, never file:// and the like.
            $scheme = strtolower($target['scheme']);
            if (($scheme !== 'http' && $scheme !== 'https') || !isset($target['host'])) {
                return false;
            }
            return $location;
        }

        $base = parse_url($base_url);
        if (!$base || !isset($base['scheme']) || !isset($base['host'])) {
            return false;
        }
        $origin = $base['scheme'] . '://' . $base['host'] . (isset($base['port']) ? ':' . $base['port'] : '');
        $base_path = (isset($base['path']) && $base['path'] !== '') ? $base['path'] : '/';

        if (strpos($location, '//') === 0) {
            // Scheme-relative: "//host/path"
            return $base['scheme'] . ':' . $location;
        }
        if ($location[0] === '/') {
            // Host-relative: "/path"
            return $origin . $location;
        }
        if ($location[0] === '?') {
            // Query-only: same path, new query string
            return $origin . $base_path . $location;
        }
        // Path-relative: resolve against the directory of the current path.
        $directory = substr($base_path, 0, strrpos($base_path, '/') + 1);
        return $origin . $directory . $location;
    }
    /**
     * Extract the text of the <title> element from an HTML document as UTF-8.
     *
     * The title is looked up inside the document's <head> section. If a
     * <meta> tag in the head declares a charset other than UTF-8, the title is
     * converted from that charset to UTF-8 with iconv(); when the conversion
     * is not possible the title is returned unconverted. HTML entities are
     * decoded and surrounding whitespace is trimmed.
     *
     * @param string $html Raw HTML markup, in the page's own encoding.
     * @return string The decoded title, or a fixed diagnostic message when the
     *                <head> section or the <title> element cannot be found.
     */
    function get_title_contents($html) {
        // A failed fetch may hand us false; treat it as an empty document.
        $html = (string) $html;

        // Isolate the <head> section. "\b" keeps "<header>" from matching and
        // the "s" flag lets the head span multiple lines.
        if (!preg_match('/<head\b[^>]*>(.*?)<\/head>/is', $html, $head_match)) {
            return "无法解析数据中的 <head> 区段";
        }
        $head = $head_match[1];

        // Pull the <title> text; the tag may carry attributes or span lines.
        if (!preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $head, $title_match)) {
            return "无法解析 <title> 的内容";
        }
        $title = trim($title_match[1]);

        // Charset declared via <meta charset="..."> or
        // <meta http-equiv="Content-Type" content="text/html; charset=...">.
        $charset = 'None';
        if (preg_match('/<meta\b[^>]*?charset\s*=\s*["\']?\s*([a-zA-Z0-9_:.-]+)/i', $head, $charset_match)) {
            $charset = $charset_match[1];
        }

        // Convert to UTF-8 only when a different charset was declared. iconv
        // raises a warning and returns false for unknown charsets or invalid
        // byte sequences; in that case keep the original bytes.
        if ($charset !== 'None' && strcasecmp($charset, 'UTF-8') !== 0 && function_exists('iconv')) {
            $converted = @iconv($charset, 'UTF-8', $title);
            if ($converted !== false) {
                $title = $converted;
            }
        }

        // Explicit flags and charset so the result does not depend on the
        // PHP version's html_entity_decode() defaults.
        return html_entity_decode($title, ENT_QUOTES, 'UTF-8');
    }
    

      支持https,302跳转

  • 相关阅读:
    Git单独checkout子目录
    GitHub上有很多不错的iOS开源项目
    iOS image caching. Libraries benchmark (SDWebImage vs FastImageCache)
    hosts etc css-js
    CFNetwork SSLHandshake failed (-9824) ios 9
    javascript基础---常见的语法
    Javascript 基础—变量 &运算符
    nodejs 版本dockerfile 文件制作,和常用命令
    pm2 的使用
    Nodejs 中常见的加密算法:RSA(1)
  • 原文地址:https://www.cnblogs.com/as3lib/p/6829208.html
Copyright © 2011-2022 走看看