zoukankan      html  css  js  c++  java
  • 自己写的一个基于 phpQuery 的 PHP 通用采集类

    <?php
     
      /**
      *通用列表采集类
      *版本V1.3
      *作者:JAE
      *博客:http://blog.jaekj.com
      */
        require_once '../phpQuery/phpQuery/phpQuery.php';
        /**
         * Generic list scraper: downloads a page with cURL and extracts fields
         * from it using phpQuery selectors.
         *
         * Results are collected in the public $jsonArr property as a list of
         * rows, each row being an array of "field name" => extracted value.
         */
        class QueryList{

            /** URL that was requested by the constructor. */
            private $pageURL;
            /** Field rules: array("name" => array("selector", "type"), ...). */
            private $regArr = array();
            /** Extracted rows; emptied again by every setQuery() call. */
            public $jsonArr = array();
            /** Optional block selector; empty means "select on the whole page". */
            private $regRange;
            /** Response body as returned by curl_exec() (false when the request failed). */
            private $html;

            /**
             * Download the page and, when rules are supplied, extract immediately.
             *
             * Declared as __construct(): a method merely named after the class is
             * no longer run as a constructor on PHP 8, which would leave the
             * object without any downloaded page.
             *
             * @param string $pageURL  Address of the page to fetch.
             * @param array  $regArr   Field rules, array("name" => array("selector", "type"), ...)
             *                         where type is "text", "html" or an attribute name.
             * @param string $regRange Block selector: the blocks are selected first,
             *                         then every rule is applied inside each block.
             */
            public function __construct($pageURL,$regArr=array(),$regRange='')
            {
                $this->pageURL = $pageURL;

                // cURL is used so that https:// pages can be fetched as well.
                // NOTE(review): certificate and host verification are switched off
                // here, so the downloaded content is not authenticated.
                $ch = curl_init();
                curl_setopt($ch, CURLOPT_URL, $this->pageURL);
                curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
                curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
                curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
                $this->html = curl_exec($ch);
                curl_close($ch);

                if (!empty($regArr)) {
                    $this->regArr = $regArr;
                    $this->regRange = $regRange;
                    $this->getList();
                }
            }

            /**
             * Run a new set of rules against the page that was already downloaded.
             *
             * The previous content of $jsonArr is discarded.
             *
             * @param array  $regArr   Field rules (same format as the constructor).
             * @param string $regRange Optional block selector.
             */
            public function setQuery($regArr,$regRange='')
            {
                $this->jsonArr = array();
                $this->regArr = $regArr;
                $this->regRange = $regRange;
                $this->getList();
            }

            /**
             * Apply the current rules to the downloaded page and fill $jsonArr.
             *
             * With a block selector, row N holds the values found inside the N-th
             * block. Without one, every rule is matched against the whole page and
             * its N-th match is stored in row N.
             */
            private function getList()
            {
                // Nothing usable was downloaded (request failed or empty body):
                // leave the result empty instead of handing false to phpQuery.
                if (!is_string($this->html) || $this->html === '') {
                    return;
                }
                if (!is_array($this->regArr)) {
                    return;
                }

                $hobj = phpQuery::newDocumentHTML($this->html);

                if (!empty($this->regRange)) {
                    $i = 0;
                    foreach (pq($hobj)->find($this->regRange) as $item) {
                        // foreach replaces the removed each(); no pointer reset is needed.
                        foreach ($this->regArr as $key => $reg_value) {
                            $iobj = pq($item)->find($reg_value[0]);
                            $this->jsonArr[$i][$key] = $this->extractValue($iobj, $reg_value[1]);
                        }
                        $i++;
                    }
                    return;
                }

                foreach ($this->regArr as $key => $reg_value) {
                    $i = 0;
                    foreach (pq($hobj)->find($reg_value[0]) as $item) {
                        $this->jsonArr[$i][$key] = $this->extractValue($item, $reg_value[1]);
                        $i++;
                    }
                }
            }

            /**
             * Read one value from a matched node according to the rule type.
             *
             * @param mixed  $node Node or phpQuery selection; it is wrapped with pq().
             * @param string $type "text", "html", or the name of the attribute to read.
             * @return mixed Trimmed text or HTML, or the attribute value as returned by attr().
             */
            private function extractValue($node, $type)
            {
                switch ($type) {
                    case 'text':
                        return trim(pq($node)->text());
                    case 'html':
                        return trim(pq($node)->html());
                    default:
                        return pq($node)->attr($type);
                }
            }

            /**
             * @return string JSON encoding of $jsonArr.
             */
            public function getJSON()
            {
                return json_encode($this->jsonArr);
            }

    }

    使用演示 

    <?php
    require 'Query/QueryList.class.php';
     
     
    // Scrape the OSChina code-sharing list: title, link and author of each entry.
    $listUrl = "http://www.oschina.net/code/list";
    $listRules = array(
        "title"  => array(".code_title a:eq(0)", "text"),
        "url"    => array(".code_title a:eq(0)", "href"),
        "author" => array("img", "title"),
    );
    $listScraper = new QueryList($listUrl, $listRules, ".code_list li");
    print_r($listScraper->jsonArr);

    // The same downloaded page can be queried again, e.g. for the avatars of the
    // "TOP40 active contributors" box on the right, this time returned as JSON.
    $listScraper->setQuery(array("portrait" => array(".hot_top img", "src")));
    echo $listScraper->getJSON() . "<hr/>";

    // Scrape a single OSChina content page: its title and its body HTML.
    $pageUrl = "http://www.oschina.net/code/snippet_186288_23816";
    $pageRules = array(
        "title" => array(".QTitle h1", "text"),
        "con"   => array(".Content", "html"),
    );
    $pageScraper = new QueryList($pageUrl, $pageRules);
    print_r($pageScraper->jsonArr);

    // That is enough examples to show how convenient scraping becomes.

    下面是基于它写着玩的一个类,功能类似于搜索引擎 API

    <?php
     
     /**
      *自己写的百度和谷歌搜索API
      *版本V2.0
      *作者:JAE
      *博客:http://blog.jaekj.com
      **/
    require_once 'QueryList_class.php';
       /**
        * Small Baidu / Google search wrapper built on QueryList.
        *
        * After construction $jsonArr holds an envelope with the keys
        * num, page, zNum (total hits), zPage (total pages), s (keyword),
        * other (author info) and data (the result rows: title, tCon, url).
        */
       class Searcher
       {
          /** Engine name as passed in: 'baidu' or 'google'. */
          private $searcher;
          /** Search keyword. */
          private $key;
          /** Number of results requested per page. */
          private $num;
          /** Zero-based page index (the constructor argument minus one). */
          private $page;
          /** Field rules handed to QueryList for the result rows. */
          private $regArr;
          /** Block selector that isolates one search result. */
          private $regRange;
          /** Rule used to read the "total results" text. */
          private $regZnum;
          /** Result envelope, see the class comment. */
          public $jsonArr;

          /**
           * Run a search and store the result in $jsonArr.
           *
           * Declared as __construct(): a method merely named after the class is
           * no longer run as a constructor on PHP 8.
           *
           * @param string $searcher Search engine: 'baidu' or 'google'.
           * @param string $key      Search keyword.
           * @param int    $num      Number of results per page.
           * @param int    $page     Page number, starting at 1.
           */
          public function __construct($searcher,$key,$num,$page)
          {
              if($searcher=='baidu')
              {
                  $this->regArr = array("title"=>array("h3.t a,#ting_singlesong_box a","text"),"tCon"=>array("div.c-abstract,font:slice(0,2),div#weibo,table tr:eq(0),div.c-abstract-size p:eq(0),div.vd_sitcom_new_tinfo","text"),"url"=>array("h3.t a,#ting_singlesong_box a","href"));
                  $this->regRange = 'table.result,table.result-op';
                  $this->regZnum=array("zNum"=>array("span.nums","text"));
              }
              else if($searcher=='google')
              {
                  $this->regArr = array("title"=>array("h3.r a","text"),"tCon"=>array("span.st","text"),"url"=>array("h3.r a","href"));
                  $this->regRange = 'li.g';
                  $this->regZnum=array("zNum"=>array("div#resultStats","text"));
              }
              $this->searcher = $searcher;
              $this->key = $key;
              $this->num  = $num;
              $this->page = $page-1;
              $this->getList();
          }

          /**
           * Fetch the result page, resolve the result links and build the envelope.
           *
           * For an engine other than 'baidu' or 'google' no request is made and
           * the envelope is returned with zero totals and an empty data list.
           */
          private function getList()
          {
              $s = urlencode($this->key);
              $num = $this->num;
              $start = $this->num*$this->page;

              $url = null;
              $reg_znum = null;
              if($this->searcher=='baidu')
              {
                  $url = "http://www.baidu.com/s?pn=$start&rn=$num&wd=$s";
                  // \d (digit), not a literal "d": first run of digits and commas.
                  $reg_znum='/[\d,]+/';
              }
              else if($this->searcher=='google')
              {
                  $url="https://www.google.com.hk/search?filter=0&lr=&newwindow=1&safe=images&hl=en&as_qdr=all&num=$num&start=$start&q=$s";
                  $reg_znum='/([\d,]+) result(s)?/';
              }

              $rows = array();
              $zNum = 0;
              if ($url !== null)
              {
                  $searcherObj = new QueryList($url,$this->regArr,$this->regRange);
                  $rows = $searcherObj->jsonArr;

                  // Replace each engine redirect link with the address it points to.
                  foreach ($rows as $i => $row)
                  {
                      $link = isset($row['url']) ? $row['url'] : null;
                      if($this->searcher=='baidu')
                      {
                          $rows[$i]['url'] = $this->getBaiduRealURL($link);
                      }
                      else
                      {
                          $rows[$i]['url'] = $this->getGoogleRealURL($link);
                      }
                  }

                  // Total number of results, read from the same downloaded page.
                  $searcherObj->setQuery($this->regZnum);
                  $zText = isset($searcherObj->jsonArr[0]['zNum']) ? $searcherObj->jsonArr[0]['zNum'] : '';
                  $zNum = $this->parseTotal($zText, $reg_znum);
              }

              // Total number of pages; guarded so a page size of 0 cannot divide by zero.
              $zPage = $this->num > 0 ? ceil($zNum/$this->num) : 0;

              $this->jsonArr=array('num'=>$this->num,'page'=>((int)$this->page+1),'zNum'=>$zNum,'zPage'=>$zPage,"s"=>"$this->key",'other'=>array('author'=>'JAE','QQ'=>'734708094','blog'=>'http://blog.jaekj.com'),'data'=>$rows);
          }

          /**
           * Extract the total result count from the engine's statistics text.
           *
           * @param mixed  $text    Text of the statistics element.
           * @param string $pattern Regex; capture group 1 is used when present,
           *                        otherwise the whole match.
           * @return int The count with thousands separators removed, 0 when not found.
           */
          private function parseTotal($text, $pattern)
          {
              if (!is_string($text) || !preg_match($pattern, $text, $m))
              {
                  return 0;
              }
              $digits = isset($m[1]) ? $m[1] : $m[0];
              return (int)str_replace(',', '', $digits);
          }

          /**
           * @return string JSON encoding of the result envelope.
           */
          public function getJSON()
          {
              return json_encode($this->jsonArr);
          }

          /**
           * Resolve a Baidu redirect link to the address it redirects to.
           *
           * @param mixed $url Link taken from the result page.
           * @return mixed The first Location header when the response status is
           *               301 or 302, otherwise $url unchanged.
           */
          private function getBaiduRealURL($url)
          {
              if (!is_string($url) || $url === '')
              {
                  return $url;
              }
              $header = get_headers($url,1);
              if ($header === false || !isset($header[0]))
              {
                  return $url;
              }
              if (!preg_match('/^HTTP\/\S+\s+30[12]\b/', $header[0]))
              {
                  return $url;
              }
              // Header names are case-insensitive; normalise before looking one up.
              $header = array_change_key_case($header, CASE_LOWER);
              if (!isset($header['location']))
              {
                  return $url;
              }
              $location = $header['location'];
              // Several redirects yield a list of Location values; keep the first.
              return is_array($location) ? $location[0] : $location;
          }

          /**
           * Extract the target address from a Google result link.
           *
           * @param mixed $url Link taken from the result page.
           * @return mixed URL-decoded value of the q= parameter (up to the next
           *               "&"), or $url unchanged when the pattern does not match.
           */
          private function getGoogleRealURL($url)
          {
              if (!is_string($url))
              {
                  return $url;
              }
              $reg_url = '/q=(.+)&/U';
              return preg_match($reg_url,$url,$arr)?urldecode($arr[1]):$url;
          }
     }
    // $hj = new Searcher('google','oschina',20,2);
     // print_r( $hj->jsonArr);
    //效果演示地址
    //http://blog.jaekj.com//jae/demo/searcher/Searcher_class.php?searcher=baidu&s=jaekj&num=20&page=1
  • 相关阅读:
    go-go协程
    linux-pclint代码检测
    linux-32位-交叉编译openssl
    go-json类
    mysql-定时任务
    go-IO操作
    go-异常处理-error-panic-recover
    go-defer语句
    go-select
    go-指针
  • 原文地址:https://www.cnblogs.com/shanyansheng/p/5474128.html
Copyright © 2011-2022 走看看