zoukankan      html  css  js  c++  java
  • 记一次爬取豆瓣电影详情

    帮朋友爬取豆瓣电影的介绍里面的内容,废话不多说了,上代码

    ----

    简单的爬取分为两个文件

    fetch.php

    <?php
    require "./getfunction.php";
    // Movie title to look up. A suggestion is accepted only when its title is
    // exactly equal to this string.
    $name = "复仇者联盟3:无限战争";
    // URL-encode the title before placing it in the query string: it contains
    // non-ASCII characters and a colon, which must not be sent raw.
    $url = "https://movie.douban.com/j/subject_suggest?q=".urlencode($name);
    $curl = curl_init(); // one cURL handle, reused for both requests below
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_HEADER, 0);          // body only, no response headers
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);  // return the body instead of printing it
    // NOTE(review): the next two options switch OFF certificate verification and
    // host-name verification, so the HTTPS peer is not authenticated. Kept as in
    // the original; enable both if a CA bundle is available.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
    $tmpInfo = curl_exec($curl); // raw response body of the suggestion endpoint

    // A transport failure returns false; report it the same way the second
    // request does instead of feeding false into json_decode().
    if ($tmpInfo === false) {
        echo "<br />cURL error number:" .curl_errno($curl);
        echo "<br />cURL error:" . curl_error($curl);
        exit;
    }
    $tmpInfo = json_decode($tmpInfo);

    // Keep only suggestions that carry the exact title and a detail-page URL.
    // Anything that did not decode to a list is treated as "no results".
    $arrat_res = [];
    if (is_array($tmpInfo)) {
        foreach ($tmpInfo as $v) {
            if (isset($v->title, $v->url) && $name === $v->title) {
                $arrat_res[] = $v;
            }
        }
    }
    if (empty($arrat_res)) {
        $data = [
            "code"=>10001,
            "msg"=>"暂无片源信息"
        ];
        echo json_encode($data);die;
    }

    // Fetch the detail page of the first exact match. Only the URL changes: the
    // other options set above stay in effect on the same handle.
    $url2 = $arrat_res[0]->url;
    curl_setopt($curl, CURLOPT_URL, $url2);
    $tmpInfo2 = curl_exec($curl); // HTML of the movie detail page

    // false (transport error) and an empty body are both treated as failure.
    if (!$tmpInfo2) {
        echo "<br />cURL error number:" .curl_errno($curl);
        echo "<br />cURL error:" . curl_error($curl);
        exit;
    }
    // Create a DOMDocument to hold the fetched HTML page.
    $dom = new DOMDocument();
    // Collect libxml's parse warnings internally while loading instead of
    // silencing every error with the @ operator; the previous setting is
    // restored afterwards so the rest of the script is unaffected.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($tmpInfo2);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);
    // Normalize the document.
    $dom->normalize();

    // Wrap the document in a DOMXPath object, used for all queries below.
    $xpath = new DOMXPath($dom);
    
    // Helper: joins the nodeValue of every node in a list with commas.
    // An empty list (or a failed query) yields "".
    $joinNodeValues = function ($nodes) {
        if (!($nodes instanceof DOMNodeList)) {
            return "";
        }
        $values = [];
        for ($i = 0; $i < $nodes->length; $i++) {
            $values[] = $nodes->item($i)->nodeValue;
        }
        return implode(",", $values);
    };

    // Helper: nodeValue of the first node in a list, or "" when nothing matched.
    // Returning "" guarantees the variables below are always strings; leaving
    // the DOMNodeList in place would break later string concatenation.
    $firstNodeValue = function ($nodes) {
        if (!($nodes instanceof DOMNodeList) || $nodes->length === 0) {
            return "";
        }
        return $nodes->item(0)->nodeValue;
    };

    // Directors: link texts inside the first span of #info.
    $directors_res = $joinNodeValues(
        $xpath->evaluate("//*[@id='info']/span[1]/span[2]/a/text()")
    );

    // Title: first span of the page heading.
    $name = $firstNodeValue($xpath->evaluate("//*[@id='content']/h1/span[1]/text()"));

    // Year: second span of the page heading.
    $years = $firstNodeValue($xpath->evaluate("//*[@id='content']/h1/span[2]/text()"));

    // Poster: src attribute of the image under #mainpic.
    $img = $firstNodeValue($xpath->evaluate("//*[@id='mainpic']/a/img/@src"));

    // Release status: 1 when the rating box reads "尚未上映", otherwise 2.
    // When the box is missing altogether the status also falls back to 2.
    $statusText = $firstNodeValue(
        $xpath->evaluate("//*[@id='interest_sectl']/div/div[2]/div/div[2]")
    );
    $is_on = (trim($statusText) === "尚未上映") ? 1 : 2;

    // Screenwriters: link texts inside the second span of #info.
    $screenwriters_res = $joinNodeValues(
        $xpath->evaluate("//*[@id='info']/span[2]/span[2]/a/text()")
    );

    // Actors: text content of the value span inside the third span of #info.
    $actors_res = $joinNodeValues(
        $xpath->query("//*[@id='info']/span[3]/span[2]")
    );
    
    // Genres: collect the #info spans from the 5th onward until the
    // "制片国家/地区:" label (getRes also stops collecting at "官方网站:").
    $sear_res = getFunction::getRes(5,"制片国家/地区:",$xpath);
    $types_res = $sear_res["res"];
    // getRes returns "" as "num" when no label matched; cast it so the offset
    // arithmetic below always operates on an integer.
    $num = (int) $sear_res["num"];

    // Free-text values of #info: keep every direct text node that contains at
    // least one CJK character. The code points need the \x{...} escape form;
    // without the backslashes the pattern is an invalid character class.
    $attr = [];
    $langs = $xpath->evaluate("//*[@id='info']/text()");
    for ($i = 0; $i < $langs->length; $i++) {
        $lang = $langs->item($i)->nodeValue;
        if (preg_match('/[\x{4e00}-\x{9fa5}]/u', $lang) === 1) {
            $attr[] = $lang;
        }
    }

    if ($is_on === 1) {
        // Not released yet: there is no runtime, so collect everything up to
        // the "又名:" label as the release-date value.
        $show_res = "";
        $sear2_res = getFunction::getRes($num+4,"又名:",$xpath);
        $time_res = $sear2_res["res"];
        $num = (int) $sear2_res["num"];
    } else {
        // Release dates: spans up to the "片长:" label.
        $sear2_res = getFunction::getRes($num+4,"片长:",$xpath);
        $time_res = $sear2_res["res"];
        $num = (int) $sear2_res["num"];

        // Runtime: spans after "片长:" up to the "又名:" label.
        $sear3_res = getFunction::getRes($num+1,"又名:",$xpath);
        $show_res = $sear3_res["res"];
        $num = (int) $sear3_res["num"];
    }

    // Map the collected text nodes onto fields. With four entries the third
    // one is extra runtime text and the alias moves to the fourth slot.
    // Missing entries fall back to "" instead of reading an undefined index.
    $country = isset($attr[0]) ? $attr[0] : "";
    $languages = isset($attr[1]) ? $attr[1] : "";
    if (count($attr) == 4) {
        $show_res = $show_res.$attr[2];
        $byname = $attr[3];
    } else {
        $byname = isset($attr[2]) ? $attr[2] : "";
    }
    
    // Links inside #info. When two links exist, the first goes to "web_url"
    // and the second to "imbd"; when only one exists it goes to "imbd" and
    // "web_url" stays empty. Both values are always strings, so a missing link
    // is emitted as "" rather than as an encoded DOMNodeList object.
    $urls = "";
    $urlim = "";
    $firstLink = $xpath->evaluate("//*[@id='info']/a[1]/@href");
    $secondLink = $xpath->evaluate("//*[@id='info']/a[2]/@href");
    $firstHref = empty($firstLink->length) ? "" : $firstLink->item(0)->nodeValue;
    if (!empty($secondLink->length)) {
        $urlim = $secondLink->item(0)->nodeValue;
        $urls = $firstHref;
    } else {
        $urlim = $firstHref;
    }

    // Assemble the scraped fields into the response payload.
    $final_res = [
        "all_name" => $name.$years,
        "name" => $name,
        "year" => $years,
        "img" => $img,
        "directors" => $directors_res,
        "screenwriters" => $screenwriters_res,
        "actors" => $actors_res,
        "types" => $types_res,
        "web_url" => $urls,
        "country" => $country,
        "languages" => $languages,
        "ontime" => $time_res,
        "showtime" => $show_res,
        "byname" => $byname,
        "imbd" => $urlim
    ];

    // Emit the success envelope as JSON.
    $return = ["code"=>0, "msg"=>"抓取成功", "data"=>$final_res ];
    echo json_encode($return);
    

    getfunction.php

    <?php
    class getFunction{
      /**
       * Walks the direct <span> children of the element with id "info",
       * from position $start up to position 29, and gathers their text.
       *
       * Text is appended to the result (comma separated) until a span whose
       * text equals $key or "官方网站:" is seen; from then on nothing more is
       * collected. The scan still continues to the end, so "num" holds the
       * position of the LAST span that matched either label.
       *
       * @param int       $start 1-based span position to start from
       * @param string    $key   label text that ends the collection
       * @param DOMXPath  $xpath XPath object bound to the parsed page
       * @return array ["res" => collected text, "num" => matched position, or "" if none]
       */
      public static function getRes($start,$key,$xpath){
        $collected = "";
        $matchedAt = "";
        for ($pos = $start; $pos < 30; $pos++) {
          $nodes = $xpath->query("//*[@id='info']/span[".$pos."]");
          if (empty($nodes->length)) {
            // No span at this position.
            continue;
          }
          $text = $nodes->item(0)->nodeValue;
          if ($text == $key || $text == "官方网站:") {
            // A label span: remember where it was, never collect it.
            $matchedAt = $pos;
            continue;
          }
          if (!empty($matchedAt)) {
            // A label was already seen; values after it belong to other fields.
            continue;
          }
          // The span at the start position opens the result; later ones are appended.
          $collected = ($pos != $start) ? $collected.",".$text : $text;
        }
        return ["res"=>$collected,"num"=>$matchedAt];
      }
    }
    

    效果图

    testimg

  • 相关阅读:
    Android sdk 下载路径
    centos修改用户用户组
    centos7 通过shell切换root用户
    java 服务上传图片到linux没有读写权限
    Mybatis第二天
    Mybatis第一天
    反射
    注解
    多线程第二天
    java---过滤器、监听器
  • 原文地址:https://www.cnblogs.com/jhcyzxx/p/10480044.html
Copyright © 2011-2022 走看看