- 学习爬虫之前, 需要先大致了解一下guzzle https://guzzle-cn.readthedocs.io/zh_CN/latest/quickstart.html#id2
- dom-crawler 简单介绍 https://blog.csdn.net/wz947324/article/details/79928853
- 使用guzzle 和 dom-crawler 实现的简单爬虫 https://www.iamle.com/archives/2202.html
-
<?php

/* Reference article: https://www.iamle.com/archives/2202.html */

require_once __DIR__.'/vendor/autoload.php';

use GuzzleHttp\Client;                     // FIX: namespace separators were missing
use Symfony\Component\DomCrawler\Crawler;  // FIX: namespace separators were missing

print_r(json_encode(Spider(), JSON_UNESCAPED_UNICODE));
//print_r(Spider());

/**
 * Crawl a single Douban movie page and extract structured movie data.
 *
 * Downloads the page with Guzzle, then uses DomCrawler XPath expressions to
 * pull out the name, cover, director(s), screenwriter, main actors, release
 * dates, introduction and the celebrity list.
 *
 * @return array extracted fields; may be partial if an XPath lookup fails
 */
function Spider()
{
    // Page to crawl.
    $url = 'https://movie.douban.com/subject/25812712/?from=showing';

    // Download the page. A spider-like User-Agent is sent — presumably the
    // site rejects the default Guzzle UA (TODO confirm).
    $client = new Client([
        'timeout' => 10,
        'headers' => [
            'User-Agent' => 'Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)',
        ],
    ]);
    echo '<pre>'; // keep raw formatting when viewed in a browser
    $response = $client->request('GET', $url)->getBody()->getContents();

    // Structured result accumulates here.
    $data = [];
    $crawler = new Crawler();
    $crawler->addHtmlContent($response);

    try {
        // Movie name — id-based CSS structure makes the XPath easy to write.
        $data['name'] = $crawler->filterXPath('//*[@id="content"]/h1/span[1]')->text();

        // Poster image URL.
        $data['cover'] = $crawler->filterXPath('//*[@id="mainpic"]/a/img/@src')->text();

        // Director(s): split the '/'-separated list and trim whitespace.
        $data['director'] = array_map(
            'trim',
            explode('/', $crawler->filterXPath('//*[@id="info"]/span[1]/span[2]')->text())
        );

        // Screenwriter. FIX: the original stored this into $data['cover'],
        // silently overwriting the poster URL extracted above.
        $data['screenwriter'] = $crawler->filterXPath('//*[@id="info"]/span[2]/span[2]/a')->text();

        // Main actors: split the '/'-separated list and trim whitespace.
        $data['mactor'] = array_map(
            'trim',
            explode('/', $crawler->filterXPath('//*[@id="info"]/span[contains(@class,"actor")]/span[contains(@class,"attrs")]')->text())
        );

        // Release dates, e.g. "2017-07-07(中国大陆) / 2017-06-30(美国)".
        // FIX: the original pattern had lost its backslashes ("d{4}" matched
        // the literal letter d, never a digit); restore \d and optionally
        // capture the parenthesised region after each date.
        $rdateText = $crawler->filterXPath('//*[@id="info"]')->text();
        preg_match_all('/\d{4}-\d{2}-\d{2}(\([^)]*\))?/u', $rdateText, $rdate);
        $data['rdate'] = $rdate[0];

        // Introduction — demonstrates a class-based selector.
        $data['introduction'] = trim($crawler->filterXPath('//div[contains(@class,"indent")]/span')->text());

        // Celebrities: the XPath matches several nodes, so iterate with
        // each(); the closure captures $data by reference to collect results.
        $crawler->filterXPath('//ul[contains(@class,"celebrities-list from-subject")]/li')->each(function (Crawler $node, $i) use (&$data) {
            $actor['name'] = $node->filterXPath('//div[contains(@class,"info")]/span[contains(@class,"name")]/a')->text(); // name
            $actor['role'] = $node->filterXPath('//div[contains(@class,"info")]/span[contains(@class,"role")]')->text();   // role
            // The style attribute looks like:
            //   background-image: url(https://img3.doubanio.com/img/celebrity/medium/5253.jpg)
            $style = $node->filterXPath('//a/div[contains(@class,"avatar")]/@style')->text();
            // FIX: escapes were stripped from the original URL pattern; '#'
            // delimiters avoid having to escape the slashes in "://".
            preg_match_all('#(?:https?|ftp|rtsp|mms)://[^\s)]+\.(?:jpg|jpeg|gif|png)#i', $style, $avatar);
            // FIX: guard against a non-matching style attribute instead of
            // fataling on an undefined index.
            $actor['avatar'] = $avatar[0][0] ?? '';
            $data['actor'][] = $actor;
        });
    } catch (Exception $e) {
        // Best effort: keep whatever was extracted, but record the failure
        // instead of swallowing it silently as the original did.
        $data['error'] = $e->getMessage();
    }

    return $data;
}
- 自己尝试着写了一个爬笑话的程序
<?php

require_once __DIR__.'/vendor/autoload.php';

use GuzzleHttp\Client;                     // FIX: namespace separators were missing
use Symfony\Component\DomCrawler\Crawler;  // FIX: namespace separators were missing

/* Download the joke pages; <pre> keeps raw formatting in a browser. */
echo '<pre>';
xiaohua();

/**
 * Crawl the "latest jokes" list page, follow every joke link found there,
 * and print the joke text extracted from each detail page.
 *
 * @param array $param unused placeholder kept for interface compatibility
 * @return void
 */
function xiaohua($param = [])
{
    $url = 'http://xiaodiaodaya.cn/wapindex.aspx?classid=602'; /* latest jokes list */

    /* Fetch the list page with Guzzle. */
    $client = new Client();
    $response = $client->request('GET', $url)->getBody()->getContents();

    /* Parse it with DomCrawler. */
    $crawler = new Crawler();
    $crawler->addHtmlContent($response);

    /* Collect every joke-detail URL from the list page.
     * FIX: the original hard-coded a loop over exactly 88 <a> tags
     * (a[1]..a[88]), which throws as soon as the page has fewer links;
     * extract() gathers all matching hrefs in one pass. */
    $second_url = $crawler->filterXPath('//*[@id="main"]/div/div[2]/div/a')->extract(['href']);

    /* Follow each joke URL and extract the joke text. */
    foreach ($second_url as $key => $value) {
        /* A fresh Crawler per document: a single Crawler instance refuses
         * to mix nodes from multiple DOM documents. */
        $crawler2 = new Crawler();
        $content = $client->request('GET', $value)->getBody()->getContents();
        $crawler2->addHtmlContent($content);

        /* FIX: the original read the private Crawler::$document property
         * (which required patching vendor code to make it public); text()
         * is the public API and returns the selected node's text content. */
        $text = $crawler2->filterXPath('//*[@id="main"]/div/div[1]')->text();

        /* Jokes look like "1、...joke text...相关笑话".
         * FIX: the pattern had lost its backslash — "[d]" matched a
         * literal letter d instead of a digit. */
        $pattern = '/([\d]、(.*))相关笑话/';
        preg_match_all($pattern, $text, $matches);
        print_r($matches[1]);
    }
}
在写代码的过程中也遇到了一些问题, (1) 因为涉及到了递归爬取(就是先爬一个笑话列表页面, 获取到链接, 再爬链接的内容即笑话详情), new Crawler() 必须在循环体中执行, 感觉这样会浪费内存, 100个url, 遍历100次, 实例化100个对象. 但是不这样写会报错 Uncaught InvalidArgumentException: Attaching DOM nodes from multiple documents in the same crawler is forbidden. 猜测是因为对同一个对象重复调用 addHtmlContent() 方法是不行的. (2) 在打印 $text = $crawler2->filterXPath('//*[@id="main"]/div/div[1]/text()['.$key.']')->document->textContent; 内容的时候, 这个 document 属性是私有的, 我修改了源代码, 将 private 改成 public, 因为实在找不到方法打印出结果. 最终结果截图(数据太多, 只截取部分) (3) 感觉使用这个 crawler 爬虫太费劲, 而且结果我不知道怎么获取(不修改源代码的话), 并且没有文档可参考, 所以不建议用这个爬虫
-