zoukankan      html  css  js  c++  java
  • 实验楼的php比赛题,网页数据提取。

    实验楼的php比赛题,网页数据提取。

    题目的地址:https://www.shiyanlou.com/contests/lou5/challenges

    以下代码是题目的答案

    <?php
    // Declare the response as UTF-8 HTML before any output is sent; the
    // progress messages echoed by this script contain Chinese text.
    header("Content-Type:text/html;charset=utf-8");
    /**
     * Extracts course information from a saved HTML page and stores it in MySQL.
     *
     * Typical use is loadFile() followed by parseContent(), which runs every
     * parse step in order and then writes one row per course into `course_data`.
     */
    class Crawler{
        /** @var string HTML being parsed; narrowed to the <body> contents by parseCourseBody(). */
        private $content = '';

        /**
         * @var array Parallel lists keyed 'cnames', 'cdescs', 'ctypes' and 'nlongs';
         *            index i of each list describes the i-th course found on the page.
         */
        private $data = [
            'cnames' => [],
            'cdescs' => [],
            'ctypes' => [],
            'nlongs' => [],
        ];

        /** @var PDO|null Database connection, held only while saveData() is running. */
        static private $mysql;

        /**
         * Announces that crawling has started.
         */
        public function __construct(){
            echo "开始爬取内容....";
        }

        /**
         * Reads the HTML document that the parse methods will work on.
         *
         * @param string $file_path Path (or URL wrapper) understood by file_get_contents().
         * @throws RuntimeException When the file cannot be read.
         */
        public function loadFile($file_path){
            echo "正在加载文件";
            $html = file_get_contents($file_path);
            if ($html === false) {
                // Fail loudly instead of parsing `false` and silently storing nothing.
                throw new RuntimeException("Unable to read file: " . $file_path);
            }
            $this->content = $html;
        }

        /**
         * Narrows the loaded document to the contents of its <body> element.
         *
         * The content is left untouched when no <body> element is found.
         */
        public function parseCourseBody(){
            $regex = '/<body[^>]*>(.*?)<\/body>/is';
            // preg_match (not preg_match_all) keeps $content a string, which the
            // later parse steps require as their regex subject.
            if (preg_match($regex, $this->content, $matches)) {
                $this->content = $matches[1];
            }
        }

        /**
         * Runs the full pipeline: parse every field, then persist the rows.
         *
         * @throws PDOException When the database step fails.
         */
        public function parseContent(){
            echo "开始解析内容...<br/>";
            $this->parseCourseBody();
            $this->parseTitle();
            $this->parseDesc();
            $this->parseType();
            $this->titleIsLong();
            $this->saveData();
            echo "解析内容结束!<br/>";
        }

        /**
         * Inserts one `course_data` row per parsed course name.
         *
         * Values are bound through a prepared statement so quotes in the scraped
         * text can neither break the query nor inject SQL. A field that has no
         * entry at a course's index is stored as NULL.
         *
         * @throws PDOException When connecting or inserting fails.
         */
        public function saveData(){
            echo "存入数据库...<br/>";
            // charset=utf8 in the DSN takes the place of a separate "set names utf8" query.
            self::$mysql = new PDO(
                "mysql:host=localhost;dbname=databases;charset=utf8",
                "root",
                "root",
                [
                    PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
                    PDO::ATTR_EMULATE_PREPARES => false,
                ]
            );
            try {
                $insert = self::$mysql->prepare(
                    "insert into `course_data`(`cname`,`cdesc`,`ctype`,`nlong`) values(?,?,?,?)"
                );
                foreach ($this->data['cnames'] as $key => $cname) {
                    $insert->execute([
                        $cname,
                        $this->valueAt('cdescs', $key),
                        $this->valueAt('ctypes', $key),
                        $this->valueAt('nlongs', $key),
                    ]);
                }
            } finally {
                // Dropping both references closes the connection.
                $insert = null;
                self::$mysql = null;
            }
        }

        /**
         * Collects the text of every `course-name` div into $data['cnames'].
         */
        public function parseTitle(){
            echo "解析课程标题...<br/>";
            $this->data['cnames'] = $this->extractDivContents('course-name');
        }

        /**
         * Collects the text of every `course-desc` div into $data['cdescs'].
         */
        public function parseDesc(){
            echo "解析课程简介...<br/>";
            $this->data['cdescs'] = $this->extractDivContents('course-desc');
        }

        /**
         * Derives each course's type from its `course-footer` div into $data['ctypes'].
         *
         * The type is the concatenation of all Chinese characters found inside the
         * footer; a footer without any Chinese characters is recorded as "免费".
         */
        public function parseType(){
            echo "解析课程类型...<br/>";
            $ctypes = [];
            foreach ($this->extractDivContents('course-footer') as $footer) {
                // U+4E00..U+9FA5 is the range of common CJK ideographs.
                if (preg_match_all('/[\x{4e00}-\x{9fa5}]/u', $footer, $match)) {
                    $ctypes[] = join("", $match[0]);
                } else {
                    $ctypes[] = "免费";
                }
            }
            $this->data['ctypes'] = $ctypes;
        }

        /**
         * Flags each course name as "true" when it is longer than 16 characters,
         * otherwise "false", and stores the flags in $data['nlongs'].
         */
        public function titleIsLong(){
            echo "判断课程名是否超长...<br/>";
            $nlongs = [];
            foreach ($this->data['cnames'] as $cname) {
                // Count characters, not bytes: course names are multi-byte UTF-8.
                $nlongs[] = mb_strlen($cname, 'UTF-8') > 16 ? "true" : "false";
            }
            $this->data['nlongs'] = $nlongs;
        }

        /**
         * Returns the inner HTML of every <div> whose class attribute is exactly $class.
         *
         * Matching stops at the first closing </div>, so nested divs are not supported.
         *
         * @param string $class Value of the class attribute to look for.
         * @return array List of inner HTML strings in document order; empty when none match.
         */
        private function extractDivContents($class){
            $regex = '/<div class="' . preg_quote($class, '/') . '"[^>]*>(.*?)<\/div>/is';
            if (preg_match_all($regex, $this->content, $matches)) {
                // Group 1 excludes the tags, including any extra attributes on the opening tag.
                return $matches[1];
            }
            return [];
        }

        /**
         * Looks up one parsed value, tolerating lists of unequal length.
         *
         * @param string $list Key into $data ('cdescs', 'ctypes' or 'nlongs').
         * @param int|string $key Index of the course.
         * @return string|null The stored value, or null when that index is missing.
         */
        private function valueAt($list, $key){
            return isset($this->data[$list][$key]) ? $this->data[$list][$key] : null;
        }
    }
    // Entry point: crawl the saved page that sits next to this script, then
    // parse it and store the extracted courses.
    $sourcePage = "test.html";
    $Crawler = new Crawler;
    $Crawler->loadFile($sourcePage);
    $Crawler->parseContent();
    
    /**
     表结构
    cname(varchar):完整的课程名
    cdesc(varchar):课程描述
    ctype(varchar):课程类型,值为 免费,会员,训练营。
    nlong(enum('true','false')):课程名是否过长,课程名称超过16字符的时候为 true,否则为 false
    
    create table `course_data`(
    	`id` int(11) not null auto_increment,
    	`cname` varchar(255) default null,
    	`cdesc` varchar(255) default null,
    	`ctype` varchar(255) default null,
    	`nlong` enum('true','false') default null,
    	primary key (`id`)
    )engine=InnoDB default charset=utf8;
    */
    

      

  • 相关阅读:
    软件工程作业--评价自己经常使用的输入法
    课堂练习-找水王
    软件工程——找水王(续)
    软件工程——评价输入法
    软件工程——《你的灯亮着吗》读书笔记
    软件工程——课堂练习“找水王”
    软件工程结队项目——智能点餐系统典型用户及用户场景分析
    软件工程课堂练习——N层电梯只停一层求乘客爬楼层数最少(基本方法+优化方法)
    软件工程课堂练习——求买书最低价格
    结队项目——智能订餐系统用户调研报告
  • 原文地址:https://www.cnblogs.com/yxhblogs/p/6878366.html
Copyright © 2011-2022 走看看