zoukankan      html  css  js  c++  java
  • thinkphp 搜狐新闻采集代码演示

    <?php
    namespace HomeController;
    use ThinkController;
    
    /**
     * Three-step collector for Sohu domestic news, driven by browser redirects.
     *
     * - index()  walks the paginated news list and stores title / link / summary rows.
     * - index1() downloads the article body (and its images) for one stored row per request.
     * - index2() hands every stored row to the view.
     *
     * Each request does one unit of work and then prints a <script> redirect to the
     * next step.
     *
     * NOTE(review): the namespace/use statements above this class read
     * "HomeController" / "ThinkController"; they look like "Home\Controller" and
     * "Think\Controller" with the backslashes lost -- confirm against the original.
     */
    class CollectController extends Controller {
        /** First page of the Sohu domestic news list. */
        const LIST_URL = 'http://news.sohu.com/guoneixinwen.shtml';

        /** Source encoding handed to mb_convert_encoding() for every fetched page. */
        const SOURCE_ENCODING = 'gb2312';

        /** Directory, relative to the working directory, that downloaded images are written to. */
        const IMAGE_DIR = './Public/Uploads/sohu';

        /** URL prefix prepended to a saved image path when article image links are rewritten. */
        const LOCAL_BASE_URL = 'http://localhost:9096/z2/';

        /**
         * Collect one page of the news list (title, link, summary) and save it.
         *
         * Without an "id" query parameter the run starts from scratch: the news table
         * and the image directory are emptied and the list home page is collected.
         * With "id" the matching paginated list page is collected. Once the configured
         * page count (COLLECT_PAGE) is exceeded, control is handed to index1().
         *
         * @return void
         */
        public function index() {
            $home = $this->fetchUtf8(self::LIST_URL);
            if ($home === false) {
                echo '列表页获取失败,采集中止';
                return;
            }

            // maxPage is the sequence number of the newest paginated list URL.
            $num = preg_match('/maxPage = (\d+);/is', $home, $found) ? (int) $found[1] : 0;
            // Pagination labels (previous / next / last page) indicate that further list pages exist.
            $hasPager = preg_match('/(上一页|下一页|尾页|末页)/', $home) === 1;

            if (isset($_GET['id'])) {
                $page = (int) $_GET['id'];
                $limit = (int) C('COLLECT_PAGE');
                if ($page < 0 || $page > $limit || !$hasPager || $num - $page < 1) {
                    echo '列表采集结束,将继续采集正文内容';
                    $this->index1();
                    // Stop here: falling through would print a second redirect back to
                    // collect/index and restart the whole collection.
                    return;
                }
                $url = 'http://news.sohu.com/guoneixinwen_' . ($num - $page) . '.shtml';
                $id = $page + 1;
            }
            else {
                // Fresh run: drop earlier rows and images so nothing is collected twice.
                $this->deleteTable('think_news');
                delFile(ROOT . '/z2/Public/Uploads/sohu/');
                $url = self::LIST_URL;
                $id = 1;
            }

            echo '第' . $id . '页列表采集中......';

            // The home page is already in memory; only paginated pages need another request.
            $contents = ($url === self::LIST_URL) ? $home : $this->fetchUtf8($url);
            if ($contents === false) {
                // An unreachable list page contributes no rows; the run continues with the next page.
                $contents = '';
            }

            // "~" is the delimiter because the pattern itself contains "/" and "#".
            // Groups: 1 = article link, 2 = title, 3 = summary.
            $preg = '~<h3><span class="com-num"><a target="_blank" href="#">comment num</a></span>'
                . '<a target="_blank" href="(http://.*)(?#链接)">(.*)(?#标题)</a></h3>\s*'
                . '<p>(.*)(?#简介)<a target="_blank" href="http://.*">.*</a></p>~Uims';

            $news = M('News');
            if (preg_match_all($preg, $contents, $arr)) {
                foreach ($arr[1] as $key => $value) {
                    $data = array(
                        'title' => $arr[2][$key],
                        'url' => $value,
                        'info' => $arr[3][$key],
                    );
                    // create() returns false when the data is rejected; skip the insert then.
                    if ($news->create($data)) {
                        $news->add();
                    }
                }
            }

            echo '<script>location.href="' . U('collect/index', array('id' => $id)) . '"</script>';
        }

        /**
         * Collect the body text and images of one stored article.
         *
         * Picks the first row whose id is greater than the "cid" query parameter
         * (0 when absent), stores its HTML-escaped body in the "contents" column and
         * redirects to itself with that row's id. When no row is left it redirects
         * to index2().
         *
         * @return void
         */
        public function index1() {
            $news = M('News');
            $cid = isset($_GET['cid']) ? (int) $_GET['cid'] : 0;

            $map = array('id' => array('gt', $cid));
            // Explicit ordering makes "next row after cid" deterministic.
            $result = $news->field('id,url')->where($map)->order('id')->find();

            if (empty($result)) {
                echo '正文内容采集结束,以下是采集内容显示';
                echo '<script>location.href="' . U('collect/index2') . '"</script>';
                return;
            }

            $contents1 = '';
            $page = $this->fetchUtf8($result['url']);
            $bodyPreg = '/(<div itemprop="articleBody">.*)(?#正文)<!-- seo标签描述 -->/is';
            if ($page !== false && preg_match($bodyPreg, $page, $arr)) {
                $contents1 = $arr[1];
            }

            $path = $this->getPath($contents1);
            if ($path) {
                // saveImage() returns one entry per remote URL, in the same order,
                // which is what the pairwise replacement below relies on.
                $savePath = $this->saveImage($path);
                $contents1 = str_ireplace($path, $savePath, $contents1);
            }

            $news->where(array('id' => $result['id']))->setField('contents', htmlspecialchars($contents1));
            echo 'id为' . $result['id'] . '的正文内容采集中......';

            // Continue after the row just handled, so gaps in the id sequence
            // do not cause the same row to be processed again.
            $cid = $result['id'];
            echo '<script>location.href="' . U('collect/index1', array('cid' => $cid)) . '"</script>';
        }

        /**
         * Show all collected rows; the view receives them as "aa".
         *
         * @return void
         */
        public function index2() {
            $result = M('News')->select();
            $this->assign('aa', $result);
            $this->display();
        }

        /**
         * Empty a table with TRUNCATE.
         *
         * @param string $table Table name; letters, digits and underscores only.
         * @return void
         * @throws \InvalidArgumentException When the name contains any other character.
         */
        private function deleteTable($table) {
            // Identifiers cannot be bound as parameters, so only plain names are accepted.
            if (!preg_match('/^[A-Za-z0-9_]+$/', $table)) {
                throw new \InvalidArgumentException('Invalid table name: ' . $table);
            }
            M()->execute('TRUNCATE TABLE `' . $table . '`');
        }

        /**
         * Extract the remote image URLs from an article body.
         *
         * @param string|null $contents Article body HTML.
         * @return array|false Distinct image URLs in order of first appearance,
         *                     or false when the body is empty or has no matching image.
         */
        private function getPath($contents) {
            if ($contents == null) return false;

            // "~" delimiter: the pattern contains "/" in "http://" and "/>".
            $preg = '~<img src="(http://.*)" alt=.*/>~Uis';
            if (!preg_match_all($preg, $contents, $arr)) {
                return false;
            }
            // Each distinct URL is downloaded and replaced once.
            return array_values(array_unique($arr[1]));
        }

        /**
         * Download remote images and return the URL each one should be replaced with.
         *
         * The result has exactly one entry per input URL, in the same order. An image
         * that cannot be downloaded or written keeps its remote URL, so replacing it
         * is a no-op and the pairing with the input stays intact.
         *
         * @param array $path Remote image URLs.
         * @return array|false Replacement URLs, or false when $path is empty.
         */
        private function saveImage($path) {
            if (empty($path)) return false;

            $pathArr = array();
            $dirReady = is_dir(self::IMAGE_DIR) || mkdir(self::IMAGE_DIR, 0777, true);

            foreach ($path as $value) {
                // Use only the last path segment as file name; this drops any query string.
                $filename = basename((string) parse_url($value, PHP_URL_PATH));
                $usable = $dirReady && $filename !== '' && $filename !== '.' && $filename !== '..';

                $img = $usable ? file_get_contents($value) : false;
                $savePath = self::IMAGE_DIR . '/' . $filename;

                // file_put_contents() truncates an existing file, so a repeated download
                // replaces the image instead of appending to it.
                if ($img === false || file_put_contents($savePath, $img) === false) {
                    $pathArr[] = $value;
                    continue;
                }

                $pathArr[] = self::LOCAL_BASE_URL . preg_replace('~^\./~', '', $savePath);
            }
            return $pathArr;
        }

        /**
         * Fetch a URL and convert the response to UTF-8.
         *
         * @param string $url Address to download.
         * @return string|false Converted body, or false when the download failed.
         */
        private function fetchUtf8($url) {
            $raw = file_get_contents($url);
            if ($raw === false) {
                return false;
            }
            return mb_convert_encoding($raw, 'UTF-8', self::SOURCE_ENCODING);
        }
    }
    /*
    z2是我的app名
    相关代码
    //删除文件夹里的文件,放公共函数文件里
    function delFile($dir) {
        $dh = opendir($dir);
        while ($file = readdir($dh)) {
            if ($file != "." && $file != "..") {
                $fullpath = $dir . "/" . $file;
                if (!is_dir($fullpath)) {
                    unlink($fullpath);
                }
                else {
                    delFile($fullpath);
                }
            }
        }
        closedir($dh);
    }
    //入口文件
    define('ROOT',$_SERVER['DOCUMENT_ROOT']);
    //配置文件
    'COLLECT_PAGE'=> '10',//采集新闻列表页数
    //图片本地保存目录'./Public/Uploads/sohu'
    //index2模板代码
    <volist name='aa' id='vo'>
        <div style="width:800px;margin:0 auto;">
        <h2>{$vo.id}.{$vo.title}</h2>
            {$vo.contents|htmlString_decode}
            <p>原文链接:<a href="{$vo.url}" target="blank">{$vo.url}</a></p>
        </div><br><br><br><br><br><br><br>
    </volist>
    //表结构
    --
    -- `think_news`
    --
    CREATE TABLE IF NOT EXISTS `think_news` (
      `id` int(5) unsigned NOT NULL AUTO_INCREMENT,
      `title` varchar(255) DEFAULT NULL,
      `url` varchar(255) DEFAULT NULL,
      `info` text,
      `contents` text,
      PRIMARY KEY (`id`)
    ) ENGINE=MyISAM  DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC ;
    */
  • 相关阅读:
    C#异步编程由浅入深(一)
    基于Jira的运维发布平台的设计与实现
    kubeadm部署K8S并使用containerd做运行时
    代码阅读
    黑客攻防 1
    Linux 常用指令篇1
    期刊管理系统总结
    docker安装RabbitMQ
    Centos7安装mysql8.0教程
    java动态编译
  • 原文地址:https://www.cnblogs.com/shanyansheng/p/5474578.html
Copyright © 2011-2022 走看看