zoukankan      html  css  js  c++  java
  • PHP 采集远程文章的简单类

    <?php
    /**
     * 采集类
     * @author Milkcy 
     * @copyright            (C) 2012-2015 TCCMS.COM
     * @lastmodify             2012-07-10 14:00
     */
    class gather {

        // Public string property; none of the methods in this class read or write it.
        public $pagestring = '';
        // Copy of the global $db taken at construction time; unused by the methods below.
        private $db;

        /**
         * Stores the global $db on the instance.
         */
        public function __construct() {
            global $db;
            $this->db = $db;
        }

        /**
         * Downloads the body of a URL.
         *
         * Uses cURL (following redirects, response headers excluded) when the
         * extension is loaded, otherwise falls back to file_get_contents().
         *
         * @param string $url Address to fetch; surrounding whitespace is ignored.
         * @return string The trimmed response body, or '' when the URL is blank
         *                or the download fails.
         */
        public function geturlfile($url) {
            $url = trim((string) $url);
            // A blank URL can never succeed, and file_get_contents('') throws on PHP 8.
            if ($url === '') {
                return '';
            }

            if (extension_loaded('curl')) {
                $ch = curl_init();
                curl_setopt($ch, CURLOPT_URL, $url);
                curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
                curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
                curl_setopt($ch, CURLOPT_HEADER, 0);
                // Callers disable the script time limit, so bound the connect phase
                // to keep an unreachable host from stalling the run.
                curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
                $content = curl_exec($ch);
                curl_close($ch);
            } else {
                $content = file_get_contents($url);
            }

            // Both transports signal failure with false; report it as an empty body.
            if ($content === false) {
                return '';
            }
            return trim($content);
        }

        /**
         * Extracts every anchor's target and link text from an HTML fragment.
         *
         * @param string $code HTML to scan.
         * @return array 'name' => list of link texts, 'url' => list of href values,
         *               index-aligned with each other.
         */
        public function get_all_url($code) {
            // Group 1 is the href value (quotes optional), group 2 the link text.
            // The text group excludes '>', so anchors whose content contains
            // nested tags are not matched.
            preg_match_all('/<a.+?href=["|\']?([^>"\' ]+)["|\']?\s*[^>]*>([^>]+)<\/a>/is', $code, $arr);
            return array('name' => $arr[2], 'url' => $arr[1]);
        }

        /**
         * Returns the text that follows the first $start marker, cut off at the
         * next $start marker or the first $end marker, whichever comes first.
         *
         * @param string $str   Text to search.
         * @param string $start Opening marker; trimmed before use.
         * @param string $end   Closing marker; trimmed before use.
         * @return string $str unchanged when either marker is blank; '' when
         *                $start does not occur; otherwise the extracted section
         *                (running to the end of the segment if $end is absent).
         */
        public function get_sub_content($str, $start, $end) {
            $start = trim($start);
            $end = trim($end);
            if ($start === '' || $end === '') {
                return $str;
            }

            $pieces = explode($start, $str);
            // No second piece means the start marker was never found.
            if (!isset($pieces[1])) {
                return '';
            }

            $pieces = explode($end, $pieces[1]);
            return $pieces[0];
        }

        /**
         * Debug helper: prints var_dump() output for $var inside a styled
         * <div>/<pre> wrapper.
         *
         * @param mixed $var Value to dump.
         */
        public function vd($var) {
            echo "<div style=\"border:1px solid #ddd;background:#F7F7F7;padding:5px 10px;\">\r\n";
            echo "<pre style=\"font-family:Arial,Vrinda;font-size:14px;\">\r\n";
            var_dump($var);
            echo "\r\n</pre>\r\n";
            echo "</div>";
        }

    }
    
    ?>
    
    <?php
    // Demo: fetch a news list page, follow its first article link, then dump
    // the article body.
    define('ROOT_PATH', str_replace('\\', '/', dirname(__FILE__)));
    include ROOT_PATH."/gather.class.php";
    set_time_limit(0);
    header("Content-type: text/html; charset=gb2312");

    // Target list page.
    $url = 'http://news.163.com/special/00013C0O/guojibjtj_03.html';
    // Create the scraper.
    $gather = new gather();
    // Download the list page HTML.
    $html = $gather->geturlfile($url);

    // Markers delimiting the article list inside the page.
    $start = '<div class="bd clearfix">';
    $end = '<div class="pages-1 mt25">';
    // Collect the URL and title of every link inside the list section.
    $code = $gather->get_sub_content($html, $start, $end);
    $newsAry = $gather->get_all_url($code);
    // Uncomment to inspect the collected links.
    //$gather->vd($newsAry);

    // Stop when the list section produced no links (download failed or the
    // page layout changed); otherwise the next step would fetch a null URL.
    if (empty($newsAry['url'])) {
        exit('No article links were found in the list section.');
    }
    $tarGetUrl = $newsAry['url'][0];

    // Download the first article's HTML.
    $html = $gather->geturlfile($tarGetUrl);
    // Markers delimiting the article body.
    $start = '<div id="endText">';
    $end = '<span class="cDGray right" style="white-space:nowrap;">';
    // Extract the article body between the markers.
    $code = $gather->get_sub_content($html, $start, $end);

    // Embedded ad iframe and trailing site icon to strip from the body.
    $killHtml = '<iframe src="http://g.163.com/r?site=netease&affiliate=news&cat=article&type=tvscreen200x300&location=1" width="200" height="300" frameborder="no" border="0" marginwidth="0" marginheight="0" scrolling="no"></iframe>';
    $killHtml2 = '<a href="http://news.163.com/"><img src="http://img1.cache.netease.com/cnews/img07/end_i.gif" alt="netease" width="12" height="11" border="0" class="icon" /></a>';
    $code = str_replace(array($killHtml, $killHtml2), "", $code);

    $gather->vd($code);
    ?>
    //该片段来自于http://outofmemory.cn

    PHP 文章采集正则代码

    /**
     * Fetches a URL with cURL and returns the trimmed response body.
     *
     * Redirects are followed and the connection phase is limited to 10 seconds.
     *
     * @param string $url Address to download.
     * @return string Trimmed body; '' when curl_exec() reports failure.
     */
    function getwebcontent($url){
        // Seconds allowed for establishing the connection.
        $connectTimeout = 10;

        $handle = curl_init();
        curl_setopt_array($handle, array(
            CURLOPT_URL            => $url,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_CONNECTTIMEOUT => $connectTimeout,
            CURLOPT_FOLLOWLOCATION => 1,
        ));

        $body = curl_exec($handle);
        curl_close($handle);

        return trim($body);
    }
    
    
    // Fetch the listing page that holds the article titles and URLs.
    $string = getwebcontent('http://www.***.com/learn/zhunbeihuaiyun/jijibeiyun/2');

    // Initialise every bucket up front so the loops below are safe when
    // nothing matches.
    $article = array('title' => array(), 'link' => array(), 'content' => array());

    // Each <li><a href="/learn/article/ID">TITLE</a> yields one article:
    // group 1 is the path suffix, group 2 the title. '#' is the delimiter so
    // the '/' and '"' characters in the pattern need no escaping.
    preg_match_all('#<li><a href="/learn/article/([^"]*)">(.*?)</a>#', $string, $out, PREG_SET_ORDER);
    foreach ($out as $match) {
        $article['title'][] = $match[2];
        $article['link'][] = "http://www.***.com/learn/article/" . $match[1];
    }

    // Download each article and keep its first "pagenum_0" <div>.
    foreach ($article['link'] as $key => $link) {
        $content_html = getwebcontent($link);
        // [\s\S]*? spans newlines and stops at the first closing </div>.
        $found = preg_match('#<div id=pagenum_0([^>]*)>[\s\S]*?</div>#', $content_html, $matches);
        // Store '' when the page has no such block so indexes stay aligned.
        $article['content'][$key] = ($found === 1) ? $matches[0] : '';
    }

    // Titles become file names, so make them safe and convert them to GBK.
    foreach ($article['title'] as $key => $value) {
        // The title comes from a remote page: replace path separators and other
        // characters that are invalid in file names. This runs before the GBK
        // conversion because GBK trail bytes can coincide with '\' and '|'.
        $safe = trim((string) preg_replace('#[\\\\/:*?"<>|\x00-\x1F]#', '_', $value));
        if ($safe === '') {
            $safe = "article_{$key}";
        }
        // Keep the UTF-8 name if the title cannot be represented in GBK.
        $converted = iconv('utf-8', 'gbk', $safe);
        $article['title'][$key] = ($converted === false) ? $safe : $converted;
    }

    // Write each article's content to "<title>.txt" in the working directory.
    foreach ($article['title'] as $key => $title) {
        file_put_contents("{$title}.txt", $article['content'][$key]);
    }
    ?> 
  • 相关阅读:
    集合类--容器
    《学习之道》第十一章理解
    文件操作引出流(一)Stream和File.Create(path)
    整理文件操作(五)
    整理文件操作(四)Image.FromFile(path)
    整理文件操作(三)File.Exists(path)和new FileInfo(path).Exists
    整理文件操作(二)File和FileInfo
    整理文件操作(一)逻辑流程
    《学习之道》第十一章先理解再去记忆
    《学习之道》第十一章再次强调激发感官
  • 原文地址:https://www.cnblogs.com/kenshinobiy/p/4651829.html
Copyright © 2011-2022 走看看