zoukankan      html  css  js  c++  java
  • php采集远程文章简单类

    <?php
    /**
     * Simple remote-article scraper ("gather") helper.
     *
     * Downloads remote pages and cuts fragments and links out of the returned HTML.
     *
     * @author Milkcy QQ:9877633
     * @copyright (C) 2012-2015 TCCMS.COM
     * @lastmodify 2012-07-10 14:00
     */
    class gather {

        /** @var string Not used by any method of this class. */
        public $pagestring = '';

        /** @var mixed Copy of the global $db taken at construction time (null if it is not defined). */
        private $db;

        /**
         * Stores a reference to the global $db variable on the instance.
         */
        public function __construct() {
            global $db;
            $this->db = $db;
        }

        /**
         * Downloads the body of a URL.
         *
         * Uses cURL when the extension is loaded (redirects are followed, response
         * headers are not included); otherwise falls back to file_get_contents().
         *
         * @param string $url Address to fetch; surrounding whitespace is ignored.
         * @return string Trimmed response body, or '' when the URL is empty or the
         *                request failed.
         */
        public function geturlfile($url) {
            $url = trim((string) $url);
            if ($url === '') {
                // Nothing to fetch; file_get_contents('') would throw on PHP 8.
                return '';
            }

            if (extension_loaded('curl')) {
                $ch = curl_init();
                curl_setopt($ch, CURLOPT_URL, $url);
                curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
                curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
                curl_setopt($ch, CURLOPT_HEADER, 0);
                $content = curl_exec($ch);
                curl_close($ch);
            } else {
                $content = file_get_contents($url);
            }

            // Both transports return false on failure; report that as an empty body.
            if ($content === false) {
                return '';
            }
            return trim($content);
        }

        /**
         * Extracts every <a ... href=...>text</a> link from an HTML fragment.
         *
         * Only anchors whose inner content contains no '>' character (i.e. no nested
         * tags) are matched.
         *
         * @param string $code HTML to scan.
         * @return array array('name' => list of link texts, 'url' => list of href values);
         *               both lists share the same indexes and are empty when nothing matches.
         */
        public function get_all_url($code) {
            // The optional quote class is ["'] - the original ["|'] also swallowed a
            // literal '|' because alternation has no meaning inside a character class.
            $pattern = '/<a.+?href=["\']?([^>"\' ]+)["\']?\s*[^>]*>([^>]+)<\/a>/is';
            if (!preg_match_all($pattern, (string) $code, $arr)) {
                // No match (0) or PCRE error (false).
                return array('name' => array(), 'url' => array());
            }
            return array('name' => $arr[2], 'url' => $arr[1]);
        }

        /**
         * Returns the part of $str that lies between two marker strings.
         *
         * The result is the text following the first occurrence of $start, up to
         * (not including) the next occurrence of either $start or $end, whichever
         * comes first. If $end does not occur there, the text runs up to the next
         * occurrence of $start or, failing that, to the end of the string.
         *
         * @param string $str   Text to cut from.
         * @param string $start Opening marker (trimmed before use).
         * @param string $end   Closing marker (trimmed before use).
         * @return string $str unchanged when either marker is empty; '' when $start
         *                does not occur in $str; otherwise the extracted fragment.
         */
        public function get_sub_content($str, $start, $end) {
            $start = trim((string) $start);
            $end = trim((string) $end);
            if ($start == '' || $end == '') {
                return $str;
            }

            $parts = explode($start, (string) $str);
            if (!isset($parts[1])) {
                // Start marker absent: avoid the undefined-offset access and return
                // an explicit empty result.
                return '';
            }

            $parts = explode($end, $parts[1]);
            return $parts[0];
        }

        /**
         * Debug helper: echoes var_dump() output of a value inside a styled <div>/<pre>.
         *
         * @param mixed $var Value to dump.
         */
        public function vd($var) {
            echo "<div style=\"border:1px solid #ddd;background:#F7F7F7;padding:5px 10px;\">\r\n";
            echo "<pre style=\"font-family:Arial,Vrinda;font-size:14px;\">\r\n";
            var_dump($var);
            echo "\r\n</pre>\r\n";
            echo "</div>";
        }

    }

    ?>

    <?php
    // Demo: fetch a news list page, take the first article link from it,
    // then fetch that article and dump its body markup.
    define('ROOT_PATH', str_replace('\\', '/', dirname(__FILE__)));
    // The script cannot work without the class, so fail hard if it is missing.
    require_once ROOT_PATH."/gather.class.php";
    set_time_limit(0);
    header("Content-type: text/html; charset=gb2312");

    // Target list page.
    $url = 'http://news.163.com/special/00013C0O/guojibjtj_03.html';
    // Create the scraper.
    $gather = new gather();
    // Download the list page HTML.
    $html = $gather->geturlfile($url);

    // Markers delimiting the article list inside the page.
    $start = '<div class="bd clearfix">';
    $end = '<div class="pages-1 mt25">';
    // Collect the URL and title of every link inside that region.
    $code = $gather->get_sub_content($html, $start, $end);
    $newsAry = $gather->get_all_url($code);
    // Uncomment to inspect the collected links.
    //$gather->vd($newsAry);

    // Stop early when the list page could not be fetched or its layout changed,
    // instead of requesting an undefined URL.
    if (empty($newsAry['url'][0])) {
        exit('No article links found on the list page.');
    }
    $tarGetUrl = $newsAry['url'][0];

    // Download the first article.
    $html = $gather->geturlfile($tarGetUrl);
    // Markers delimiting the article body inside the page.
    $start = '<div id="endText">';
    $end = '<span class="cDGray right" style="white-space:nowrap;">';
    // Cut the article body out of the page.
    $code = $gather->get_sub_content($html, $start, $end);

    // Embedded ad iframe and site icon that should not be kept in the article.
    $killHtml = '<iframe src="http://g.163.com/r?site=netease&affiliate=news&cat=article&type=tvscreen200x300&location=1" width="200" height="300" frameborder="no" border="0" marginwidth="0" marginheight="0" scrolling="no"></iframe>';
    $killHtml2 = '<a href="http://news.163.com/"><img src="http://img1.cache.netease.com/cnews/img07/end_i.gif" alt="netease" width="12" height="11" border="0" class="icon" /></a>';
    $code = str_replace(array($killHtml, $killHtml2), "", $code);

    $gather->vd($code);
    ?>

  • 相关阅读:
    LeetCode题解——冗余连接(并查集)——java实现
    两数之和的问题
    强引用、软引用、弱引用、虚引用——4种引用的理解
    手写死锁程序实例
    使用阻塞队列实现生产者消费者问题
    ABC三个线程交替打印10遍,要求A打印5次,B打印10次,C打印15次
    使用jstack查看线程情况解决cpu飙高问题
    ES 【elasticsearch】
    C# 正则
    领域驱动设计 浅析VO、DTO、DO、PO的概念、区别和用处等资料链接(草稿)
  • 原文地址:https://www.cnblogs.com/lygsbbs/p/4372483.html
Copyright © 2011-2022 走看看