zoukankan      html  css  js  c++  java
  • curl抓取

    <?php
    header("content-type:text/html;charset=utf8");
    set_time_limit(0);
     
    //=================================工具函数=====================
    /**
     * Turn a link found on a page into an absolute URL.
     *
     * The original implementation returned $base_url . $base_url for any
     * relative link, which threw the link away entirely. This version resolves
     * the link against the scheme/host/path of $base_url instead; the query
     * string of $base_url (for example a "p=%d" pagination template) is never
     * copied into the result, except for fragment-only links.
     *
     * "." and ".." path segments are NOT normalised.
     *
     * @param string $current_url Link as found in the document (absolute,
     *                            protocol-relative, root-relative or relative).
     * @param string $base_url    URL of the page the link was found on.
     * @return string Absolute URL when it can be built; otherwise $current_url
     *                unchanged (e.g. when $base_url has no host).
     */
    function real_url($current_url, $base_url='') {
        $current_url = (string) $current_url;
        $current = parse_url($current_url);

        // Already absolute, or a non-hierarchical scheme such as "mailto:".
        if (is_array($current) && isset($current['scheme'])) {
            return $current_url;
        }

        $base = parse_url((string) $base_url);
        $scheme = (is_array($base) && isset($base['scheme'])) ? $base['scheme'] : 'http';

        // Protocol-relative link ("//host/path"): only the scheme is missing.
        if (is_array($current) && isset($current['host'])) {
            return $scheme . ':' . $current_url;
        }

        // Without a usable base there is nothing to resolve against.
        if ( ! is_array($base) || ! isset($base['host'])) {
            return $current_url;
        }

        // An empty reference points at the base document itself.
        if ($current_url === '') {
            return $base_url;
        }

        $origin = $scheme . '://' . $base['host'];
        if (isset($base['port'])) {
            $origin .= ':' . $base['port'];
        }
        $base_path = isset($base['path']) ? $base['path'] : '/';

        $first = substr($current_url, 0, 1);
        if ($first === '/') {
            // Root-relative: replace the whole path.
            return $origin . $current_url;
        }
        if ($first === '?') {
            // Query-only: keep the base path, replace the query.
            return $origin . $base_path . $current_url;
        }
        if ($first === '#') {
            // Fragment-only: keep base path and query.
            $query = isset($base['query']) ? '?' . $base['query'] : '';
            return $origin . $base_path . $query . $current_url;
        }

        // Plain relative path: resolve against the directory of the base path.
        $directory = substr($base_path, 0, strrpos($base_path, '/') + 1);
        return $origin . $directory . $current_url;
    }
     
     
    // Third-party libraries this script depends on, mapped to the URL each one
    // is downloaded from when it is not yet present next to this script.
    //
    // NOTE(review): the code is fetched over plain HTTP and then executed via
    // require, so anyone able to tamper with the response can run arbitrary PHP
    // here. Prefer shipping vetted copies of both files with the script.
    $vendor_sources = array(
        'CurlMulti.php' => 'http://curlmulti.com/index/download/CurlMulti',
        'phpQuery.php'  => 'http://curlmulti.com/index/download/phpQuery',
    );
    foreach ($vendor_sources as $vendor_file => $vendor_url) {
        // Anchor to the script directory so the existence check, the write and
        // the require all refer to the same file regardless of the working dir.
        $vendor_path = __DIR__ . '/' . $vendor_file;
        if ( ! is_file($vendor_path)) {
            $vendor_code = file_get_contents($vendor_url);
            // A failed download used to be written out as an empty file, which
            // every later run then treated as "already installed".
            if ($vendor_code === false || $vendor_code === '') {
                throw new RuntimeException(sprintf('Unable to download %s from %s', $vendor_file, $vendor_url));
            }
            if (file_put_contents($vendor_path, $vendor_code) === false) {
                throw new RuntimeException(sprintf('Unable to write %s', $vendor_path));
            }
        }
        require $vendor_path;
    }
    unset($vendor_sources, $vendor_file, $vendor_url, $vendor_path, $vendor_code);
     
    /**
     * Minimal stopwatch used to time a crawl.
     *
     * All state is static, and every caller in this file invokes the methods
     * statically (myDebug::set_start()), so the methods are declared static.
     * They were instance methods originally, which makes those static calls a
     * fatal error on PHP 8. Calling them on an instance still works.
     */
    class myDebug {
        /** @var float|null Timestamp recorded by set_start(). */
        static $start;
        /** @var float|null Timestamp recorded by set_end(). */
        static $end;
        /** Not read or written anywhere in this class; kept for compatibility. */
        static $times;

        /**
         * Current Unix time in seconds, with microsecond fraction.
         *
         * @return float
         */
        static function microtime_float(){
            return microtime(true);
        }

        /** Record the start timestamp. */
        public static function set_start() {
            self::$start = self::microtime_float();
        }

        /** Record the end timestamp. */
        public static function set_end() {
            self::$end = self::microtime_float();
        }

        /**
         * Elapsed seconds between set_start() and set_end().
         *
         * @return float 0.0 when neither timestamp has been recorded.
         */
        public static function report() {
            return (float) self::$end - (float) self::$start;
        }
    }
    /**
     * Crawls a paginated list page and collects the links matched by the
     * request's item selector.
     *
     * HTTP work is delegated to CurlMulti and HTML querying to phpQuery (both
     * are loaded elsewhere in this file). The collected list is written to
     * "<cache_path>data/list.php" and "<cache_path>data/list.txt".
     */
    class myCurl {

        /** CurlMulti instance that performs the requests. */
        public $curl;
        /** Collected items keyed by md5(href); each is array('title' => ..., 'href' => ...). */
        public $article_list = array();
        /** Directory handed to CurlMulti as its cache directory. */
        protected $cacheDir;
        /** Directory the list.php / list.txt result files are written to. */
        protected $cacheDataDir;
        /** Number of list pages whose success callback has run. */
        protected $pageCount = 0;
        /** Number of article pages whose success callback has run. */
        protected $articleCount = 0;
        /** The request object supplying URL, cache path and selectors. */
        protected $request;


        /**
         * @param request $request Crawl configuration.
         * @throws RuntimeException When a cache directory cannot be created.
         */
        public function __construct(request $request){
            $this->_init_request($request);
            $this->_init_curl();
        }

        /** Reset the counters and the collected list to their initial state. */
        protected function _init_var() {
            $this->pageCount = 0;
            $this->articleCount = 0;
            $this->article_list = array();
        }

        /** Store the crawl configuration. */
        protected function _init_request(request $request) {
            $this->request = $request;
            // Example of a hard-coded configuration:
            // $this->request->cache_path = __DIR__ . '/sjm_cache/';
            // $this->request->fetch_item_query = '#J_posts_list .subject .title a';
            // $this->request->fetch_page_current = '.J_page_wrap .pages strong';
            // $this->request->base_url = 'http://bbs.sijiaomao.com/index.php?m=bbs&c=thread&fid=10&page=%d';

        }

        /**
         * Create the CurlMulti instance and the two cache directories.
         *
         * @throws RuntimeException When a directory cannot be created.
         */
        protected function _init_curl() {
            $this->curl = new CurlMulti();

            $this->cacheDir = $this->request->cache_path . 'cache';
            $this->_ensure_dir($this->cacheDir);

            $this->cacheDataDir =  $this->request->cache_path . 'data';
            $this->_ensure_dir($this->cacheDataDir);

            // Option names are CurlMulti's; 'expire' is presumably in seconds
            // (one day) -- confirm against the CurlMulti documentation.
            $this->curl->cache = array(
                'dir' => $this->cacheDir,
                'on' => true,
                'expire' => 3600 * 24
            );
            $this->curl->maxThread = 10;
            $this->curl->opt[CURLOPT_CONNECTTIMEOUT] = 10;
        }

        /**
         * Create $dir (recursively) when it does not exist yet.
         *
         * The mode is the octal literal 0777; the original passed decimal 777,
         * which is octal 01411 and yields unusable permissions.
         *
         * @param string $dir
         * @throws RuntimeException When the directory cannot be created.
         */
        private function _ensure_dir($dir) {
            if ( ! is_dir($dir) && ! mkdir($dir, 0777, true) && ! is_dir($dir)) {
                throw new RuntimeException(sprintf('Unable to create directory "%s"', $dir));
            }
        }

        /** Crawl the list pages starting at page 1, then persist the result. */
        public function fetch_list(){
            $this->_add_fetch_list_url();
            $this->curl->start();
            $this->_save_article_list();
        }

        /** Request every collected article URL; only counts the successes. */
        public function fetch_article() {
            foreach ($this->article_list as $k => $v) {
                $this->curl->add(array(
                    'url' => $v['href']
                ), array($this, '_success_article'));
            }
            $this->curl->start();
        }

        /** Print the summary produced by fetch(). */
        public function display() {
            echo $this->fetch();
        }

        /**
         * Build the one-line crawl summary.
         *
         * @return string
         */
        public function fetch() {
            return sprintf(
                " 共抓取%d个页面 文章列表%d篇 相关文章%d篇 文章目录存放在%s ",
                $this->pageCount + $this->articleCount,
                $this->pageCount,
                count($this->article_list),
                $this->cacheDataDir . '/list.php'
            );
        }

        /**
         * Queue one list page.
         *
         * Only the literal "%d" placeholder in base_url is substituted. The
         * original ran sprintf() over the user-supplied URL, which fails on any
         * other "%" sequence such as a URL-encoded "%20".
         *
         * @param int $page Page number to substitute into base_url.
         */
        public function _add_fetch_list_url($page = 1){
            $this->curl->add(
                array(
                    'url' => str_replace('%d', (string) (int) $page, $this->request->base_url),
                    'args' => array('page' => $page)
                ),
                array($this, '_success_list')
            );
        }

        /**
         * Write the collected list as a PHP array file and as a text file.
         *
         * @return int|false Result of writing list.txt (bytes written, or false).
         */
        protected function _save_article_list() {
            file_put_contents(
                $this->cacheDataDir . '/list.php',
                sprintf("<?php return %s;",
                var_export($this->article_list, true))
            );
            // Disabled: sort by the Latin letters found in each title.
            /*uasort($this->article_list, function ($a, $b){
                preg_match_all('#([a-zA-Z]+)#is', $a['title'], $match);
                $a_title = strtoupper(implode("", $match[0]));

                preg_match_all('#([a-zA-Z]+)#is', $b['title'], $match);
                $b_title = strtoupper(implode("", $match[0]));
                return $a_title > $b_title;
            });*/
            $lines = array_map(function($a_list){
                return sprintf(
                    "标题:%s 超链接:%s ",
                    str_replace(" ", "", $a_list['title']),
                    $a_list['href']
                );
            }, $this->article_list);

            // file_put_contents() joins an array of strings without a separator.
            return file_put_contents($this->cacheDataDir . '/list.txt', $lines);
        }

        /** Success callback for an article request: counts it, nothing more. */
        public function _success_article($r, $param){
            ++$this->articleCount;
        }

        /**
         * Success callback for a list page: collect its items and queue the
         * next page when the "current page" element has a following sibling
         * with non-empty text.
         *
         * @param array $r     Response data; $r['content'] is parsed as HTML.
         * @param array $param The 'args' passed when the request was queued.
         */
        public function _success_list($r, $param){
            ++$this->pageCount;
            $html = phpQuery::newDocumentHTML($r['content']);
            $list = $html[$this->request->fetch_item_query];
            foreach ($list as $v) {
                $v = pq($v);

                // Prefer the title attribute; fall back to the link text.
                $title = $v->attr('title');
                $item = array(
                    "title" => $title ? $title : $v->text(),
                    "href" => real_url($v->attr('href'), $this->request->base_url)
                );
                // Keyed by URL hash so the same link is stored only once.
                $this->article_list[md5($item['href'])] = $item;
            }
            $page_current = $html[$this->request->fetch_page_current];
            if ($page_current->next()->text()) {
                $page = ++ $param['page'];
                $this->_add_fetch_list_url($page);
            }

            phpQuery::unloadDocuments();
        }
    }
    /**
     * Crawl configuration read from the submitted form ($_POST).
     *
     * Obtained through request::getInstance(); the constructor is private.
     */
    class request{
        /* URL template of the list page ("%d" is replaced by the page number) */
        public $base_url;
        /* Cache directory, always located below this script's directory */
        public $cache_path;
        /* CSS selector of the elements to collect */
        public $fetch_item_query;
        /* CSS selector of the pager's "current page" element */
        public $fetch_page_current;

        static $instance;

        /**
         * Return the shared instance, creating it from $_POST on first use.
         *
         * @return request
         */
        static public function getInstance() {
            if (empty(self::$instance)) {
                self::$instance = new self;
            }

            return self::$instance;
        }

        private function __construct() {
            $this->_init_base();
        }

        /**
         * Populate the public properties from $_POST.
         *
         * Missing or non-scalar fields become '' instead of raising notices.
         * Empty, "." and ".." segments are dropped from cache_path so the
         * resulting directory cannot escape this script's directory.
         */
        function _init_base() {
            $segments = array();
            $raw_path = str_replace('\\', '/', self::_post_string('cache_path'));
            foreach (explode('/', $raw_path) as $segment) {
                if ($segment !== '' && $segment !== '.' && $segment !== '..') {
                    $segments[] = $segment;
                }
            }
            $this->cache_path = __DIR__ . '/' . ($segments ? implode('/', $segments) . '/' : '');

            $this->fetch_item_query = self::_post_string('fetch_item_query');
            $this->fetch_page_current = self::_post_string('fetch_page_current');
            $this->base_url = self::_post_string('url');
        }

        /**
         * Build the outgoing URL and POST payload from the submitted
         * "param[...]" rows.
         *
         * The original built these values and then discarded them; they are
         * now returned. Callers that ignore the return value are unaffected.
         *
         * @return array array('url' => string, 'param' => array, 'post_data' => array|null)
         */
        function request() {
            // NOTE(review): $auth was never defined in the original, so the
            // query always carried an empty "auth=" value. Kept as-is; supply
            // the real token here once its source is known.
            $auth = '';

            $base = self::_post_string('url');
            $format = (strpos($base, '?') !== false) ? "%s&auth=%s" : "%s?auth=%s";
            $url = sprintf($format, $base, $auth);

            // Group the submitted rows as $param[method][name] = value.
            $param = array();
            if (isset($_POST['param']) && is_array($_POST['param'])) {
                foreach($_POST['param'] as $k => $item) {
                    if ( ! is_array($item) || empty($item['method']) || empty($item['name'])) {
                        continue;
                    }
                    if ( ! is_string($item['method']) || ! is_string($item['name'])) {
                        continue;
                    }
                    $value = (isset($item['value']) && is_scalar($item['value'])) ? (string) $item['value'] : '';
                    $param[$item['method']][$item['name']] = $value;
                }
            }

            // GET parameters go on the query string, URL-encoded.
            if ( ! empty($param['get'])) {
                foreach ($param['get'] as $name => $value) {
                    $url = sprintf("%s&%s=%s", $url, urlencode($name), urlencode($value));
                }
            }

            $post_data = null;
            if ( ! empty($param['post'])) {
                $post_data = $param['post'];
            }

            return array('url' => $url, 'param' => $param, 'post_data' => $post_data);
        }

        /**
         * Read one $_POST field as a string.
         *
         * @param string $key
         * @return string '' when the field is missing or not a scalar.
         */
        private static function _post_string($key) {
            return (isset($_POST[$key]) && is_scalar($_POST[$key])) ? (string) $_POST[$key] : '';
        }
    }
    ?>
     
     
     
     
    <?php
    if ( ! isset($_POST['submit'])) {
        // First visit (nothing submitted yet): seed $_POST with an example
        // configuration so the form further down renders pre-filled.
        $_POST['url'] = 'http://www.oschina.net/code/tag/php?show=time&lang=&catalog=&p=%d';
        $_POST['cache_path'] = 'oschina';
        $_POST['fetch_item_query'] = '.code_list ul li .code_title > a';
        $_POST['fetch_page_current'] = '.pager li.current';
    } else {
        // Form submitted: crawl the list pages and time the run.
        $request = request::getInstance();
        $myCurl = new myCurl($request);

        myDebug::set_start();
        $myCurl->fetch_list();
        // Fetching each collected article is currently switched off:
        //$myCurl->fetch_article();
        myDebug::set_end();
    }
    ?>
     
     
     
     
    <html lang="zh-CN">
    <head>
        <meta charset="utf-8">
        <title>页面爬虫</title>
        <link href="http://cdn.bootcss.com/bootstrap/3.2.0/css/bootstrap.min.css" rel="stylesheet">
        <link href="http://cdn.bootcss.com/font-awesome/4.1.0/css/font-awesome.min.css" rel="stylesheet">
        <link href="http://static.bootcss.com/www/assets/css/site.min.css?v3" rel="stylesheet">
        <link href="http://static.bootcss.com/www/assets/ico/favicon.png" rel="shortcut icon">
        <script src="http://cdn.bootcss.com/jquery/1.11.1/jquery.min.js"></script>
    </head>
    <body>
    <div class="container">
            <div class="row row-offcanvas row-offcanvas-right">
                <div class="col-xs-12 col-sm-12">
                    <div class="row" >
                        <div class="col-xs-1 col-lg-4">
                            <h1>页面爬虫</h1>
                            <div class="thumbnail">
                            <form class="form-signin" action="" method="post">
                                <b>请填URL</b>:
                                <input value="<?php echo isset($_POST['url'])?$_POST['url']:'';?>" class="form-control" placeholder="填写完整地址,以http://开头" type="text" name="url" required><br>
                                <b>请填缓存文件路径</b>:
                                <input value="<?php echo isset($_POST['url'])?$_POST['cache_path']:'';?>" class="form-control" placeholder="填写缓存文件路径" type="text" name="cache_path" required><br>
                                <b>请填获取元素的CSS选择器</b>:
                                <input value="<?php echo isset($_POST['url'])?$_POST['fetch_item_query']:'';?>" class="form-control" placeholder="填写获取元素的CSS选择器" type="text" name="fetch_item_query" required><br>
                                <b>请填分页当前页面元素的CSS选择器</b>:
                                <input value="<?php echo isset($_POST['url'])?$_POST['fetch_page_current']:'';?>" class="form-control" placeholder="填写分页当前页面元素的CSS选择器" type="text" name="fetch_page_current" required><br>
                                <?php if (isset($_POST['param']) && !empty($_POST['param'])) :?>
                                    <?php foreach ($_POST['param'] as $k => $item) :?>
                                        <?php if (!empty($item['method']) && !empty($item['name'])) :?>
                                            <div class="thumbnail">
                                                <b>参数name</b>:
                                                <input value="<?php echo $item['name'];?>" placeholder="请填写" type="text" name="param[<?php echo $k;?>][name]"><br>
                                                <b>参数value</b>:
                                                <input value="<?php echo $item['value'];?>" placeholder="请填写" type="text" name="param[<?php echo $k;?>][value]"><br>
                                                <b>请求方式</b>:
                                                <label><input <?php if($item['method']=='get'):?>checked<?php endif;?> value="get" type="radio" name="param[<?php echo $k;?>][method]">get</label>
                                                <label><input <?php if($item['method']=='post'):?>checked<?php endif;?> value="post" type="radio" name="param[<?php echo $k;?>][method]">post</label><br />
                                                <a href="#" onclick="del_param(this)">删除</a>
                                            </div>
                                        <?php endif;?>
                                    <?php endforeach;?>
                                <?php endif;?>
                                 
                                <input type="button" name="add_param" id="add_param" value="添加参数" class="btn btn-lg btn-primary btn-block"><br />
                                <input type="submit" name="submit" value="下载" class="btn btn-lg btn-primary btn-block"><br />
                            </form>
                            </div>
                        </div>
                        <div class="col-xs-1 col-lg-8">
                            <?php
                                if (isset($_POST['submit'])) {
                                    echo "<pre>";
                                    echo "请求时间:";
                                    var_dump(myDebug::report());
                                     
                                    echo "<br />请求url:";
                                    isset($request->base_url) && var_dump($request->base_url);
                                     
                                    echo "<br />请求参数:";
                                    isset($param) && var_dump($param);
                                     
                                    echo "<hr />结果:";
                                    var_dump($myCurl->fetch());
                                     
                                    echo "</pre>";
                                }
                            ?>
                        </div>
                    </div>
                </div>
            </div>
            <hr />
        </div>
        <div class="blog-masthead">
            <div class="container">
                <nav class="blog-nav">
                    <p class="blog-nav-item">&copy; Company 2014</p>
                </nav>
            </div>
        </div>
    </body>
    </html>
     
    <script>
        $("#add_param").click(function(){
            var input_len = $("form input").size();
            input_len++;
            $(this).before('
                <div class="thumbnail">
                    <b>参数name</b>:
                    <input value="" placeholder="请填写" type="text" name="param['+ input_len +'][name]"><br>
                    <b>参数value</b>:
                    <input value="" placeholder="请填写" type="text" name="param['+ input_len +'][value]"><br>
                    <b>请求方式</b>:
                    <label><input checked value="get" type="radio" name="param['+ input_len +'][method]">get</label>
                    <label><input value="post" type="radio" name="param['+ input_len +'][method]">post</label><br />
                    <a href="#" onclick="del_param(this)">删除</a>
                </div>
            ');
        });
        function del_param(obj) {
            $(obj).parent().remove();
        }
    </script>

  • 相关阅读:
    mysql对表操作的各种语句
    Map遍历两种方式
    hibernate3
    Spring、mybaits整合
    mybaits注解
    mybaits 框架运用
    mybatis入门
    限制文本框字符数
    Unity3D Mathf函数
    Unity3d 粒子工具注释
  • 原文地址:https://www.cnblogs.com/lemon66/p/4112520.html
Copyright © 2011-2022 走看看