zoukankan      html  css  js  c++  java
  • php爬虫 phpspider

    <?php
    /**
     * Single-page scraping example built on the phpspider helpers.
     *
     * Downloads one article page, extracts the title, author, publication
     * time and body through XPath selectors, then inserts the article into
     * the "content" table and dumps the insert result.
     *
     * Created by PhpStorm.
     * User: brady
     * Date: 2016/12/9
     * Time: 17:32
     */
    ini_set('memory_limit', '1024M');
    require __DIR__ . '/../core/init.php';

    $url = 'http://www.epooll.com/archives/806/';
    $html = requests::get($url);
    // Stop early when the download produced nothing to parse.
    if (!is_string($html) || $html === '') {
        exit("Failed to download {$url}" . PHP_EOL);
    }

    // XPath expressions are single-quoted so the inner double quotes around
    // the id value do not terminate the PHP string literal.

    // Extract the article title.
    $selector = '//*[@id="content"]/div[1]/div[1]/h1/a';
    $title = selector::select($html, $selector);
    // Verify that a title was actually extracted.
    if ($title === null || $title === false || $title === '') {
        exit("No title found at {$url}" . PHP_EOL);
    }

    // Extract the article author.
    $selector = '//*[@id="content"]/div[1]/div[1]/h6/span[1]';
    $author = selector::select($html, $selector);
    // Strip the "作者:" label that precedes the author name.
    $author = str_replace("作者:", "", $author);

    // Extract the publication time.
    $selector = '//*[@id="content"]/div[1]/div[1]/h6/span[2]';
    $time = selector::select($html, $selector);
    $time = str_replace("发布时间:", '', $time);
    // strtotime() returns false for unparseable input; avoid formatting that
    // as a bogus 1970 timestamp and keep null instead.
    $timestamp = is_string($time) ? strtotime(trim($time)) : false;
    $time = $timestamp === false ? null : date("Y-m-d H:i:s", $timestamp);
    // NOTE(review): $time is extracted but not stored; the original script
    // never added it to the insert data and the matching column name is not
    // visible here. Confirm the schema before adding it.

    // Extract the article body.
    $selector = '//*[@id="content"]/div[1]/div[2]';
    $content = selector::select($html, $selector);
    // Verify that the body was actually extracted.
    if ($content === null || $content === false || $content === '') {
        exit("No content found at {$url}" . PHP_EOL);
    }

    $data = [
        'article_title' => $title,
        'article_author' => $author,
        'article_content' => $content,
    ];
    // Insert the article and dump the result to confirm the data is sane.
    $res = db::insert("content", $data);
    var_dump($res);
    

      

  • 相关阅读:
    三范式
    解决Linux下乱码
    ER概念模型
    20140607
    PHP Fatal error: Class 'Yaf_Application' not found
    PHP流式读取XML文件
    php反射的使用
    wget 和curl 进行post数据
    crontab
    Leetcode OJ: Gray Code
  • 原文地址:https://www.cnblogs.com/brady-wang/p/6150558.html
Copyright © 2011-2022 走看看