zoukankan      html  css  js  c++  java
  • phpspider PHP 爬虫

    * 通过composer下载

    composer require owner888/phpspider
    

    // composer.json

    {
        "require": {
            "owner888/phpspider": "^2.1"
        }
    }
    

      

    * 去掉讨厌的注释

       https://doc.phpspider.org/demo-start.html

     ./vendor/owner888/phpspider/core/phpspider.php

    /* Do NOT delete this comment */
            // 彩蛋
            $included_files = get_included_files();
            $content = file_get_contents($included_files[0]);
            if (!preg_match("#/\* Do NOT delete this comment \*/#", $content) || !preg_match("#/\* 不要删除这段注释 \*/#", $content))
            {
                $msg = "Unknown error...";
                log::error($msg);
                exit;
            }
    

     删掉这段恶心的代码

      * 导入数据库文件

        

    cd ./vendor/owner888/phpspider/demo
    

      

    mysql -uroot -hlocalhost -p
    

      

    create database demo charset utf8 collate utf8_general_ci;
    \. qiushibaike.sql

      

    # ************************************************************
    # Sequel Pro SQL dump
    # Version 4541
    #
    # http://www.sequelpro.com/
    # https://github.com/sequelpro/sequelpro
    #
    # Host: 127.0.0.1 (MySQL 5.7.14)
    # Database: demo
    # Generation Time: 2016-10-20 16:55:11 +0000
    # ************************************************************
    
    
    # Save the current session settings into @OLD_* user variables and switch
    # to dump-friendly values (utf8 names, foreign-key checks off, SQL notes off).
    # The /*!NNNNN ... */ form is a MySQL version-gated comment: the enclosed
    # statement runs only on servers at or above that version number.
    /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
    /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
    /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
    /*!40101 SET NAMES utf8 */;
    /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
    /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
    /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
    
    
    # Dump of table content
    # ------------------------------------------------------------
    
    # Importing this file is destructive: any existing `content` table
    # (and its rows) is dropped before being recreated empty.
    DROP TABLE IF EXISTS `content`;
    
    # Table `content`: one row per stored record, keyed by auto-increment `id`.
    # Every other column is nullable. `article_publish_time` is an integer
    # column, not a DATETIME.
    # NOTE(review): the article_* columns appear to correspond to the spider's
    # extracted field names, and `depth`/`url` are presumably filled in by
    # phpspider itself on export -- confirm against the library.
    CREATE TABLE `content` (
      `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
      `depth` int(11) DEFAULT NULL,
      `url` varchar(200) DEFAULT NULL,
      `article_title` varchar(20) DEFAULT NULL,
      `article_headimg` varchar(150) DEFAULT NULL,
      `article_author` varchar(20) DEFAULT NULL,
      `article_content` text,
      `article_publish_time` int(10) DEFAULT NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    
    
    
    
    # Restore the session settings saved at the top of the dump.
    /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
    /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
    /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
    /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
    /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
    /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
    View Code

    * 创建./index.php

    <?php
    // Entry script: configures a phpspider crawl of qiushibaike.com and writes
    // the extracted fields into the `content` table of the local `demo` database.
    require './vendor/autoload.php';
    
    // The namespace separators are required; without them the class cannot be
    // resolved by Composer's autoloader.
    use phpspider\core\phpspider;
    
    $configs = [
        'name' => '糗事百科',
        'domains' => [
            'qiushibaike.com',
            'www.qiushibaike.com'
        ],
        // Entry URL(s) the crawl starts from.
        'scan_urls' => [
            'http://www.qiushibaike.com/'
        ],
        // URL patterns for article (content) pages. Single-quoted so the
        // backslash in \d reaches the regex engine unchanged.
        'content_url_regexes' => [
            'http://www.qiushibaike.com/article/\d+'
        ],
        // URL patterns for list pages. "\?" matches the literal "?" that
        // starts the query string; an unescaped "?" would be a quantifier.
        'list_url_regexes' => [
            'http://www.qiushibaike.com/8hr/page/\d+\?s=\d+'
        ],
        'fields' => [
            [
                // Extract the article body from the content page.
                'name' => "article_content",
                'selector' => "//*[@id='single-next-link']",
                'required' => true
            ],
            [
                // Extract the article author from the content page.
                'name' => "article_author",
                'selector' => "//div[contains(@class,'author')]//h2",
                'required' => true
            ],
        ],
        'log_show' => true,
        'input_encoding' => 'utf-8',
        'output_encoding' => 'utf-8',
        // Connection settings for the `demo` database created earlier.
        'db_config' => [
            'host' => '127.0.0.1',
            'user' => 'root',
            'pass' => '',
            'name' => 'demo',
            'port' => 3306
        ],
        // Alternative export target: write SQL statements to a file instead
        // of inserting into the database directly.
        /*
        'export' => [
            'type' => 'sql',
            'file' => './data/sql/qiushibaike.sql'
        ]
        */
        // Insert extracted rows into the `content` table.
        'export' => [
            'type' => 'db',
            'table' => 'content',
        ]
    ];
    
    $spider = new phpspider($configs);
    $spider->start();
    

      

    * Run

    php ./index.php 
    

      

  • 相关阅读:
    关于并发量的简单计算公式
    kbmmw中向服务器端传递对象的一种简单方式
    tms web core 里面调用pascal 过程。
    tms web core 通过URL 传递参数
    tms web core 与 kbmmw 第一次亲密接触
    kbmmw 的HTTPSmartService 上传文件到服务器端
    kbmmw 中的进程管理小工具
    kbmmw 5.06.20 发布
    kbmmw ORM 对象定义语法简析
    kbmmw 5.06.00 beta 发布
  • 原文地址:https://www.cnblogs.com/mingzhanghui/p/9311283.html
Copyright © 2011-2022 走看看