zoukankan      html  css  js  c++  java
  • sphinx安装

    相关命令及步骤
        创建主索引:
            /usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/csft.conf --all
        创建增量索引:
            1. 创建测试数据表以及数据
            2. 修改配置文件
                主索引源:sql_query_pre
                增量索引源:sql_query_pre  sql_query  sql_query_post
                主索引:source path
                增量索引:source path
            3. 创建/更新主索引
            /usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/csft.conf test1
            4. 创建/更新增量索引
            /usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/csft.conf delta
        重启索引进程
            /usr/local/coreseek/bin/searchd --stop
            /usr/local/coreseek/bin/searchd -c /usr/local/coreseek/etc/csft.conf
        索引合并
            /usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/csft.conf --merge test1 delta --rotate


    csft.conf配置文件
        source src1
        {
            type                    = mysql
            sql_host                = 127.0.0.1
            sql_user                = root
            sql_pass                =
            sql_db                  = test
            sql_port                = 3306  # optional, default is 3306

            sql_query_pre           = SET NAMES utf8
            sql_query_pre           = REPLACE INTO sph_counter SELECT 1, MAX(id) FROM documents

            sql_query               =
                SELECT id, group_id, UNIX_TIMESTAMP(date_added) AS date_added, title, content
                FROM documents

            sql_attr_uint           = group_id

            sql_attr_timestamp      = date_added

            sql_ranged_throttle = 0

            sql_query_info      = SELECT * FROM documents WHERE id=$id

        }

        index test1
        {
            source          = src1

            path            = /usr/local/coreseek/var/data/test1

            docinfo         = extern

            mlock           = 0

            morphology      = none

            stopwords           = /usr/local/coreseek/var/data/test1/stopwords.txt

            wordforms           = /usr/local/coreseek/var/data/test1/wordforms.txt

            min_word_len        = 1

            charset_type        = sbcs

            html_strip              = 0

        }

        source delta : src1
        {
            sql_query_pre = SET NAMES utf8
            sql_query = SELECT
                            id, group_id, UNIX_TIMESTAMP(date_added) AS date_added, title, content
                        FROM documents
                        WHERE
                            id>( SELECT max_doc_id FROM sph_counter WHERE counter_id=1 )
            sql_query_post = UPDATE sph_counter SET max_doc_id=(SELECT MAX(id) FROM documents) where counter_id=1
        }

        index delta : test1
        {
            source = delta
            path = /usr/local/coreseek/var/data/test1
        }


    创建mysql测试数据表及数据
        CREATE TABLE `documents` (`id` int(11) NOT NULL auto_increment,`group_id` int(11) NOT NULL,`group_id2` int(11) NOT NULL,`date_added` datetime NOT NULL,`title` varchar(255) NOT NULL,`content` text NOT NULL,PRIMARY KEY (`id`)) ENGINE=InnoDB AUTO_INCREMENT=5;

        INSERT INTO `documents` VALUES ('1', '1', '5', '2008-09-13 21:37:47', 'test one', 'this is my test document number one. also checking search within phrases.');INSERT INTO `documents` VALUES ('2', '1', '6', '2008-09-13 21:37:47', 'test two', 'this is my test document number two');INSERT INTO `documents` VALUES ('3', '2', '7', '2008-09-13 21:37:47', 'another doc', 'this is another group');INSERT INTO `documents` VALUES ('4', '2', '8', '2008-09-13 21:37:47', 'doc number four', 'this is to test groups');

        // 实现增量索引时使用的计数表
        CREATE TABLE sph_counter( counter_id INTEGER PRIMARY KEY NOT NULL, max_doc_id INTEGER NOT NULL);



    PHP使用

        <?php

        header("Content-type: text/html; charset=utf-8");

        require_once('sphinxapi.php');

        $s = new SphinxClient();

        $s->setServer("127.0.0.1", 9312);
        $s->setArrayResult(true);
        $s->setMatchMode(SPH_MATCH_ALL);

        $keyword = 'test';

        $result = $s->Query($keyword, '*');
        if ($result['total'] == 0) {
            echo '无搜索结果';die;
        }

        // 获取结果id集
        $ids = array();
        foreach($result['matches'] as $key => $val)
        {
            $ids[] = $val['id'];
        }
        print_r($ids);

        // 连接数据库
        $dsn = "mysql:host=localhost;dbname=test;charset=utf8";
        $db = new PDO($dsn, 'root', '');

        $sql = 'select * from documents where id in('.implode(',', $ids).')';
        $result = $db->query($sql);
        $result->setFetchMode(PDO::FETCH_ASSOC);

        $data = $result->fetchAll();

        // 搜索结果高亮显示
        $rule = array(
                    "before_match" => "<font style='font-weight:bold;color:#f00'>",
                    "after_match" => "</font>"
                );
        foreach ($data as $key=>$val) {
            $data[$key] = $s->BuildExcerpts($val, 'delta', $keyword, $rule);
        }

        print_r($data);



    添加新分词
        1. 复制unigram.txt文件为unigram_new.txt
        2. 在unigram_new.txt中添加新词
        3. 生成新的词典文件:/usr/local/mmseg3/bin/mmseg -u /usr/local/mmseg3/etc/unigram_new.txt
        4. 替换原有的uni.lib文件
        5. 重建索引 && 重启索引进程

  • 相关阅读:
    算法学习【第2篇】:列表查找以及二分查找
    算法学习【第1篇】:算法之基础
    九、爬虫框架之Scrapy
    八、asynicio模块以及爬虫应用asynicio模块(高性能爬虫)
    第七篇:爬虫实战— 4、爬取校花网视频示例(点开往下拉)
    第七篇:爬虫实战— 3、自动登录123并且自动发送邮箱;自动爬取京东商品信息
    第七篇:爬虫实战—2、投递拉钩网简历
    第七篇:爬虫实战--- 1、破解滑动验证码
    Ubuntu安装JDK与环境变量配置
    显示 Ubuntu 11.10 的 终端窗口
  • 原文地址:https://www.cnblogs.com/haoyu521/p/5607780.html
Copyright © 2011-2022 走看看