zoukankan      html  css  js  c++  java
  • elasticsearch ik中文分词器的使用详解

    (基于es5.4)先瞄几眼github,按照步骤安装好分词器 link:https://github.com/medcl/elasticsearch-analysis-ik

    复习一下常用的操作

    1.查看集群健康状况
    GET /_cat/health?v&pretty
    
    2.查看my_index的mapping和setting的相关信息
    GET /my_index?pretty
    
    3.查看所有的index
    GET /_cat/indices?v&pretty
    
    4.删除 my_index_new
    DELETE /my_index_new?pretty

    先测试ik分词器的基本功能

    GET _analyze?pretty
    {
      "analyzer": "ik_smart",
      "text": "中华人民共和国国歌"
    }

    结果:

    {
      "tokens": [
        {
          "token": "中华人民共和国",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 0
        },
        {
          "token": "国歌",
          "start_offset": 7,
          "end_offset": 9,
          "type": "CN_WORD",
          "position": 1
        }
      ]
    }

    可以看出:ik_smart很智能地将"中华人民共和国国歌"进行了正确的分词。

    另外一个例子:

    GET _analyze?pretty
    {
      "analyzer": "ik_smart",
      "text": "王者荣耀是最好玩的游戏"
    }

    结果:

    {
      "tokens": [
        {
          "token": "王者荣耀",
          "start_offset": 0,
          "end_offset": 4,
          "type": "CN_WORD",
          "position": 0
        },
        {
          "token": "最",
          "start_offset": 5,
          "end_offset": 6,
          "type": "CN_CHAR",
          "position": 1
        },
        {
          "token": "好玩",
          "start_offset": 6,
          "end_offset": 8,
          "type": "CN_WORD",
          "position": 2
        },
        {
          "token": "游戏",
          "start_offset": 9,
          "end_offset": 11,
          "type": "CN_WORD",
          "position": 3
        }
      ]
    }

    如果你的结果跟我的不一样,那就对了:ik默认的中文词库会把“王者荣耀”拆开分词,但我们不希望它被拆开,这时可以根据github上面的指示进行配置

    IKAnalyzer.cfg.xml 目录在:elasticsearch-5.4.0/plugins/ik/config

    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
    <properties>
        <comment>IK Analyzer 扩展配置</comment>
        <!--用户可以在这里配置自己的扩展字典 -->
        <entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
        <!--用户可以在这里配置自己的扩展停止词字典-->
        <entry key="ext_stopwords">custom/ext_stopword.dic</entry>
        <!--用户可以在这里配置远程扩展字典,下面是配置在nginx路径下面的 -->
        <entry key="remote_ext_dict">http://tagtic-slave01:82/HotWords.php</entry>
        <!--用户可以在这里配置远程扩展停止词字典-->
        <!-- <entry key="remote_ext_stopwords">words_location</entry> -->
        <entry key="remote_ext_stopwords">http://tagtic-slave01:82/StopWords.php</entry>
    </properties>

    可以看到HotWords.php

    <?php 
    $s = <<<'EOF'
    王者荣耀
    阴阳师
    EOF;
    header("Content-type: text/html; charset=utf-8"); 
    header('Last-Modified: '.gmdate('D, d M Y H:i:s', time()).' GMT', true, 200);
    header('ETag: "5816f349-19"');
    echo $s;
    ?>

    配置完了之后就可以看到刚才的结果了

    顺便测试一下ik_max_word

    GET /index/_analyze?pretty
    {
      "analyzer": "ik_max_word",
      "text": "中华人民共和国国歌"
    }

    结果看看就行了

    {
      "tokens": [
        {
          "token": "中华人民共和国",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 0
        },
        {
          "token": "中华人民",
          "start_offset": 0,
          "end_offset": 4,
          "type": "CN_WORD",
          "position": 1
        },
        {
          "token": "中华",
          "start_offset": 0,
          "end_offset": 2,
          "type": "CN_WORD",
          "position": 2
        },
        {
          "token": "华人",
          "start_offset": 1,
          "end_offset": 3,
          "type": "CN_WORD",
          "position": 3
        },
        {
          "token": "人民共和国",
          "start_offset": 2,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 4
        },
        {
          "token": "人民",
          "start_offset": 2,
          "end_offset": 4,
          "type": "CN_WORD",
          "position": 5
        },
        {
          "token": "共和国",
          "start_offset": 4,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 6
        },
        {
          "token": "共和",
          "start_offset": 4,
          "end_offset": 6,
          "type": "CN_WORD",
          "position": 7
        },
        {
          "token": "国",
          "start_offset": 6,
          "end_offset": 7,
          "type": "CN_CHAR",
          "position": 8
        },
        {
          "token": "国歌",
          "start_offset": 7,
          "end_offset": 9,
          "type": "CN_WORD",
          "position": 9
        }
      ]
    }

     再看看github上面的一个例子

    POST /index/fulltext/_mapping
    {
      "fulltext": {
        "_all": {
          "analyzer": "ik_smart"
        },
        "properties": {
          "content": {
            "type": "text"
          }
        }
      }
    }

    存一些值

    POST /index/fulltext/1
    {
      "content": "美国留给伊拉克的是个烂摊子吗"
    }
    
    POST /index/fulltext/2
    {
      "content": "公安部:各地校车将享最高路权"
    }
    
    POST /index/fulltext/3
    {
      "content": "中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"
    }
    
    POST /index/fulltext/4
    {
      "content": "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"
    }

    取值

    POST /index/fulltext/_search
    {
      "query": {
        "match": {
          "content": "中国"
        }
      }
    }

    结果

    {
      "took": 3,
      "timed_out": false,
      "_shards": {
        "total": 5,
        "successful": 5,
        "failed": 0
      },
      "hits": {
        "total": 3,
        "max_score": 1.0869478,
        "hits": [
          {
            "_index": "index",
            "_type": "fulltext",
            "_id": "4",
            "_score": 1.0869478,
            "_source": {
              "content": "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"
            }
          },
          {
            "_index": "index",
            "_type": "fulltext",
            "_id": "3",
            "_score": 0.61094594,
            "_source": {
              "content": "中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"
            }
          },
          {
            "_index": "index",
            "_type": "fulltext",
            "_id": "1",
            "_score": 0.27179778,
            "_source": {
              "content": "美国留给伊拉克的是个烂摊子吗"
            }
          }
        ]
      }
    }

    es会按照分词进行索引,然后根据你的查询条件按照分数的高低给出结果

    官网有一个例子,可以学习学习:https://github.com/medcl/elasticsearch-analysis-ik


    看另一个有趣的例子

    PUT /index1
    {
      "settings": {
         "refresh_interval": "5s",
         "number_of_shards" :   1, 
         "number_of_replicas" : 0 
      },
      "mappings": {
        "_default_":{
          "_all": { "enabled":  false } 
        },
        "resource": {
          "dynamic": false, 
          "properties": {
            "title": {
              "type": "text",
              "fields": {
                "cn": {
                  "type": "text",
                  "analyzer": "ik_smart"
                },
                "en": {
                  "type": "text",
                  "analyzer": "english"
                }
              }
            }
          }
        }
      }
    }

    fields的作用有二:

    1.比如一个string类型可以映射成text类型来进行全文检索,同时映射成keyword类型用于排序和聚合;
    2.相当于给同一个字段起了个别名,不同的别名可以使用不同的分词器

    批量插入值

    POST /_bulk
    { "create": { "_index": "index1", "_type": "resource", "_id": 1 } }
    { "title": "周星驰最新电影" }
    { "create": { "_index": "index1", "_type": "resource", "_id": 2 } }
    { "title": "周星驰最好看的新电影" }
    { "create": { "_index": "index1", "_type": "resource", "_id": 3 } }
    { "title": "周星驰最新电影,最好,新电影" }
    { "create": { "_index": "index1", "_type": "resource", "_id": 4 } }
    { "title": "最最最最好的新新新新电影" }
    { "create": { "_index": "index1", "_type": "resource", "_id": 5 } }
    { "title": "I'm not happy about the foxes" }

    取值

    POST /index1/resource/_search
    {
      "query": {
        "multi_match": {
          "type":     "most_fields", 
          "query":    "fox",
          "fields": "title"
        }
      }
    }

    结果

    {
      "took": 1,
      "timed_out": false,
      "_shards": {
        "total": 1,
        "successful": 1,
        "failed": 0
      },
      "hits": {
        "total": 0,
        "max_score": null,
        "hits": []
      }
    }

    原因:这里是在title里面查询fox,而title使用的是standard标准分词器,被索引的词是foxes(不会被还原成fox),所以不会有结果。下面这种情况就会有结果了

    POST /index1/resource/_search
    {
      "query": {
        "multi_match": {
          "type":     "most_fields", 
          "query":    "fox",
          "fields": "title.en"
        }
      }
    }

    这次会有结果(结果就不列出来了),因为title.en使用的是english分词器,索引时会把foxes词干化为fox

    对比一下下面的输出,体会一下fields的使用

    GET /index1/resource/_search
    {
      "query": {
        "match": {
          "title.cn": "the最好游戏"
        }
      }
    }
    
    POST /index1/resource/_search
    {
      "query": {
        "multi_match": {
          "type":     "most_fields", 
          "query":    "the最新游戏",
          "fields": [ "title", "title.cn", "title.en" ]
        }
      }
    }
    
    POST /index1/resource/_search
    {
      "query": {
        "multi_match": {
          "type":     "most_fields", 
          "query":    "the最新",
          "fields": "title.cn"
        }
      }
    }

    根据结果体会体会用法


    下面使用“王者荣耀”做测试。这里可以看到前面配置的HotWords.php是一把双刃剑:将“王者荣耀”放在里面之后,“王者荣耀”这个词就是一个整体,不会被切分成“王者”和“荣耀”。但是如果就是要搜索“王者”怎么办呢?这里就体现出fields的强大了,具体看下面

    先存入数据

    POST /_bulk
    { "create": { "_index": "index1", "_type": "resource", "_id": 6 } }
    { "title": "王者荣耀最好玩的游戏" }
    { "create": { "_index": "index1", "_type": "resource", "_id": 7 } }
    { "title": "王者荣耀最好玩的新游戏" }
    { "create": { "_index": "index1", "_type": "resource", "_id": 8 } }
    { "title": "王者荣耀最新游戏,最好玩,新游戏" }
    { "create": { "_index": "index1", "_type": "resource", "_id": 9 } }
    { "title": "最最最最好的新新新新游戏" }
    { "create": { "_index": "index1", "_type": "resource", "_id": 10 } }
    { "title": "I'm not happy about the foxes" }

    查询

    POST /index1/resource/_search
    {
      "query": {
        "multi_match": {
          "type":     "most_fields", 
          "query":    "王者荣耀",
          "fields": "title.cn"
        }
      }
    }
    
    #下面会没有结果返回
    POST /index1/resource/_search
    {
      "query": {
        "multi_match": {
          "type":     "most_fields", 
          "query":    "王者",
          "fields": "title.cn"
        }
      }
    }
    
    POST /index1/resource/_search
    {
      "query": {
        "multi_match": {
          "type":     "most_fields", 
          "query":    "王者",
          "fields": "title"
        }
      }
    }

    对比结果就可以一目了然了,结果略!

    所以一开始就要对业务需求相当了解,才能设计出好的映射(mapping),搜索的时候也会省事不少

    参考:

    https://github.com/medcl/elasticsearch-analysis-ik

    http://keenwon.com/1404.html

    https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-standard-analyzer.html#_example_output

  • 相关阅读:
    关闭firefox的plugincheck
    C# 三个定时器区别
    数字图像处理学习 01 图像的几何变换
    C++ dll的创建和使用
    使用Log4Cplus+配置文件打印日志
    Bmp图像的数据格式及读取
    GCC的使用和Makefile的编写
    day03 QT学习 常用控件 QLabel QPushButton QLineEdit使用 QSS介绍以及QObject子对象的遍历
    day02 QT学习 字符集和中文乱码的问题
    day01 QT学习 信号槽和QWidget介绍
  • 原文地址:https://www.cnblogs.com/leixingzhi7/p/6903938.html
Copyright © 2011-2022 走看看