zoukankan      html  css  js  c++  java
  • elasticsearch2.x ik插件

    先来一个标准分词(standard),配置如下:

    curl -XPUT localhost:9200/local -d '{
    
        "settings" : {
    
            "analysis" : {
    
                "analyzer" : {
    
                    "stem" : {
    
                        "tokenizer" : "standard",
    
                        "filter" : ["standard", "lowercase", "stop", "porter_stem"]
    
                    }
    
                }
    
            }
    
        },
    
        "mappings" : {
    
            "article" : {
    
                "dynamic" : true,
    
                "properties" : {
    
                    "title" : {
    
                        "type" : "string",
    
                        "analyzer" : "stem"
    
                    }
    
                }
    
            }
    
        }
    
    }'

    index:local

    type:article

    default analyzer:stem (filter:小写、停用词等)

    field:title  

    测试:

    # Index Data
    
    curl -XPUT localhost:9200/local/article/1 -d'{"title": "Fight for your life"}'
    
    curl -XPUT localhost:9200/local/article/2 -d'{"title": "Fighting for your life"}'
    
    curl -XPUT localhost:9200/local/article/3 -d'{"title": "My dad fought a dog"}'
    
    curl -XPUT localhost:9200/local/article/4 -d'{"title": "Bruno fights Tyson tomorrow"}'
    
      
    
    # search on the title field, which is stemmed on index and search
    
    curl -XGET localhost:9200/local/_search?q=title:fight
    
      
    
    # searching on _all will not do any stemming, unless also configured on the mapping to be stemmed...
    
    curl -XGET localhost:9200/local/_search?q=fight

    例如:

    Fight for your life

    分词如下:

    {"tokens":[
    
    {"token":"fight","start_offset":1,"end_offset":6,"type":"<ALPHANUM>","position":1},
    {"token":"your","start_offset":11,"end_offset":15,"type":"<ALPHANUM>","position":3},
    {"token":"life","start_offset":16,"end_offset":20,"type":"<ALPHANUM>","position":4}
    
    ]}

    部署ik分词器

    在elasticsearch.yml中配置  index.analysis.analyzer.ik.type : "ik"

    delete之前创建的index,重新配置如下:

    curl -XPUT localhost:9200/local -d '{
    
        "settings" : {
    
            "analysis" : {
    
                "analyzer" : {
    
                    "ik" : {
    
                        "tokenizer" : "ik"
    
                    }
    
                }
    
            }
    
        },
    
        "mappings" : {
    
            "article" : {
    
                "dynamic" : true,
    
                "properties" : {
    
                    "title" : {
    
                        "type" : "string",
    
                        "analyzer" : "ik"
    
                    }
    
                }
    
            }
    
        }
    
    }'

    测试:

    curl 'http://localhost:9200/local/_analyze?analyzer=ik&pretty=true' -d'  
    
    {  
    
        "text":"中华人民共和国国歌" 
    
    }  
    
    '  
    
    {
    
      "tokens" : [ {
    
        "token" : "text",
    
        "start_offset" : 12,
    
        "end_offset" : 16,
    
        "type" : "ENGLISH",
    
        "position" : 1
    
      }, {
    
        "token" : "中华人民共和国",
    
        "start_offset" : 19,
    
        "end_offset" : 26,
    
        "type" : "CN_WORD",
    
        "position" : 2
    
      }, {
    
        "token" : "国歌",
    
        "start_offset" : 26,
    
        "end_offset" : 28,
    
        "type" : "CN_WORD",
    
        "position" : 3
    
      } ]
    
    }

    如果我们想返回最细粒度的分词结果,需要在elasticsearch.yml中配置如下:

    index:
    
      analysis:
    
        analyzer:
    
          ik:
    
              alias: [ik_analyzer]
    
              type: org.elasticsearch.index.analysis.IkAnalyzerProvider
    
          ik_smart:
    
              type: ik
    
              use_smart: true
    
          ik_max_word:
    
              type: ik
    
              use_smart: false

    测试:

    curl 'http://localhost:9200/index/_analyze?analyzer=ik_max_word&pretty=true' -d'  
    
    {  
    
        "text":"中华人民共和国国歌" 
    
    }  
    
    '  
    
    {
    
      "tokens" : [ {
    
        "token" : "text",
    
        "start_offset" : 12,
    
        "end_offset" : 16,
    
        "type" : "ENGLISH",
    
        "position" : 1
    
      }, {
    
        "token" : "中华人民共和国",
    
        "start_offset" : 19,
    
        "end_offset" : 26,
    
        "type" : "CN_WORD",
    
        "position" : 2
    
      }, {
    
        "token" : "中华人民",
    
        "start_offset" : 19,
    
        "end_offset" : 23,
    
        "type" : "CN_WORD",
    
        "position" : 3
    
      }, {
    
        "token" : "中华",
    
        "start_offset" : 19,
    
        "end_offset" : 21,
    
        "type" : "CN_WORD",
    
        "position" : 4
    
      }, {
    
        "token" : "华人",
    
        "start_offset" : 20,
    
        "end_offset" : 22,
    
        "type" : "CN_WORD",
    
        "position" : 5
    
      }, {
    
        "token" : "人民共和国",
    
        "start_offset" : 21,
    
        "end_offset" : 26,
    
        "type" : "CN_WORD",
    
        "position" : 6
    
      }, {
    
        "token" : "人民",
    
        "start_offset" : 21,
    
        "end_offset" : 23,
    
        "type" : "CN_WORD",
    
        "position" : 7
    
      }, {
    
        "token" : "共和国",
    
        "start_offset" : 23,
    
        "end_offset" : 26,
    
        "type" : "CN_WORD",
    
        "position" : 8
    
      }, {
    
        "token" : "共和",
    
        "start_offset" : 23,
    
        "end_offset" : 25,
    
        "type" : "CN_WORD",
    
        "position" : 9
    
      }, {
    
        "token" : "国",
    
        "start_offset" : 25,
    
        "end_offset" : 26,
    
        "type" : "CN_CHAR",
    
        "position" : 10
    
      }, {
    
        "token" : "国歌",
    
        "start_offset" : 26,
    
        "end_offset" : 28,
    
        "type" : "CN_WORD",
    
        "position" : 11
    
      } ]
    
    }
  • 相关阅读:
    有向无环图描述表达式 (a+b)*((a+b)/a)至少需要的顶点数目
    图的深度优先是 树的先序遍历的推广。 广度优先
    leader failed to send out heartbeat on time; took too long, leader is overloaded likely from slow disk
    优化 并发
    最终一致和强一致性缓存实践
    Go语言逆向初探 原创 半个大西瓜 看雪学院 2021-06-23
    中通快递的云原生改造之路 施尧 InfoQ 2021-06-23
    Stable Minimum Storage Merging by Symmetric Comparisons Pok-Son Kim1? and Arne Kutzner2
    A Method for the Construction of Minimum-Redundancy Codes
    滴滴基于Binlog的采集架构与实践 原创 滴滴技术团队 滴滴技术 2021-06-22
  • 原文地址:https://www.cnblogs.com/jiu0821/p/5625578.html
Copyright © 2011-2022 走看看