zoukankan      html  css  js  c++  java
  • Elasticsearch 中文分词器IK

    1、安装说明

    https://github.com/medcl/elasticsearch-analysis-ik

    2、release版本

    https://github.com/medcl/elasticsearch-analysis-ik/releases

    3、安装插件

    bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.5.1/elasticsearch-analysis-ik-6.5.1.zip
    [es@bigdata-senior01 elasticsearch-6.5.1]$ ll plugins/analysis-ik/
    总用量 1428
    -rw-r--r-- 1 es es 263965 12月 12 10:30 commons-codec-1.9.jar
    -rw-r--r-- 1 es es  61829 12月 12 10:30 commons-logging-1.2.jar
    -rw-r--r-- 1 es es  54693 12月 12 10:30 elasticsearch-analysis-ik-6.5.1.jar
    -rw-r--r-- 1 es es 736658 12月 12 10:30 httpclient-4.5.2.jar
    -rw-r--r-- 1 es es 326724 12月 12 10:30 httpcore-4.4.4.jar
    -rw-r--r-- 1 es es   1805 12月 12 10:30 plugin-descriptor.properties

    也可以自己下载包之后解压缩,copy到plugins下即可
    4、扩展词库

    在es目录下config/analysis-ik/中

    新建自己的词库,utf8编码

    mkdir mydic
    vi mydic/myword001.dic
    魔兽世界
    李云龙
    嫦娥

    修改配置文件

    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
    <properties>
    	<comment>IK Analyzer 扩展配置</comment>
    	<!--用户可以在这里配置自己的扩展字典 -->
    	<entry key="ext_dict">mydic/myword001.dic</entry>
    	 <!--用户可以在这里配置自己的扩展停止词字典-->
    	<entry key="ext_stopwords"></entry>
    	<!--用户可以在这里配置远程扩展字典 -->
    	<!-- <entry key="remote_ext_dict">words_location</entry> -->
    	<!--用户可以在这里配置远程扩展停止词字典-->
    	<!-- <entry key="remote_ext_stopwords">words_location</entry> -->
    </properties>

    官网说明:

    IKAnalyzer.cfg.xml can be located at {conf}/analysis-ik/config/IKAnalyzer.cfg.xml or {plugins}/elasticsearch-analysis-ik-*/config/IKAnalyzer.cfg.xml
    
    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
    <properties>
    	<comment>IK Analyzer 扩展配置</comment>
    	<!--用户可以在这里配置自己的扩展字典 -->
    	<entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
    	 <!--用户可以在这里配置自己的扩展停止词字典-->
    	<entry key="ext_stopwords">custom/ext_stopword.dic</entry>
     	<!--用户可以在这里配置远程扩展字典 -->
    	<entry key="remote_ext_dict">location</entry>
     	<!--用户可以在这里配置远程扩展停止词字典-->
    	<entry key="remote_ext_stopwords">http://xxx.com/xxx.dic</entry>
    </properties>

    测试:

    GET _analyze
    {
      "analyzer": "ik_smart",
      "text": "魔兽世界"
    }
    
    {
      "tokens" : [
        {
          "token" : "魔兽世界",
          "start_offset" : 0,
          "end_offset" : 4,
          "type" : "CN_WORD",
          "position" : 0
        }
      ]
    }
    GET _analyze
    {
      "analyzer": "ik_max_word",
      "text": "魔兽世界"
    }
    
    {
      "tokens" : [
        {
          "token" : "魔兽世界",
          "start_offset" : 0,
          "end_offset" : 4,
          "type" : "CN_WORD",
          "position" : 0
        },
        {
          "token" : "魔兽",
          "start_offset" : 0,
          "end_offset" : 2,
          "type" : "CN_WORD",
          "position" : 1
        },
        {
          "token" : "世界",
          "start_offset" : 2,
          "end_offset" : 4,
          "type" : "CN_WORD",
          "position" : 2
        }
      ]
    }
    ik_smart 是粗粒度分词,分过的词不再参与分词。
    ik_max_word 是细粒度分词,根据可能的词进行组合。

    5、使用分词
    5.1直接在settings里设置缺省的分词器
    PUT user
    {
      "settings": {
        "number_of_shards": 2,
        "number_of_replicas": 1,
        "index": {
          "analysis.analyzer.default.type": "ik_smart"
        }
      }
    }

    PUT bus3
    {
      "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "index": {
          "analysis.analyzer.default.type": "ik_max_word",
          "analysis.search_analyzer.default.type": "ik_smart"
        }
      }
    }

    GET /bus3/_settings
    返回:
    {
      "bus3" : {
        "settings" : {
          "index" : {
            "number_of_shards" : "1",
            "provided_name" : "bus3",
            "creation_date" : "1545318988048",
            "analysis" : {
              "analyzer" : {
                "default" : {
                  "type" : "ik_max_word"
                }
              },
              "search_analyzer" : {
                "default" : {
                  "type" : "ik_smart"
                }
              }
            },
            "number_of_replicas" : "0",
            "uuid" : "dOU8yi5pRdi-0Akq_zCWtw",
            "version" : {
              "created" : "6050199"
            }
          }
        }
      }
    }




     5.2 在mapping里对每个字段设置

    PUT bus
    {
      "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
      },
      "mappings": {
        "product":{
           "properties": {
          "name":{
            "type": "text",
            "analyzer": "ik_max_word",
            "search_analyzer": "ik_max_word"
          }
        }
        }
       
      }
    }
    GET bus/_mapping
    
    {
      "bus" : {
        "mappings" : {
          "product" : {
            "properties" : {
              "name" : {
                "type" : "text",
                "analyzer" : "ik_max_word"
              }
            }
          }
        }
      }
    }

    查询测试1:查询使用分词器ik_smart

    GET /bus/_search
    {
      "query": {
        "match": {
          "name": {
            "query": "公交车"
            , "analyzer": "ik_smart"
          }
        }
      },
      "highlight": {
        "fields": {"name": {}}
      }
    }
    
    返回:
    {
      "took" : 3,
      "timed_out" : false,
      "_shards" : {
        "total" : 1,
        "successful" : 1,
        "skipped" : 0,
        "failed" : 0
      },
      "hits" : {
        "total" : 5,
        "max_score" : 1.8566245,
        "hits" : [
          {
            "_index" : "bus",
            "_type" : "product",
            "_id" : "1",
            "_score" : 1.8566245,
            "_source" : {
              "name" : "公交车1路",
              "desc" : "从东站到西站",
              "price" : 10,
              "producer" : "东部公交",
              "tags" : [
                "普通",
                "单层"
              ],
              "memo" : "a test"
            },
            "highlight" : {
              "name" : [
                "<em>公交车</em>1路"
              ]
            }
          }
        ]
      }
    }

    查询测试2:查询使用分词器ik_max_word

    GET /bus/_search
    {
      "from": 0, "size": 1, 
      "query": {
        "match": {
          "name": {
            "query": "公交车"
            , "analyzer": "ik_max_word"
          }
        }
      },
      "highlight": {
        "fields": {"name": {}}
      }
    }
    返回:
    {
      "took" : 5,
      "timed_out" : false,
      "_shards" : {
        "total" : 1,
        "successful" : 1,
        "skipped" : 0,
        "failed" : 0
      },
      "hits" : {
        "total" : 5,
        "max_score" : 7.426498,
        "hits" : [
          {
            "_index" : "bus",
            "_type" : "product",
            "_id" : "1",
            "_score" : 7.426498,
            "_source" : {
              "name" : "公交车1路",
              "desc" : "从东站到西站",
              "price" : 10,
              "producer" : "东部公交",
              "tags" : [
                "普通",
                "单层"
              ],
              "memo" : "a test"
            },
            "highlight" : {
              "name" : [
                "<em>公交</em><em>车</em>1路"
              ]
            }
          }
        ]
      }
    }

    可以看到两次查询的高亮部分是不一样的:ik_smart 把"公交车"作为一个词整体高亮,ik_max_word 则拆成"公交"和"车"分别高亮。一般情况下,索引时分词用 ik_max_word,查询时分词用 ik_smart。

    
    
  • 相关阅读:
    MySQL学习笔记7——约束
    MySQL学习笔记8——多表查询
    剑指Offer-4.重建二叉树(C++/Java)
    MySQL学习笔记6——备份与恢复
    MySQL学习笔记5——编码
    剑指Offer-3.从尾到头打印链表(C++/Java)
    codeforce Gym 100500I Hall of Fame (水)
    codeforce Gym 100500F Door Lock (二分)
    code Gym 100500D T-shirts(暴力)
    codeforce Gym 100500C ICPC Giveaways(水)
  • 原文地址:https://www.cnblogs.com/asker009/p/10107809.html
Copyright © 2011-2022 走看看