zoukankan      html  css  js  c++  java
  • ik中文分词器及拼音分词器试用

    安装

    ./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v5.6.4/elasticsearch-analysis-ik-5.6.4.zip
    ./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-pinyin/releases/download/v5.6.4/elasticsearch-analysis-pinyin-5.6.4.zip

    安装后需要重启elasticsearch服务

    查看当前已安装插件

    GET _cat/plugins
    
    结果
    node01 analysis-ik     5.6.4
    node01 analysis-pinyin 5.6.4

    测试中文分词器,支持ik_max_word和ik_smart两种方式

    GET _analyze
    {
      "analyzer":"ik_max_word",
      "text":"中华人民共和国国歌"
    }
    结果
    {
      "tokens": [
        {
          "token": "中华人民共和国",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 0
        },
        {
          "token": "中华人民",
          "start_offset": 0,
          "end_offset": 4,
          "type": "CN_WORD",
          "position": 1
        },
        {
          "token": "中华",
          "start_offset": 0,
          "end_offset": 2,
          "type": "CN_WORD",
          "position": 2
        },
        {
          "token": "华人",
          "start_offset": 1,
          "end_offset": 3,
          "type": "CN_WORD",
          "position": 3
        },
        {
          "token": "人民共和国",
          "start_offset": 2,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 4
        },
        {
          "token": "人民",
          "start_offset": 2,
          "end_offset": 4,
          "type": "CN_WORD",
          "position": 5
        },
        {
          "token": "共和国",
          "start_offset": 4,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 6
        },
        {
          "token": "共和",
          "start_offset": 4,
          "end_offset": 6,
          "type": "CN_WORD",
          "position": 7
        },
        {
          "token": "国",
          "start_offset": 6,
          "end_offset": 7,
          "type": "CN_CHAR",
          "position": 8
        },
        {
          "token": "国歌",
          "start_offset": 7,
          "end_offset": 9,
          "type": "CN_WORD",
          "position": 9
        }
      ]
    }
    使用ik_smart,则会尽可能少地返回词语:
    {
      "tokens": [
        {
          "token": "中华人民共和国",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 0
        },
        {
          "token": "国歌",
          "start_offset": 7,
          "end_offset": 9,
          "type": "CN_WORD",
          "position": 1
        }
      ]
    }

    ik分词器支持自定义词库

    vi config/IKAnalyzer.cfg.xml

    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
    <properties>
        <comment>IK Analyzer 扩展配置</comment>
        <!--用户可以在这里配置自己的扩展字典 -->
        <entry key="ext_dict">zhouls.dic</entry>
        <!--用户可以在这里配置自己的扩展停止词字典-->
        <entry key="ext_stopwords"></entry>
        <!--用户可以在这里配置远程扩展字典 -->
        <!-- <entry key="remote_ext_dict">words_location</entry> -->
        <!--用户可以在这里配置远程扩展停止词字典-->
        <!-- <entry key="remote_ext_stopwords">words_location</entry> -->
    </properties>

    #配置完成需要重启服务

    简单测试拼音分词

    PUT test08
    {
      "index": {
        "analysis": {
          "analyzer": {
            "pinyin_analyzer": {
              "tokenizer": "my_pinyin",
              "filter": "word_delimiter"
            }
          },
          "tokenizer": {
            "my_pinyin": {
              "type": "pinyin",
              "first_letter": "none",
              "padding_char": " "
            }
          }
        }
      }
    }
    
    GET test08/_analyze
    {
      "text":"刘德华",
      "analyzer":"pinyin_analyzer"
    }
    结果
    {
      "tokens": [
        {
          "token": "liu",
          "start_offset": 0,
          "end_offset": 1,
          "type": "word",
          "position": 0
        },
        {
          "token": "ldh",
          "start_offset": 0,
          "end_offset": 3,
          "type": "word",
          "position": 0
        },
        {
          "token": "de",
          "start_offset": 1,
          "end_offset": 2,
          "type": "word",
          "position": 1
        },
        {
          "token": "hua",
          "start_offset": 2,
          "end_offset": 3,
          "type": "word",
          "position": 2
        }
      ]
    }

    同时支持中文和拼音的分词器

    PUT test06
    {
      "settings":{
        "number_of_shards":"1",
        "index.refresh_interval":"15s",
        "index":{
          "analysis":{
            "analyzer":{
               "ik_pinyin_analyzer":{
                "type":"custom",
                "tokenizer":"ik_smart",
                "filter":"pinyin_filter"
              }
            },
            "filter":{
              "pinyin_filter":{
                "type":"pinyin",
                "keep_first_letter": false
              }
            }
          }
        }
      },
      "mappings": {
        "doc":{
          "properties": {
            "name":{
              "type": "text",
              "analyzer": "ik_pinyin_analyzer"
            }
          }
        }
      }
    }
    
    POST test06/_analyze
    {
      "analyzer": "ik_pinyin_analyzer",
      "text":"中华人民共和国国歌"
    }
    结果
    {
      "tokens": [
        {
          "token": "zhong",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 0
        },
        {
          "token": "hua",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 1
        },
        {
          "token": "ren",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 2
        },
        {
          "token": "min",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 3
        },
        {
          "token": "gong",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 4
        },
        {
          "token": "he",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 5
        },
        {
          "token": "guo",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 6
        },
        {
          "token": "guo",
          "start_offset": 7,
          "end_offset": 9,
          "type": "CN_WORD",
          "position": 7
        },
        {
          "token": "ge",
          "start_offset": 7,
          "end_offset": 9,
          "type": "CN_WORD",
          "position": 8
        }
      ]
    }

    参考文档:

    https://blog.csdn.net/u013905744/article/details/80935846

    https://www.cnblogs.com/xing901022/p/5910139.html

    https://blog.csdn.net/qq_28018283/article/details/80396937

  • 相关阅读:
    jquery UI dialog详解
    jQuery学习笔记
    jQuery源码分析系列:AJAX
    Jquery UI dialog 详解 (中文)
    敏捷个人不错
    总监几句话胜读几年书
    Quartz.NET 任务调度框架
    半解释器模式,解析文本+*/ ()
    向toolStrip控件中添加 日期 控件
    汇编变量类型
  • 原文地址:https://www.cnblogs.com/libin2015/p/10497647.html
Copyright © 2011-2022 走看看