zoukankan html css js c++ java

elasticsearch ik中文分词器的使用详解

（基于es5.4）先瞄几眼github，按照步骤安装好分词器 link:https://github.com/medcl/elasticsearch-analysis-ik

复习一下常用的操作

1.查看集群健康状况
GET /_cat/health?v&pretty

2.查看my_index的mapping和setting的相关信息
GET /my_index?pretty

3.查看所有的index
GET /_cat/indices?v&pretty

4.删除 my_index_new
DELETE /my_index_new?pretty

先测试ik分词器的基本功能

GET _analyze?pretty
{
  "analyzer": "ik_smart",
  "text": "中华人民共和国国歌"
}

结果：

{
  "tokens": [
    {
      "token": "中华人民共和国",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "国歌",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 1
    }
  ]
}

可以看出：ik_smart很智能地将 "中华人民共和国国歌" 进行了正确的分词。

另外一个例子：

GET _analyze?pretty
{
  "analyzer": "ik_smart",
  "text": "王者荣耀是最好玩的游戏"
}

结果：

{
  "tokens": [
    {
      "token": "王者荣耀",
      "start_offset": 0,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "最",
      "start_offset": 5,
      "end_offset": 6,
      "type": "CN_CHAR",
      "position": 1
    },
    {
      "token": "好玩",
      "start_offset": 6,
      "end_offset": 8,
      "type": "CN_WORD",
      "position": 2
    },
    {
      "token": "游戏",
      "start_offset": 9,
      "end_offset": 11,
      "type": "CN_WORD",
      "position": 3
    }
  ]
}

如果你的结果跟我的不一样，那就对了：ik自带的词库会把“王者荣耀”切分开，但是我们不希望它被切分，这时可以根据github上面的说明配置扩展词典

IKAnalyzer.cfg.xml 目录在：elasticsearch-5.4.0/plugins/ik/config

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer 扩展配置</comment>
    <!--用户可以在这里配置自己的扩展字典 -->
    <entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
    <!--用户可以在这里配置自己的扩展停止词字典-->
    <entry key="ext_stopwords">custom/ext_stopword.dic</entry>
    <!--用户可以在这里配置远程扩展字典，下面是配置在nginx路径下面的 -->
    <entry key="remote_ext_dict">http://tagtic-slave01:82/HotWords.php</entry>
    <!--用户可以在这里配置远程扩展停止词字典-->
    <!-- <entry key="remote_ext_stopwords">words_location</entry> -->
    <entry key="remote_ext_stopwords">http://tagtic-slave01:82/StopWords.php</entry>
</properties>

可以看到HotWords.php

<?php
// Remote hot-word dictionary polled by the IK analyzer plugin.
// The response body is the word list, one word per line.
$s = <<<'EOF'
王者荣耀
阴阳师
EOF;
header("Content-type: text/html; charset=utf-8");
// IK reloads the dictionary whenever Last-Modified or ETag differs from the
// previous poll. Using the current time here would force a reload on every
// poll, so report this script's modification time instead: it only changes
// when the word list above is edited.
header('Last-Modified: '.gmdate('D, d M Y H:i:s', filemtime(__FILE__)).' GMT', true, 200);
// Derive the ETag from the word list so it changes exactly when the content does.
header('ETag: "'.md5($s).'"');
echo $s;
?>

配置完了之后（修改IKAnalyzer.cfg.xml后需要重启es才能生效）就可以看到刚才的结果了

顺便测试一下ik_max_word

GET /index/_analyze?pretty
{
  "analyzer": "ik_max_word",
  "text": "中华人民共和国国歌"
}

结果看看就行了

{
  "tokens": [
    {
      "token": "中华人民共和国",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "中华人民",
      "start_offset": 0,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 1
    },
    {
      "token": "中华",
      "start_offset": 0,
      "end_offset": 2,
      "type": "CN_WORD",
      "position": 2
    },
    {
      "token": "华人",
      "start_offset": 1,
      "end_offset": 3,
      "type": "CN_WORD",
      "position": 3
    },
    {
      "token": "人民共和国",
      "start_offset": 2,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 4
    },
    {
      "token": "人民",
      "start_offset": 2,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 5
    },
    {
      "token": "共和国",
      "start_offset": 4,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 6
    },
    {
      "token": "共和",
      "start_offset": 4,
      "end_offset": 6,
      "type": "CN_WORD",
      "position": 7
    },
    {
      "token": "国",
      "start_offset": 6,
      "end_offset": 7,
      "type": "CN_CHAR",
      "position": 8
    },
    {
      "token": "国歌",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 9
    }
  ]
}

再看看github上面的一个例子（如果index这个索引还不存在，需要先执行 PUT /index 创建它，再设置mapping）

POST /index/fulltext/_mapping
{
  "fulltext": {
    "_all": {
      "analyzer": "ik_smart"
    },
    "properties": {
      "content": {
        "type": "text"
      }
    }
  }
}

存一些值

POST /index/fulltext/1
{
  "content": "美国留给伊拉克的是个烂摊子吗"
}

POST /index/fulltext/2
{
  "content": "公安部：各地校车将享最高路权"
}

POST /index/fulltext/3
{
  "content": "中韩渔警冲突调查：韩警平均每天扣1艘中国渔船"
}

POST /index/fulltext/4
{
  "content": "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"
}

取值

POST /index/fulltext/_search
{
  "query": {
    "match": {
      "content": "中国"
    }
  }
}

结果

{
  "took": 3,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 3,
    "max_score": 1.0869478,
    "hits": [
      {
        "_index": "index",
        "_type": "fulltext",
        "_id": "4",
        "_score": 1.0869478,
        "_source": {
          "content": "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"
        }
      },
      {
        "_index": "index",
        "_type": "fulltext",
        "_id": "3",
        "_score": 0.61094594,
        "_source": {
          "content": "中韩渔警冲突调查：韩警平均每天扣1艘中国渔船"
        }
      },
      {
        "_index": "index",
        "_type": "fulltext",
        "_id": "1",
        "_score": 0.27179778,
        "_source": {
          "content": "美国留给伊拉克的是个烂摊子吗"
        }
      }
    ]
  }
}

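es会按照分词进行索引，然后根据你的查询条件按照分数的高低给出结果。注意：上面的mapping只给_all指定了ik_smart，content字段没有指定analyzer，用的是默认的standard分词器，中文会被切成单字，所以只含“国”字的第1条（“美国……”）也被匹配到了；如果希望content按ik分词，需要在mapping里给content加上 "analyzer": "ik_max_word"（或 "ik_smart"）。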

官网有一个例子，可以学习学习：https://github.com/medcl/elasticsearch-analysis-ik

看另一个有趣的例子

PUT /index1
{
  "settings": {
     "refresh_interval": "5s",
     "number_of_shards" :   1, 
     "number_of_replicas" : 0 
  },
  "mappings": {
    "_default_":{
      "_all": { "enabled":  false } 
    },
    "resource": {
      "dynamic": false, 
      "properties": {
        "title": {
          "type": "text",
          "fields": {
            "cn": {
              "type": "text",
              "analyzer": "ik_smart"
            },
            "en": {
              "type": "text",
              "analyzer": "english"
            }
          }
        }
      }
    }
  }
}

fields的作用有二：

1. 同一个字段可以用不同的方式索引，比如一个string类型可以映射成text类型来进行全文检索，同时映射成keyword类型用于排序和聚合;
2. 相当于给字段起了个别名，不同的别名可以使用不同的分词器

批量插入值

POST /_bulk
{ "create": { "_index": "index1", "_type": "resource", "_id": 1 } }
{ "title": "周星驰最新电影" }
{ "create": { "_index": "index1", "_type": "resource", "_id": 2 } }
{ "title": "周星驰最好看的新电影" }
{ "create": { "_index": "index1", "_type": "resource", "_id": 3 } }
{ "title": "周星驰最新电影，最好，新电影" }
{ "create": { "_index": "index1", "_type": "resource", "_id": 4 } }
{ "title": "最最最最好的新新新新电影" }
{ "create": { "_index": "index1", "_type": "resource", "_id": 5 } }
{ "title": "I'm not happy about the foxes" }

取值

POST /index1/resource/_search
{
  "query": {
    "multi_match": {
      "type":     "most_fields", 
      "query":    "fox",
      "fields": "title"
    }
  }
}

结果

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "failed": 0
  },
  "hits": {
    "total": 0,
    "max_score": null,
    "hits": []
  }
}

原因：这里是在title里面查询fox，而title使用的是Standard标准分词器，被索引的词是foxes，所以不会有结果。下面这种情况就会有结果了

POST /index1/resource/_search
{
  "query": {
    "multi_match": {
      "type":     "most_fields", 
      "query":    "fox",
      "fields": "title.en"
    }
  }
}

结果就不列出来了；这次能查到，是因为title.en使用的是english分词器，它会把foxes还原成词干fox

对比一下下面几个查询的输出，体会一下fields的使用

GET /index1/resource/_search
{
  "query": {
    "match": {
      "title.cn": "the最好游戏"
    }
  }
}

POST /index1/resource/_search
{
  "query": {
    "multi_match": {
      "type":     "most_fields", 
      "query":    "the最新游戏",
      "fields": [ "title", "title.cn", "title.en" ]
    }
  }
}

POST /index1/resource/_search
{
  "query": {
    "multi_match": {
      "type":     "most_fields", 
      "query":    "the最新",
      "fields": "title.cn"
    }
  }
}

根据结果体会体会用法

下面使用“王者荣耀”做测试。这里可以看到前面配置的HotWords.php是一把双刃剑：将“王者荣耀”放在里面之后，“王者荣耀”这个词就是一个整体，不会被切分成“王者”和“荣耀”；但是如果就是要搜索“王者”怎么办呢？这里就体现出fields的强大了，具体看下面

先存入数据

POST /_bulk
{ "create": { "_index": "index1", "_type": "resource", "_id": 6 } }
{ "title": "王者荣耀最好玩的游戏" }
{ "create": { "_index": "index1", "_type": "resource", "_id": 7 } }
{ "title": "王者荣耀最好玩的新游戏" }
{ "create": { "_index": "index1", "_type": "resource", "_id": 8 } }
{ "title": "王者荣耀最新游戏，最好玩，新游戏" }
{ "create": { "_index": "index1", "_type": "resource", "_id": 9 } }
{ "title": "最最最最好的新新新新游戏" }
{ "create": { "_index": "index1", "_type": "resource", "_id": 10 } }
{ "title": "I'm not happy about the foxes" }

查询

POST /index1/resource/_search
{
  "query": {
    "multi_match": {
      "type":     "most_fields", 
      "query":    "王者荣耀",
      "fields": "title.cn"
    }
  }
}

#下面会没有结果返回
POST /index1/resource/_search
{
  "query": {
    "multi_match": {
      "type":     "most_fields", 
      "query":    "王者",
      "fields": "title.cn"
    }
  }
}

POST /index1/resource/_search
{
  "query": {
    "multi_match": {
      "type":     "most_fields", 
      "query":    "王者",
      "fields": "title"
    }
  }
}

对比结果就可以一目了然了，结果略！

所以一开始就要对业务需求相当了解，才能设计出好的映射（mapping），搜索的时候也会省事不少

参考：

https://github.com/medcl/elasticsearch-analysis-ik

http://keenwon.com/1404.html

https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-standard-analyzer.html#_example_output

查看全文

相关阅读:
python之数据规范化（Min-Max规范化）
python对全班成绩进行数据清洗（pandas的使用）
python统计全班的成绩（numpy的使用）
python爬虫之动态渲染页面抓取-（Selenium）的使用
 python之小米应用商店搜索
 python之小米应用商店爬虫
 cmds系统数据库源端大表数据更新优化
 临时表空间扩容
 性能优化概要(2)数据库时间,监控和优化工具
 cmds挖掘redolog

原文地址：https://www.cnblogs.com/leixingzhi7/p/6903938.html