zoukankan      html  css  js  c++  java
  • ElasticSearch自定义分析器-集成结巴分词插件

    关于结巴分词 ElasticSearch 插件:

    https://github.com/huaban/elasticsearch-analysis-jieba

    该插件由huaban开发。支持Elastic Search 版本<=2.3.5。

    结巴分词分析器

    结巴分词插件提供3个分析器:jieba_index、jieba_search和jieba_other。

    1. jieba_index: 用于索引分词,分词粒度较细;
    2. jieba_search: 用于查询分词,分词粒度较粗;
    3. jieba_other: 全角转半角、大写转小写、字符分词;

    使用jieba_index或jieba_search分析器,可以实现基本的分词效果。

    以下是最小配置示例:

    {
        "mappings": {
            "test": {
                "_all": {
                    "enabled": false
                },
                "properties": {
                    "name": {
                        "type": "string",
                        "analyzer": "jieba_index",
                        "search_analyzer": "jieba_index"
                    }
                }
            }
        }
    }

    在生产环境中,因为业务的需要,需要考虑实现以下功能:

    1. 支持同义词;
    2. 支持字符过滤器;

    结巴插件提供的分析器jieba_index、jieba_search无法实现以上功能。

    自定义分析器

    当jieba_index、jieba_search分析器不满足生产环境的需求时,我们可以使用自定义分析器来解决以上问题。

    分析器是由字符过滤器,分词器,词元过滤器组成的。

    一个分析器允许包含多个字符过滤器+一个分词器+多个词元过滤器。

    因业务的需求,我们需要使用映射字符过滤器来实现分词前某些字符串的替换操作。如将用户输入的c#替换为csharp,c++替换为cplus。

    下面逐一介绍分析器各个组成部分。

    1. 映射字符过滤器Mapping Char Filter

    这个是Elastic Search内置的映射字符过滤器,位于settings -> analysis -> char_filter下:

    PUT /my_index
    {
        "settings": {
            "analysis": {
                "char_filter": {
                    "mapping_filter": {
                        "type": "mapping",
                        "mappings": [
                          "c# => csharp",
                          "c++ => cplus"
                      ]
                    }
                }
            }
        }
    }

    也可以通过文件载入字符映射表。

    PUT /my_index
    {
        "settings": {
            "analysis": {
                "char_filter": {
                    "mapping_filter": {
                        "type": "mapping",
                        "mappings_path": "mappings.txt"
                    }
                }
            }
        }
    }

    文件默认存放config目录下,即config/mappings.txt。

    2. 结巴分词词元过滤器JiebaTokenFilter

    JiebaTokenFilter接受一个SegMode参数,该参数有两个可选值:Index和Search。

    我们预先定义两个词元过滤器:jieba_index_filter和jieba_search_filter。

    PUT /my_index
    {
        "settings": {
            "analysis": {
                "filter": {
                    "jieba_index_filter": {
                        "type": "jieba",
                        "seg_mode": "index"
                    },
                    "jieba_search_filter": {
                        "type": "jieba",
                        "seg_mode": "search"
                    }
                }
            }
        }
    }

     这两个词元过滤器将分别用于索引分析器和查询分析器。

    3. stop 停用词词元过滤器

    因分词词元过滤器JiebaTokenFilter并不处理停用词,因此我们在自定义分析器时,需要定义停用词词元过滤器来处理停用词。

    Elastic Search提供了停用词词元过滤器,我们可以这样来定义:

    PUT /my_index
    {
        "settings": {
            "analysis": {
                "filter": {
                    "stop_filter": {
                        "type":       "stop",
                        "stopwords": ["and", "is", "the"]
                    }
                }
            }
        }
    }

    也可以通过文件载入停用词列表。

    PUT /my_index
    {
        "settings": {
            "analysis": {
                "filter": {
                    "stop_filter": {
                        "type": "stop",
                        "stopwords_path": "stopwords.txt"
                    }
                }
            }
        }
    }

    文件默认存放config目录下,即config/stopwords.txt。

    4. synonym 同义词词元过滤器

    我们使用ElasticSearch内置同义词词元过滤器来实现同义词的功能。

    PUT /my_index
    {
        "settings": {
            "analysis": {
                "filter": {
                    "synonym_filter": {
                        "type": "synonym",
                        "synonyms": [
                          "中文,汉语,汉字"
                      ]
                    }
                }
            }
        }
    }

    如果同义词量比较大时,推荐使用文件的方式载入同义词库。

    PUT /my_index
    {
        "settings": {
            "analysis": {
                "filter": {
                    "synonym_filter": {
                        "type": "synonym",
                        "synonyms_path": "synonyms.txt"
                    }
                }
            }
        }
    }

    5. 重新定义分析器jieba_index和jieba_search

    Elastic Search支持多级分词,我们使用whitespace分词作为分词器;并在词元过滤器加入定义好的Jieba分词词元过滤器:jieba_index_filter和jieba_search_filter。

    PUT /my_index
    {
        "settings": {
            "analysis": {
                "analyzer": {
                    "jieba_index": {
                        "char_filter": [
                          "mapping_filter"
                      ],
                        "tokenizer": "whitespace",
                        "filter": [
                          "jieba_index_filter",
                          "stop_filter",
                          "synonym_filter"
                      ]
                    },
                    "jieba_search": {
                        "char_filter": [
                          "mapping_filter"
                      ],
                        "tokenizer": "whitespace",
                        "filter": [
                          "jieba_search_filter",
                          "stop_filter",
                          "synonym_filter"
                      ]
                    }
                }
            }
        }
    }

    注意,上面分析器的命名依然使用jieba_index和jieba_search,以便覆盖结巴分词插件提供的分析器。

    当存在多个同名的分析器时,Elastic Search会优先使用索引配置中定义的分析器。

    这样在代码调用层面便无需再更改。 

    下面是完整的配置:

    PUT /my_index
    {
        "settings": {
            "analysis": {
                "char_filter": {
                    "mapping_filter": {
                        "type": "mapping",
                      "mappings_path": "mappings.txt"
                    }
                },
                "filter": {
                    "synonym_filter": {
                        "type": "synonym",
                        "synonyms_path": "synonyms.txt"
                    },
                    "stop_filter": {
                        "type": "stop",
                        "stopwords_path": "stopwords.txt"
                    },
                    "jieba_index_filter": {
                        "type": "jieba",
                        "seg_mode": "index"
                    },
                    "jieba_search_filter": {
                        "type": "jieba",
                        "seg_mode": "search"
                    }
                },
                "analyzer": {
                    "jieba_index": {
                        "char_filter": [
                          "mapping_filter"
                      ],
                        "tokenizer": "whitespace",
                        "filter": [
                          "jieba_index_filter",
                          "stop_filter",
                          "synonym_filter"
                      ]
                    },
                    "jieba_search": {
                        "char_filter": [
                          "mapping_filter"
                      ],
                        "tokenizer": "whitespace",
                        "filter": [
                          "jieba_search_filter",
                          "stop_filter",
                          "synonym_filter"
                      ]
                    }
                }
            }
        }
    }

     参考资料:

    https://www.elastic.co/guide/en/elasticsearch/reference/2.3/index.html

    http://www.tuicool.com/articles/eUJJ3qF

  • 相关阅读:
    快速排序就这么简单
    Shiro入门这篇就够了【Shiro的基础知识、回顾URL拦截】
    SpringDataJPA入门就这么简单
    递归就这么简单
    SpringBoot就是这么简单
    Activiti就是这么简单
    Lucene就是这么简单
    过来人告诉你,去工作前最好还是学学Git
    给女朋友讲解什么是Git
    我终于看懂了HBase,太不容易了...
  • 原文地址:https://www.cnblogs.com/dengzhizhong/p/6373333.html
Copyright © 2011-2022 走看看