zoukankan      html  css  js  c++  java
  • Elasticsearch高级搜索排序( 中文+拼音+首字母+简繁转换+特殊符号过滤)

    一、先摆需求:

    1、中文搜索、英文搜索、中英混搜   如:“南京东路”,“cafe 南京东路店”

    2、全拼搜索、首字母搜索、中文+全拼、中文+首字母混搜   如:“nanjingdonglu”,“njdl”,“南京donglu”,“南京dl”,“nang南东路”,“njd路”等等组合

    3、简繁搜索、特殊符号过滤搜索   如:“龍馬”可通过“龙马”搜索,再比如 L.G.F可以通过lgf搜索,café可以通过cafe搜索

    4、排序优先级为: 以关键字开头>包含关键字

    二、生产效果图:

    三、实现

    1、索引设计

    使用multi_field为搜索字段建立不同类型的索引,有全拼索引、首字母简写索引、Ngram索引以及IK索引,从各个角度分别击破,然后通过char-filter进行特殊符号与简繁转换。

    curl -XPUT localhost:9200/search_words_index -d '{
       "settings" : {
          "refresh_interval" : "5s",
          "number_of_shards" : 1,
          "number_of_replicas" : 1,
          "analysis" : {
                 "filter": {
                    "edge_ngram_filter": { 
                        "type":     "edge_ngram",
                        "min_gram": 1,
                        "max_gram": 50                    
                    },                
                    "pinyin_simple_filter":{
                        "type" : "pinyin",
                        "keep_first_letter":true,
                        "keep_separate_first_letter" : false,
                        "keep_full_pinyin" : false,
                        "keep_original" : false,
                        "limit_first_letter_length" : 50,
                        "lowercase" : true
                    },
                    "pinyin_full_filter":{
                        "type" : "pinyin",
                        "keep_first_letter":false,
                        "keep_separate_first_letter" : false,
                        "keep_full_pinyin" : true,                        
                        "none_chinese_pinyin_tokenize":true,
                        "keep_original" : false,
                        "limit_first_letter_length" : 50,
                        "lowercase" : true
                    },
                    "t2s_convert":{
                          "type": "stconvert",
                          "delimiter": ",",
                          "convert_type": "t2s"
                   }
                },
                "char_filter" : {
                    "charconvert" : {
                        "type" : "mapping",
                        "mappings_path":"char_filter_text.txt"
                    }
                },    
                "tokenizer":{
                    "ik_smart":{
                       "type":"ik",
                       "use_smart":true                
                    }
                },
                "analyzer": {
                    "ngramIndexAnalyzer": {
                        "type": "custom",
                        "tokenizer": "keyword",
                        "filter": ["edge_ngram_filter","lowercase"],
                        "char_filter" : ["charconvert"]
                    },
                    "ngramSearchAnalyzer": {
                        "type": "custom",
                        "tokenizer": "keyword",   
                        "filter":["lowercase"],
                        "char_filter" : ["charconvert"]
                    },    
                    "ikIndexAnalyzer": {
                        "type": "custom",
                        "tokenizer": "ik",                   
                        "char_filter" : ["charconvert"]
                    },
                    "ikSearchAnalyzer": {
                        "type": "custom",
                        "tokenizer": "ik",                       
                        "char_filter" : ["charconvert"]
                    },                    
                    "pinyiSimpleIndexAnalyzer":{                   
                        "tokenizer" : "keyword",
                        "filter": ["pinyin_simple_filter","edge_ngram_filter","lowercase"]                                    
                    },                
                    "pinyiSimpleSearchAnalyzer":{
                        "tokenizer" : "keyword",     
                        "filter": ["pinyin_simple_filter","lowercase"]    
                    },
                    "pinyiFullIndexAnalyzer":{                   
                        "tokenizer" : "keyword",
                        "filter": ["pinyin_full_filter","lowercase"]                                    
                    },                
                    "pinyiFullSearchAnalyzer":{
                        "tokenizer" : "keyword",     
                        "filter": ["pinyin_full_filter","lowercase"]    
                    }
                }
           }
        },
        "mappings": {       
            "search_words_type": {        
                "properties": {   
                    "words": {
                        "type": "multi_field",                    
                        "fields":{
                              "words": {
                                    "type": "string",
                                    "index": "analyzed",
                                    "indexAnalyzer" : "ngramIndexAnalyzer"
                              },
                              "SPY": {
                                    "type": "string",
                                    "index": "analyzed",
                                    "indexAnalyzer" : "pinyiSimpleIndexAnalyzer"
                              },
                              "FPY": {
                                    "type": "string",
                                    "index": "analyzed",
                                    "indexAnalyzer" : "pinyiFullIndexAnalyzer"
                              },
                              "IKS": {
                                    "type": "string",
                                    "index": "analyzed",
                                    "indexAnalyzer" : "ikIndexAnalyzer"
                              }
                        }
                    }
                }
            }
        }
    }'

    拼音插件的使用请参考:https://github.com/medcl/elasticsearch-analysis-pinyin

    2、搜索构建

    以下是搜索实现代码(非完整代码,只摘录核心部分,主要是思路):

      /**
       * Pure-Chinese keyword search.
       * <p>
       * Combines two match queries on the keyword in a dis_max query:
       * <ul>
       *   <li>a match on {@code words} analyzed with {@code ngramSearchAnalyzer},
       *       boosted by 5 (per the original author: "starts with the keyword",
       *       the highest priority);</li>
       *   <li>a match on {@code words.IKS} analyzed with {@code ikSearchAnalyzer}
       *       with minimum_should_match set to 100% (per the original author:
       *       "fully contains the analyzed keyword").</li>
       * </ul>
       *
       * @param key    keyword to search for
       * @param cityId forwarded unchanged to {@code builderQuery}
       * @return the rows returned by {@code elasticsearchTemplate.queryForList}
       * @throws Exception declared by the signature; nothing is caught here
       */
        public List<Map> chineseSearch(String key,Integer cityId) throws Exception{
            // Highest priority: the keyword matched against the "words" field.
            MatchQueryBuilder prefixClause=QueryBuilders.matchQuery("words",key)
                    .analyzer("ngramSearchAnalyzer")
                    .boost(5);

            // Lower priority: every term the IK analyzer produces must match.
            QueryBuilder containsClause=QueryBuilders.matchQuery("words.IKS", key)
                    .analyzer("ikSearchAnalyzer")
                    .minimumShouldMatch("100%");

            DisMaxQueryBuilder combined=QueryBuilders.disMaxQuery()
                    .add(prefixClause)
                    .add(containsClause);

            SearchQuery request=builderQuery(cityId,combined);
            return elasticsearchTemplate.queryForList(request,Map.class);
        }
    
    
     /**
      * Mixed search: Chinese combined with English and/or pinyin input.
      * <p>
      * Up to five clauses are combined in a dis_max query, in this order:
      * <ol>
      *   <li>the raw keyword matched on {@code words} with
      *       {@code ngramSearchAnalyzer}, boost 5;</li>
      *   <li>a term query on {@code words.SPY} for the value returned by
      *       {@code commonSearchService.anaysisKeyAndGetMaxWords} (per the
      *       original author this is the pinyin-initials form of the keyword,
      *       e.g. 南京东路 / 南京dl / njdl all become njdl);</li>
      *   <li>the keyword matched on {@code words.IKS} with
      *       {@code ikSearchAnalyzer}, minimum_should_match 100%;</li>
      *   <li>only when the keyword is longer than one character: a phrase match
      *       on {@code words.FPY} with {@code pinyiFullSearchAnalyzer};</li>
      *   <li>only when the initials value is longer than one character: a
      *       wildcard query {@code *initials*} on {@code words.SPY}, boost 0.8.</li>
      * </ol>
      * When {@code commonSearchService.getStartChineseString} returns a non-blank
      * value (per the original author, the leading Chinese part of the keyword),
      * the dis_max query is wrapped in a filtered query requiring
      * {@code *prefix*} on {@code words}, and then in a function_score query that
      * applies weight 1.5 to documents matching the prefix on {@code words}.
      *
      * @param key    keyword to search for
      * @param cityId forwarded unchanged to {@code builderQuery}
      * @return the rows returned by {@code elasticsearchTemplate.queryForList}
      * @throws Exception declared by the signature; nothing is caught here
      */
        public List<Map> chineseWithEnglishOrPinyinSearch(String key,Integer cityId) throws Exception{
            // Leading Chinese part of the keyword, if any (semantics per the original comment).
            String chinesePrefix=commonSearchService.getStartChineseString(key);

            // Clause 1: the keyword as typed, no pinyin conversion.
            QueryBuilder rawClause=QueryBuilders.matchQuery("words",key)
                    .analyzer("ngramSearchAnalyzer")
                    .boost(5f);

            // Clause 2: exact term match on the analyzed (pinyin-initials) form of the keyword.
            String initials=commonSearchService.anaysisKeyAndGetMaxWords(
                    SearchIndex.INDEX_NAME_SEARCHWORDSSTATISTICS,key,"pinyiSimpleSearchAnalyzer");
            QueryBuilder initialsClause=QueryBuilders.termQuery("words.SPY", initials);

            // The two optional clauses affect performance (original author's note),
            // so they are skipped for single-character input.
            boolean useInitialsContains=initials.length()>1;
            boolean useFullPinyin=key.length()>1;

            // Clause 3: all IK-analyzed terms of the keyword must match.
            QueryBuilder containsClause=QueryBuilders.matchQuery("words.IKS", key)
                    .analyzer("ikSearchAnalyzer")
                    .minimumShouldMatch("100%");

            DisMaxQueryBuilder candidates=QueryBuilders.disMaxQuery()
                    .add(rawClause)
                    .add(initialsClause)
                    .add(containsClause);

            if(useFullPinyin){
                // Clause 4: full-pinyin phrase match, e.g. 南京donglu => [nan,jing,dong,lu]
                // (example taken from the original comment).
                candidates.add(QueryBuilders.matchPhraseQuery("words.FPY", key)
                        .analyzer("pinyiFullSearchAnalyzer"));
            }
            if(useInitialsContains){
                // Clause 5: initials contained anywhere, so njdl can also find
                // "城市公牛 南京东路店" (example taken from the original comment).
                candidates.add(QueryBuilders.wildcardQuery("words.SPY", "*"+initials+"*")
                        .boost(0.8f));
            }

            QueryBuilder finalQuery=candidates;

            // If the keyword contains Chinese, that Chinese text must appear in the content.
            if(StringUtils.isNotBlank(chinesePrefix)){
                QueryBuilder mustContainPrefix=QueryBuilders.queryStringQuery("*"+chinesePrefix+"*")
                        .field("words")
                        .analyzer("ngramSearchAnalyzer");
                QueryBuilder restricted=QueryBuilders.filteredQuery(candidates,
                        FilterBuilders.queryFilter(mustContainPrefix));

                // Documents matching the Chinese prefix on "words" get weight 1.5.
                QueryBuilder prefixMatch=QueryBuilders.matchQuery("words",chinesePrefix)
                        .analyzer("ngramSearchAnalyzer");
                finalQuery=QueryBuilders.functionScoreQuery(restricted)
                        .add(FilterBuilders.queryFilter(prefixMatch), ScoreFunctionBuilders.weightFactorFunction(1.5f));
            }

            SearchQuery request=builderQuery(cityId,finalQuery);
            return elasticsearchTemplate.queryForList(request,Map.class);
        }

    注:以上JAVA示例代码皆以spring-data-elasticsearch框架为基础。

    拼音插件安装:

    1、下载拼音插件,官网地址:https://github.com/medcl/elasticsearch-analysis-pinyin  我下载的版本是:elasticsearch-analysis-pinyin-1.3.3。

    把下载的 elasticsearch-analysis-pinyin-1.3.3.jar与nlp-lang-1.7.jar放于plugins目录下。

    2、修改elasticsearch配置文件,在最后一行之下加入(里面包括IK配置,如果未安装IK可省略IK的配置):

    index:
      analysis:
        analyzer:
          ik:
            alias: [news_analyzer_ik,ik_analyzer]
            type: org.elasticsearch.index.analysis.IkAnalyzerProvider
          ik_max_word:
            type: ik
            use_smart: false
          ik_smart:
            type: ik
            use_smart: true
          pinyin:
            tokenizer: pinyin_tokenizer
            filter: [standard,nGram]
        tokenizer:
          pinyin_tokenizer:
            type: pinyin
            first_letter: "prefix"
            padding_char: ""

    3、定制特殊符号及简繁转换文本:char_filter_text.txt,由于文件有点长,以下是部分内容,参考格式即可。

    à=>a
    á=>a
    â=>a
    ä=>a
    À=>a
    Â=>a
    Ä=>a
    è=>e
    é=>e
    ê=>e
    ë=>e
    È=>e
    É=>e
    Ê=>e
    Ë=>e
    î=>i
    ï=>i
    Î=>i
    Ï=>i
    ô=>o
    ö=>o
    Ô=>o
    Ö=>o
    ù=>u
    û=>u
    ü=>u
    Ù=>u
    Û=>u
    Ü=>u
    ç=>c
    œ=>oe
    &=>
    ^=>
    .=>
    ·=>
    -=>
    '=>
    ’=>
    ‘=>
    /=>
    醯壶=>醯壶
    屢顧爾僕=>屡顾尔仆
    見=>见
    往裡=>往里
    置言成範=>置言成范
    捲動=>卷动
    規=>规
    齣電視=>出电视
    覎=>觃
    後堂=>后堂

    4、重启elasticsearch,重建索引,看是否生效。

  • 相关阅读:
    linux系统调用之系统控制
    linux系统调用之文件系统操作
    使用EF框架实现MVC的增删改查功能
    MVC+EF快速弄出一个CRUD
    Entity Framework 全面教程详解(转)
    微信小程序学习
    为Bootstrap模态对话框添加拖拽移动功能
    Razor语法大全
    EXCEL怎么打20位以上的数字?
    C# SQLite 数据库操作学习
  • 原文地址:https://www.cnblogs.com/clonen/p/6674888.html
Copyright © 2011-2022 走看看