zoukankan      html  css  js  c++  java
  • elasticsearch实现中文分词和拼音分词混合查询+CompletionSuggestion

    引言
    之前已经介绍了如何搭建elasticsearch服务端、创建简单的索引,以及如何支持中文分词。今天我们来说一说如何让elasticsearch同时支持中文分词和pinyin分词,并实现类似百度搜索栏的搜索建议功能。

    混合查询
    实现混合查询有很多方式,这里介绍我认为是一个偷懒的方法,就是为你要拼音搜索的字段提供两个额外的字段,一个是全拼字段,一个是首字母缩写字段。我这里用的是官网的Employee的例子:

    /**
     * Sample document class used throughout this article. Besides the regular
     * fields it carries two helper fields derived from firstName (pinyin and
     * header) so that the name can also be searched by its pinyin spelling.
     */
    public class Employee implements Serializable {
    
        private String firstName;
        private String lastName;
        private String pinyin;// full pinyin spelling of firstName
        private String header;// pinyin first-letter abbreviation of firstName
        private int age;
        private String about;
        private List<String> interests;
    
        ....省略getter setter

    接下来为index添加setting和mapping

     // NOTE: the enclosing method signature and the opening `try {` are not part of this excerpt.
     // Index-level settings: declare a custom analyzer "ik_analyzer" that uses the ik_smart tokenizer.
     XContentBuilder settings = XContentFactory.jsonBuilder();

                settings.startObject()
                        .startObject("analysis")
                        .startObject("analyzer")
                        .startObject("ik_analyzer").field("tokenizer","ik_smart")
                        .endObject()
                        .endObject()
                        .endObject().endObject();

                // Create the index with the settings above and wait for the response.
                CreateIndexRequest createIndexRequest = new CreateIndexRequest(index).settings(settings);
                CreateIndexResponse createIndexResponse = esClient.admin().indices().create(createIndexRequest).get();
                logger.info("Index:{} created,response:{}", index, JSON.toJSON(createIndexResponse));

                // Type mapping. Every startObject(name) must be closed by its own endObject()
                // before the next sibling field is opened; otherwise the following fields end up
                // nested inside the previous one and the generated JSON is invalid.
                XContentBuilder builder = XContentFactory.jsonBuilder();
                builder.startObject()
                        .startObject(type)
                        .startObject("properties")
                        // Plain analyzed string field; the completion-only options (search_analyzer,
                        // preserve_separators, preserve_position_increments) are not set here.
                        .startObject("firstName").field("type", "string").field("analyzer","ik_smart")
                        .endObject()
                        .startObject("lastName").field("type", "string").field("analyzer","ik_smart")
                        .endObject()
                        // Full pinyin spelling of firstName, analyzed with the pinyin analyzer.
                        .startObject("pinyin").field("type","string").field("analyzer","pinyin")
                        .endObject()
                        // Pinyin first-letter abbreviation of firstName.
                        .startObject("header").field("type","string").field("analyzer","pinyin")
                        .endObject()
                        .startObject("about").field("type", "string").field("analyzer","ik_smart")
                        .endObject()
                        .startObject("interests").field("type", "string").field("analyzer","ik_smart")
                        .endObject()
                        .endObject()   // properties
                        .endObject()   // type
                        .endObject();  // root

                // Register the mapping for the given type on the freshly created index.
                PutMappingRequest putMappingRequest = new PutMappingRequest(index);
                putMappingRequest.type(type);
                putMappingRequest.source(builder);
                PutMappingResponse putMappingResponse = esClient.admin().indices().putMapping(putMappingRequest).get();
                logger.info("Mapping for `{}.{}` putted, response:{}", index, type, JSON.toJSON(putMappingResponse));

                return true;
            } catch (Exception e) {
                // Any failure while creating the index or putting the mapping is logged and reported as false.
                logger.error("doCreateIndex", e);
                return false;
            }

    添加几个测试用例,我这里直接用了批量插入索引的方法:

        /**
         * Indexes a batch of JSON documents into {@code index}/{@code indexType}
         * using a single bulk request.
         *
         * @param jsonList JSON-serialized documents, one per element; a {@code null}
         *                 or empty list results in no bulk request being sent
         * @return {@code true} when there was nothing to index or every item succeeded,
         *         {@code false} when the bulk response reports at least one failed item
         */
        public Boolean bulkIndex(List<String> jsonList){

            // Consult getMapping only while no entry for this index is cached in esIndexTypes.
            // NOTE(review): getMapping is defined elsewhere; presumably it verifies/creates the mapping - confirm.
            if(esIndexTypes.get(index)==null) {
                if(getMapping(index, indexType)) {
                    esIndexTypes.put(index,true);
                }
            }

            // A bulk request without any action fails request validation, so skip the round trip.
            if (jsonList == null || jsonList.isEmpty()) {
                return true;
            }

            BulkRequestBuilder bulkBuilder= esClient.prepareBulk();
            for (String s : jsonList) {
                IndexRequestBuilder requestBuilder = esClient.prepareIndex(index, indexType)
                        .setSource(s);
                bulkBuilder.add(requestBuilder);
            }

            BulkResponse bulkResponse = bulkBuilder.execute().actionGet();
            logger.info("index:{} bulk request,:response:{}",index,JSON.toJSON(bulkResponse));

            // A bulk call can succeed as a whole while individual items fail; surface that to the caller.
            if (bulkResponse.hasFailures()) {
                logger.error("index:{} bulk request has failures:{}", index, bulkResponse.buildFailureMessage());
                return false;
            }
            return true;
        }
    
        @org.junit.Test
        public void test(){
            List<String> list1 = new ArrayList<>(10000);
            for (int i=0;i<10000;i++) {
                Employee employee = new Employee();
                employee.setFirstName("告白气球"+i);
                employee.setPinyin("gaobaiqiqiu"+i);
                employee.setHeader("gbqq");
                employee.setLastName("周杰伦,日记");
                employee.setAbout("呜啦啦啦火车笛
    " +
                        "
    " +
                        "随着奔腾的马蹄
    " +
                        "
    " +
                        "小妹妹吹着口琴
    " +
                        "
    " +
                        "夕阳下美了剪影
    " +
                        "
    " +
                        "我用子弹写日记,我泡妞看电影");
                employee.setAge(18);
                List<String> list = new ArrayList<String>();
                list.add("喜欢打篮球");
                list.add("在大晴天晒太阳");
                list.add("泡妞看电影");
                employee.setInterests(list);
                list1.add(JSON.toJSONString(employee));
            }
    
            boolean index = esProxy.bulkIndex(list1);
    
    
        }

    最后直接搜 gaobaiqiqiu 或 gbqq,搜出来的数据像这样:

    [{"firstName":"告白气球","lastName":"周杰伦,日记","pinyin":"gaobaiqiqiu","about":"呜啦啦啦火车笛
    
    随着奔腾的马蹄
    
    小妹妹吹着口琴
    
    夕阳下美了剪影
    
    我用子弹写日记,我泡妞看电影","header":"gbqq","interests":["喜欢打篮球","在大晴天晒太阳","泡妞看电影"],"age":18}]

    如果直接搜告白搜出来的数据像这样:

    [{"firstName":"<span style="color:red">告白</span>气球","lastName":"周杰伦,日记","pinyin":"gaobaiqiqiu","about":"呜啦啦啦火车笛
    
    随着奔腾的马蹄
    
    小妹妹吹着口琴
    
    夕阳下美了剪影
    
    我用子弹写日记,我泡妞看电影","header":"gbqq","interests":["喜欢打篮球","在大晴天晒太阳","泡妞看电影"],"age":18}]

    CompletionSuggestion查询建议

    使用CompletionSuggestion时mapping需要改一下,实时推荐的字段type需要使用completion。

     // Mapping variant for the completion suggester: firstName becomes a "completion" field.
     // Every startObject(name) is closed by its own endObject() before the next sibling field
     // is opened, so all fields are direct children of "properties".
     XContentBuilder builder = XContentFactory.jsonBuilder();
                builder.startObject()
                        .startObject(type)
                        .startObject("properties")
                        .startObject("firstName").field("type", "completion").field("analyzer","ik_smart")
                        .field("search_analyzer","ik_smart").field("preserve_separators",false)
                        .field("preserve_position_increments",false)
                        .endObject()
                        .startObject("lastName").field("type", "string").field("analyzer","ik_smart")
                        .endObject()
                        // Full pinyin spelling of firstName.
                        .startObject("pinyin").field("type","string").field("analyzer","pinyin")
                        .endObject()
                        // Pinyin first-letter abbreviation of firstName.
                        .startObject("header").field("type","string").field("analyzer","pinyin")
                        .endObject()
                        .startObject("about").field("type", "string").field("analyzer","ik_smart")
                        .endObject()
                        .startObject("interests").field("type", "string").field("analyzer","ik_smart")
                        .endObject()
                        .endObject()   // properties
                        .endObject()   // type
                        .endObject();  // root

    查询的时候需要使用CompletionSuggestionBuilder.

    /**
     * Runs a completion suggestion named "my-suggest-1" on the firstName field
     * for the given input and prints every returned option as "score--text"
     * to standard output.
     *
     * @param str the text typed so far, passed to the suggester as its text
     */
    public void searchSuggest(String str){

            CompletionSuggestionBuilder completion = new CompletionSuggestionBuilder("firstName");
            completion.analyzer("ik_smart");
            completion.text(str);

            SuggestBuilder suggestRequest = new SuggestBuilder().addSuggestion("my-suggest-1", completion);
            SearchResponse response = esClient.prepareSearch(index)
                    .setTypes(indexType)
                    .setQuery(QueryBuilders.matchAllQuery())
                    .suggest(suggestRequest)
                    .get();

            // Look the suggestion up under the same name it was registered with.
            Suggest suggest = response.getSuggest();
            CompletionSuggestion suggestion = suggest.getSuggestion("my-suggest-1");

            for (CompletionSuggestion.Entry entry : suggestion.getEntries()) {
                for (CompletionSuggestion.Entry.Option option : entry.getOptions()) {
                    if (option == null) {
                        continue;
                    }
                    System.out.println(option.getScore()+"--"+option.getText());
                }
            }
        }

    你也可以使用 REST API:http://192.168.10.xxx:9200/megacorp/_search?pretty (这里 megacorp 是 indexName),请求体如下:

    { "size": 0,
      "suggest": {
        "my-suggest-1": {
          "prefix": "someone li",
          "completion": {
            "field": "firstName"
          }
        }
      }
    }

    查询出来的结果(suggest 下的 key 与请求中的建议名 my-suggest-1 对应):

    {
        "took": 12,
        "timed_out": false,
        "_shards": {
            "total": 5,
            "successful": 5,
            "failed": 0
        },
        "hits": {
            "total": 0,
            "max_score": 0,
            "hits": []
        },
        "suggest": {
            "my-suggest-1": [
                {
                    "text": "someone li",
                    "offset": 0,
                    "length": 10,
                    "options": [
                        {
                            "text": "someone like you",
                            "_index": "megacorp",
                            "_type": "employee",
                            "_id": "AV_doqcXKY206Vs3lcCO",
                            "_score": 1,
                            "_source": {
                                "about": "呜啦啦啦火车笛
    
    随着奔腾的马蹄
    
    小妹妹吹着口琴
    
    夕阳下美了剪影
    
    我用子弹写日记,我泡妞看电影",
                                "age": 18,
                                "firstName": "someone like you",
                                "interests": [ "喜欢打篮球", "在大晴天晒太阳", "泡妞看电影" ],
                                "lastName": "周杰伦,日记" }
                        }
                    ]
                }
            ]
        }
    }


    ————————————————
    版权声明:本文为CSDN博主「lance的java小菜」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
    原文链接:https://blog.csdn.net/nethackatschool/article/details/78594843

  • 相关阅读:
    使用接口测试活动的中奖概率(随机事件测试)
    关于测试用例冗余的一些思考
    正则表达式经典实例
    自动化测试断言Assent的使用
    equals与==区别
    Git的使用以及GitHub
    django的配置文件字符串是怎么导入的?
    支付宝支付相关开发
    Django的contenttypes
    推荐课程及用户登录
  • 原文地址:https://www.cnblogs.com/it-deepinmind/p/14537710.html
Copyright © 2011-2022 走看看