zoukankan      html  css  js  c++  java
  • elasticsearch支持大table格式数据的搜索

    一、问题源起

    数据情况

    TableMeta, 保存table的元数据,通过fileId关联具体的GridFS文件;

    id name creator fileId
    1 table1 mango f1
    2 table2 mango f2

    table内包含列名和具体的行数据;
    不同类型的table,列的名字和数量都可能不同;

    from fport to toport location
    192.168.1.1 11 192.168.1.12 11 chaoyang
    192.168.1.2 22 192.168.1.13 22 tongzhou
    

    搜索要求

    支持所有类型的table的搜索;

    支持全字段的搜索;

    只返回表内命中的行,并进行高亮;

    二、开发环境

    elasticsearch 6.8.12
    
    java 12.0.2 2019-07-16
    
    Java(TM) SE Runtime Environment (build 12.0.2+10)
    
    Java HotSpot(TM) 64-Bit Server VM (build 12.0.2+10, mixed mode, sharing)
    

    三、elastic search对array的支持情况

    扁平化数组元素

    默认情况下elastic search会将数组内部对象的字段进行扁平化处理,这样就会丢失掉元素的独立性。

    直接index一个文档

    PUT my_array_index/_doc/1
    {
      "group" : "fans",
      "user" : [
        {
          "first" : "John",
          "last" :  "Smith"
        },
        {
          "first" : "Alice",
          "last" :  "White"
        }
      ]
    }
     
    {
        "_index":"my_array_index",
        "_type":"_doc",
        "_id":"1",
        "_version":1,
        "result":"created",
        "_shards":{
            "total":2,
            "successful":1,
            "failed":0
        },
        "_seq_no":0,
        "_primary_term":1
    }
    

    elastic search 内部会将文档转化为如下形式再进行索引

    {
      "group" :        "fans",
      "user.first" : [ "alice", "john" ],
      "user.last" :  [ "smith", "white" ]
    }
    

    扁平化处理将所有数组元素对象的相同字段值合并到一起作为一个数组,这样就丢失了user.first和user.last之间的对应关系;类似下边的查询,即使没有Alice Smith这个人也可以命中:

    GET my_array_index/_search
    {
      "query": {
        "bool": {
          "must": [
            { "match": { "user.first": "Alice" }},
            { "match": { "user.last":  "Smith" }}
          ]
        }
      }
    }
     
     
    {
        "took":2,
        "timed_out":false,
        "_shards":{
            "total":5,
            "successful":5,
            "skipped":0,
            "failed":0
        },
        "hits":{
            "total":1,
            "max_score":0.5753642,
            "hits":[
                {
                    "_index":"my_array_index",
                    "_type":"_doc",
                    "_id":"1",
                    "_score":0.5753642,
                    "_source":{
                        "group":"fans",
                        "user":[
                            {
                                "first":"John",
                                "last":"Smith"
                            },
                            {
                                "first":"Alice",
                                "last":"White"
                            }
                        ]
                    }
                }
            ]
        }
    }
    

    使用nested数据类型文档化数组元素

    elastic search内部提供了nested数据类型,可以将数组元素作为单独的隐藏的内部文档进行索引,从而保持文档之间的独立性;

    将字段映射为nested类型

    PUT my_nested_index
    {
      "mappings": {
        "_doc": {
          "properties": {
            "user": {
              "type": "nested"
            }
          }
        }
      }
    }
     
    {
        "acknowledged":true,
        "shards_acknowledged":true,
        "index":"my_nested_index"
    }
    

    index文档

    PUT my_nested_index/_doc/1
    {
      "group" : "fans",
      "user" : [
        {
          "first" : "John",
          "last" :  "Smith"
        },
        {
          "first" : "Alice",
          "last" :  "White"
        }
      ]
    }
     
    {
        "_index":"my_nested_index",
        "_type":"_doc",
        "_id":"1",
        "_version":1,
        "result":"created",
        "_shards":{
            "total":2,
            "successful":1,
            "failed":0
        },
        "_seq_no":0,
        "_primary_term":1
    }
    

    elastic search提供了单独的nested query 来支持nested类型

    GET my_nested_index/_search
    {
      "query": {
        "nested": {
          "path": "user",
          "query": {
            "bool": {
              "must": [
                { "match": { "user.first": "Alice" }},
                { "match": { "user.last":  "Smith" }}
              ]
            }
          }
        }
      }
    }
     
    {
        "took":3,
        "timed_out":false,
        "_shards":{
            "total":5,
            "successful":5,
            "skipped":0,
            "failed":0
        },
        "hits":{
            "total":0,
            "max_score":null,
            "hits":[
     
            ]
        }
    }
    

    nested query提供了inner_hits来支持字段高亮,从返回的inner_hits信息中可以看到,_nested.offset字段指出了命中的是数组中的第几个元素(从0开始);

    GET my_nested_index/_search
    {
      "query": {
        "nested": {
          "path": "user",
          "query": {
            "bool": {
              "should": [
                { "match": { "user.first": "Alice" }},
                { "match": { "user.last":  "smith" }}
              ]
            }
          },
          "inner_hits": {
            "highlight": {
              "fields": {
                "*": {}
              }
            }
          }
        }
      }
    }
     
    {
        "took":8,
        "timed_out":false,
        "_shards":{
            "total":5,
            "successful":5,
            "skipped":0,
            "failed":0
        },
        "hits":{
            "total":1,
            "max_score":0.6931472,
            "hits":[
                {
                    "_index":"my_nested_index",
                    "_type":"_doc",
                    "_id":"1",
                    "_score":0.6931472,
                    "_source":{
                        "group":"fans",
                        "user":[
                            {
                                "first":"John",
                                "last":"Smith"
                            },
                            {
                                "first":"Alice",
                                "last":"White"
                            }
                        ]
                    },
                    "inner_hits":{
                        "user":{
                            "hits":{
                                "total":2,
                                "max_score":0.6931472,
                                "hits":[
                                    {
                                        "_index":"my_nested_index",
                                        "_type":"_doc",
                                        "_id":"1",
                                        "_nested":{
                                            "field":"user",
                                            "offset":0
                                        },
                                        "_score":0.6931472,
                                        "_source":{
                                            "first":"John",
                                            "last":"Smith"
                                        },
                                        "highlight":{
                                            "user.last":[
                                                "<em>Smith</em>"
                                            ]
                                        }
                                    },
                                    {
                                        "_index":"my_nested_index",
                                        "_type":"_doc",
                                        "_id":"1",
                                        "_nested":{
                                            "field":"user",
                                            "offset":1
                                        },
                                        "_score":0.6931472,
                                        "_source":{
                                            "first":"Alice",
                                            "last":"White"
                                        },
                                        "highlight":{
                                            "user.first":[
                                                "<em>Alice</em>"
                                            ]
                                        }
                                    }
                                ]
                            }
                        }
                    }
                }
            ]
        }
    }
    

    总结

    经过以上的研究可以看到,elastic search提供的nested数据类型基本满足我们的目标要求,接下来使用具体的table数据做进一步的研究;

    四、使用nested数据类型索引Table数据

    elastic search索引数据结构

    字段名字 字段类型 描述
    id keyword 主键
    name keyword table的名字
    creator keyword 创建者
    content nested (object array) 行数据数组

    elastic search mapping

    PUT tables
    {
      "mappings": {
        "_doc": {
          "properties": {
            "id": {
              "type": "keyword"
            },
            "name": {
              "type": "keyword"
            },
            "creator": {
              "type": "keyword"
            },
            "content": {
              "type": "nested"
            }
          }
        }
      }
    }
     
    {
        "acknowledged": true,
        "shards_acknowledged": true,
        "index": "tables"
    }
    

    index 一个Table data

    PUT tables/_doc/1
    {
        "id":"1",
        "name":"table1",
        "creator":"mango",
        "content":[
            {
                "0":"192.168.1.1",
                "1":"11",
                "2":"192.168.1.12",
                "3":"11",
                "4":"chaoyang"
            },
            {
                "0":"192.168.1.2",
                "1":"22",
                "2":"192.168.1.13",
                "3":"22",
                "4":"tongzhou"
            },
            {
                "0":"192.168.1.3",
                "1":"33",
                "2":"192.168.1.14",
                "3":"33",
                "4":"daxing"
            }
        ]
    }
     
    {
        "_index":"tables",
        "_type":"_doc",
        "_id":"1",
        "_version":1,
        "result":"created",
        "_shards":{
            "total":2,
            "successful":1,
            "failed":0
        },
        "_seq_no":0,
        "_primary_term":1
    }
    

    search Table data

    搜索所有列

    限制只返回Table的元数据信息

    限制只返回命中行的信息

    返回命中行的高亮信息

    POST /tables/_search
    {
        "from":0,
        "size":20,
        "_source":{
            "excludes":[
                "content"
            ]
        },
        "query":{
            "nested":{
                "path":"content",
                "query":{
                    "query_string":{
                        "fields":[
                            "content.*"
                        ],
                        "query":"tongzhou  192.168.1.1"
                    }
                },
                "inner_hits":{
                    "from":0,
                    "size":2,
                    "highlight":{
                        "fields":{
                            "*":{
    
                            }
                        }
                    }
                }
            }
        }
    }
     
     
    {
        "took":19,
        "timed_out":false,
        "_shards":{
            "total":5,
            "successful":5,
            "skipped":0,
            "failed":0
        },
        "hits":{
            "total":1,
            "max_score":0.9808292,
            "hits":[
                {
                    "_index":"tables",
                    "_type":"_doc",
                    "_id":"1",
                    "_score":0.9808292,
                    "_source":{
                        "creator":"mango",
                        "name":"table1",
                        "id":"1"
                    },
                    "inner_hits":{
                        "content":{
                            "hits":{
                                "total":2,
                                "max_score":0.9808292,
                                "hits":[
                                    {
                                        "_index":"tables",
                                        "_type":"_doc",
                                        "_id":"1",
                                        "_nested":{
                                            "field":"content",
                                            "offset":0
                                        },
                                        "_score":0.9808292,
                                        "_source":{
                                            "0":"192.168.1.1",
                                            "1":"11",
                                            "2":"192.168.1.12",
                                            "3":"11",
                                            "4":"chaoyang"
                                        },
                                        "highlight":{
                                            "content.0":[
                                                "<em>192.168.1.1</em>"
                                            ]
                                        }
                                    },
                                    {
                                        "_index":"tables",
                                        "_type":"_doc",
                                        "_id":"1",
                                        "_nested":{
                                            "field":"content",
                                            "offset":1
                                        },
                                        "_score":0.9808292,
                                        "_source":{
                                            "0":"192.168.1.2",
                                            "1":"22",
                                            "2":"192.168.1.13",
                                            "3":"22",
                                            "4":"tongzhou"
                                        },
                                        "highlight":{
                                            "content.4":[
                                                "<em>tongzhou</em>"
                                            ]
                                        }
                                    }
                                ]
                            }
                        }
                    }
                }
            ]
        }
    }
    
  • 相关阅读:
    Log4Net详解(2)结构篇
    vs2012中使用Spring.NET报错:Spring.Context.Support.ContextRegistry 的类型初始值设定项引发异常
    vs2010无法打开项目文件的解决方法
    Spring.NET使用assembly方式设置配置文件
    WebService生成XML文档时出错。不应是类型XXXX。使用XmlInclude或SoapInclude属性静态指定非已知的类型。
    [转贴]如何做好一个垂直搜索引擎
    怎样预防RSI呢?
    推荐一个打折的站点
    五子棋程序
    共享两本C++的好书
  • 原文地址:https://www.cnblogs.com/wufengtinghai/p/15240535.html
Copyright © 2011-2022 走看看