zoukankan      html  css  js  c++  java
  • ElasticSearch中"distinct","count"和"group by"的实现

    最近在业务中需要使用ES来进行数据查询,在某些场景下需要对数据进行去重,以及去重后的统计。为了方便大家理解,下面特意从SQL的角度出发,给出每种SQL语句对应的ES查询语句。

    1 - distinct

    SELECT DISTINCT(user_id) FROM table WHERE user_id_type = 3;
    {
      "query": {
        "term": {
          "user_id_type": 3
        }
      },
      "collapse": {
        "field": "user_id"
      }
    }
    

      

    {
      ...
      "hits": {
        "hits": [
          {
            "_index": "es_qd_mkt_visitor_packet_dev_v1_20180621",
            "_type": "ad_crowd",
            "_source": {
              "user_id": "wx2af8414b502d4ca2_oHtrD0Vxv-_8c678figJNHmtaVQQ",
              "user_id_type": 3
            },
            "fields": {
              "user_id": [
                "wx2af8414b502d4ca2_oHtrD0Vxv-_8c678figJNHmtaVQQ"
              ]
            }
          }
        ]
      }
    }
    

      

    总结:使用collapse字段后,查询结果的[hits]中每条记录都会出现[fields]字段,其中包含了去重后的user_id

    2 - count + distinct

    SELECT COUNT(DISTINCT(user_id)) FROM table WHERE user_id_type = 3;
    

      

    {
      "query": {
        "term": {
          "user_id_type": 3
        }
      },
      "aggs": {
        "count": {
          "cardinality": {
            "field": "user_id"
          }
        }
      }
    }
    

     

    {
      ...
      "hits": {
      ...
      },
      "aggregations": {
        "count": {
          "value": 121
        }
      }
    }
    

      

    总结:aggs中cardinality的字段代表需要distinct的字段(注意:cardinality返回的是近似值,数据量较大时可能与精确的去重计数存在少量误差)

    3 - count + group by

    SELECT COUNT(user_id) FROM table GROUP BY user_id_type;
    

      

    {
      "aggs": {
        "user_type": {
          "terms": {
            "field": "user_id_type"
          }
        }
      }
    }
    

      

    {
      ...
      "hits": {
        ...
      },
      "aggregations": {
        "user_type": {
          ...
          "buckets": [
            {
              "key": 4,
              "doc_count": 1220
            },
            {
              "key": 3,
              "doc_count": 488
            }
          ]
        }
      }
    }
    

      

    总结:aggs中terms的字段代表需要group by的字段

    4 - count + distinct + group by

    SELECT COUNT(DISTINCT(user_id)) FROM table GROUP BY user_id_type;
    

      

    {
      "aggs": {
        "user_type": {
          "terms": {
            "field": "user_id_type"
          },
          "aggs": {
            "count": {
              "cardinality": {
                "field": "user_id"
              }
            }
          }
        }
      }
    }
    {
      ...
      "hits": {
        ...
      },
      "aggregations": {
        "user_type": {
          ...
          "buckets": [
            {
              "key": 4,
              "doc_count": 1220, //去重前数据1220条
              "count": {
                "value": 276 //去重后数据276条
              }
            },
            {
              "key": 3,
              "doc_count": 488, //去重前数据488条
              "count": {
                "value": 121 //去重后数据121条
              }
            }
          ]
        }
      }
    }
    

      

    4 - count + distinct + group by

    SELECT COUNT(DISTINCT(user_id)) FROM table WHERE user_id_type = 2 GROUP BY user_id;
    

      

    总结:对于既有group by又有distinct的查询要求,需要在aggs中嵌套子aggs

    5 - 注意事项

    collapse关键字

    1. 折叠功能是从ES 5.3版本开始才提供的。
    2. 聚合和折叠只对keyword类型或数值类型的字段有效,不能直接用于会被分词的text类型字段



  • 相关阅读:
    MVC4笔记 @functions @model @using
    NET平台4.0 发布网站流程及出错总结
    C#读写txt文件的方法
    jQuery Validate验证框架详解(转)
    用C#写的读写CSV文件
    devexpress 数据导入(gridcontrol 导出 csv)
    DevExpress XtraGrid 数据导出导入Excel
    DevExpress 表中数据导出
    DevExpress 重编译 替换强命名 修改源码
    Delphi 的运算符列表
  • 原文地址:https://www.cnblogs.com/taozi32/p/10411524.html
Copyright © 2011-2022 走看看