在聚合的分组统计中我们会面临两种分组元素类型:连续型如时间,自然数等、离散型如地点、产品等。离散型数据本身就代表不同的组别,但连续型数据则需要手工按等长间隔进行切分了。下面是一个按价钱段聚合的例子:
POST /cartxns/_search
{
"size" : 1,
"aggs": {
"sales_per_pricerange": {
"histogram": {
"field": "price",
"interval": 20000
},
"aggs": {
"total sales": {
"sum": {
"field": "price"
}
}
}
}
}
}
}
在上面这个例子中我们把价钱按20000进行分段。得出0-19999,20000-39999,40000-59999 ... 价格段的度量:
"aggregations" : {
"sales_per_pricerange" : {
"buckets" : [
{
"key" : 0.0,
"doc_count" : 3,
"total sales" : {
"value" : 37000.0
}
},
{
"key" : 20000.0,
"doc_count" : 4,
"total sales" : {
"value" : 95000.0
}
},
{
"key" : 40000.0,
"doc_count" : 0,
"total sales" : {
"value" : 0.0
}
},
{
"key" : 60000.0,
"doc_count" : 0,
"total sales" : {
"value" : 0.0
}
},
{
"key" : 80000.0,
"doc_count" : 1,
"total sales" : {
"value" : 80000.0
}
}
]
}
}
在elastic4s中是这样表达的:
val aggHist = search("cartxns").aggregations(
histogramAggregation("sales_per_price")
.field("price")
.interval(20000).subAggregations(
sumAggregation("total_sales").field("price")
)
)
println(aggHist.show)
val histResult = client.execute(aggHist).await
if (histResult.isSuccess)
histResult.result.aggregations.histogram("sales_per_price").buckets
.foreach(hb => println(s"${hb.key},${hb.docCount}:${hb.sum("total_sales").value}"))
else println(s"error: ${histResult.error.reason}")
....
POST:/cartxns/_search?
StringEntity({"aggs":{"sales_per_price":{"histogram":{"interval":20000.0,"field":"price"},"aggs":{"total_sales":{"sum":{"field":"price"}}}}}},Some(application/json))
0.0,3:37000.0
20000.0,4:95000.0
40000.0,0:0.0
60000.0,0:0.0
80000.0,1:80000.0
下面这个按车款分组统计的就是一个离散元素的聚合统计了:
POST /cartxns/_search
{
"size" : 1,
"aggs": {
"avage price per model" : {
"terms": {"field" : "make.keyword"},
"aggs": {
"average price": {
"avg": {"field": "price"}
},
"max price" : {
"max": {
"field": "price"
}
},
"min price" : {
"min": {
"field": "price"
}
}
}
}
}
}
我们可以得到每一款车的平均售价、最低最高售价:
"aggregations" : {
"avage price per model" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "honda",
"doc_count" : 3,
"max price" : {
"value" : 20000.0
},
"average price" : {
"value" : 16666.666666666668
},
"min price" : {
"value" : 10000.0
}
},
{
"key" : "ford",
"doc_count" : 2,
"max price" : {
"value" : 30000.0
},
"average price" : {
"value" : 27500.0
},
"min price" : {
"value" : 25000.0
}
},
{
"key" : "toyota",
"doc_count" : 2,
"max price" : {
"value" : 15000.0
},
"average price" : {
"value" : 13500.0
},
"min price" : {
"value" : 12000.0
}
},
{
"key" : "bmw",
"doc_count" : 1,
"max price" : {
"value" : 80000.0
},
"average price" : {
"value" : 80000.0
},
"min price" : {
"value" : 80000.0
}
}
]
}
}
elastic4s示范如下:
val aggDisc = search("cartxns").aggregations(
termsAgg("prices_per_model","make.keyword").subAggregations(
avgAgg("average_price","price"),
minAgg("min_price","price"),
maxAgg("max_price","price")
)
)
println(aggDisc.show)
val discResult = client.execute(aggDisc).await
if (discResult.isSuccess)
discResult.result.aggregations.terms("prices_per_model").buckets
.foreach(mb =>
println(s"${mb.key},${mb.docCount}:${mb.avg("average_price").value}," +
s"${mb.min("min_price").value.getOrElse(0)}," +
s"${mb.max("max_price").value.getOrElse(0)}"))
else println(s"error: ${discResult.error.causedBy.getOrElse("unknown")}")
...
POST:/cartxns/_search?
StringEntity({"aggs":{"prices_per_model":{"terms":{"field":"make.keyword"},"aggs":{"average_price":{"avg":{"field":"price"}},"min_price":{"min":{"field":"price"}},"max_price":{"max":{"field":"price"}}}}}},Some(application/json))
honda,3:16666.666666666668,10000.0,20000.0
ford,2:27500.0,25000.0,30000.0
toyota,2:13500.0,12000.0,15000.0
bmw,1:80000.0,80000.0,80000.0
date_histogram是一种按时间间隔聚合的统计方法。对于按时间趋势变化的数据分析十分有用:
POST /cartxns/_search
{
"aggs": {
"sales_per_month": {
"date_histogram": {
"field": "sold",
"calendar_interval":"1M",
"format": "yyyy-MM-dd"
}
}
}
}
...
"aggregations" : {
"sales_per_month" : {
"buckets" : [
{
"key_as_string" : "2014-01-01",
"key" : 1388534400000,
"doc_count" : 1
},
{
"key_as_string" : "2014-02-01",
"key" : 1391212800000,
"doc_count" : 1
},
{
"key_as_string" : "2014-03-01",
"key" : 1393632000000,
"doc_count" : 0
},
{
"key_as_string" : "2014-04-01",
"key" : 1396310400000,
"doc_count" : 0
},
{
"key_as_string" : "2014-05-01",
"key" : 1398902400000,
"doc_count" : 1
},
{
"key_as_string" : "2014-06-01",
"key" : 1401580800000,
"doc_count" : 0
},
{
"key_as_string" : "2014-07-01",
"key" : 1404172800000,
"doc_count" : 1
},
{
"key_as_string" : "2014-08-01",
"key" : 1406851200000,
"doc_count" : 1
},
{
"key_as_string" : "2014-09-01",
"key" : 1409529600000,
"doc_count" : 0
},
{
"key_as_string" : "2014-10-01",
"key" : 1412121600000,
"doc_count" : 1
},
{
"key_as_string" : "2014-11-01",
"key" : 1414800000000,
"doc_count" : 2
}
]
}
}
上面这个例子产生以月为单元的bucket。elastic4s示范:
val aggDateHist = search("cartxns").aggregations(
dateHistogramAggregation("sales_per_month")
.field("sold")
.calendarInterval(DateHistogramInterval.Month)
.format("yyyy-MM-dd")
.minDocCount(1)
)
println(aggDateHist.show)
val dtHistResult = client.execute(aggDateHist).await
if (dtHistResult.isSuccess)
dtHistResult.result.aggregations.dateHistogram("sales_per_month").buckets
.foreach(db => println(s"${db.date},${db.docCount}"))
else println(s"error: ${dtHistResult.error.causedBy.getOrElse("unknown")}")
...
POST:/cartxns/_search?
StringEntity({"aggs":{"sales_per_month":{"date_histogram":{"calendar_interval":"1M","min_doc_count":1,"format":"yyyy-MM-dd","field":"sold"}}}},Some(application/json))
2014-01-01,1
2014-02-01,1
2014-05-01,1
2014-07-01,1
2014-08-01,1
2014-10-01,1
2014-11-01,2
在以月划分bucket后可以再进行每个月的深度聚合:
POST /cartxns/_search
{
"aggs": {
"sales_per_month": {
"date_histogram": {
"field": "sold",
"calendar_interval":"1M",
"format": "yyyy-MM-dd"
},
"aggs": {
"per_make_sum": {
"terms": {
"field": "make.keyword",
"size": 10
},
"aggs": {
"sum_price": {
"sum": {"field": "price"}
}
}
},
"total_sum": {
"sum": {
"field": "price"
}
}
}
}
}
}
我们可以得到每个月的销售总额、每个车款每个月的销售,如下:
"aggregations" : {
"sales_per_month" : {
"buckets" : [
{
"key_as_string" : "2014-01-01",
"key" : 1388534400000,
"doc_count" : 1,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "bmw",
"doc_count" : 1,
"sum_price" : {
"value" : 80000.0
}
}
]
},
"total_sum" : {
"value" : 80000.0
}
},
{
"key_as_string" : "2014-02-01",
"key" : 1391212800000,
"doc_count" : 1,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "ford",
"doc_count" : 1,
"sum_price" : {
"value" : 25000.0
}
}
]
},
"total_sum" : {
"value" : 25000.0
}
},
{
"key_as_string" : "2014-03-01",
"key" : 1393632000000,
"doc_count" : 0,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
},
"total_sum" : {
"value" : 0.0
}
},
{
"key_as_string" : "2014-04-01",
"key" : 1396310400000,
"doc_count" : 0,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
},
"total_sum" : {
"value" : 0.0
}
},
{
"key_as_string" : "2014-05-01",
"key" : 1398902400000,
"doc_count" : 1,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "ford",
"doc_count" : 1,
"sum_price" : {
"value" : 30000.0
}
}
]
},
"total_sum" : {
"value" : 30000.0
}
},
{
"key_as_string" : "2014-06-01",
"key" : 1401580800000,
"doc_count" : 0,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
},
"total_sum" : {
"value" : 0.0
}
},
{
"key_as_string" : "2014-07-01",
"key" : 1404172800000,
"doc_count" : 1,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "toyota",
"doc_count" : 1,
"sum_price" : {
"value" : 15000.0
}
}
]
},
"total_sum" : {
"value" : 15000.0
}
},
{
"key_as_string" : "2014-08-01",
"key" : 1406851200000,
"doc_count" : 1,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "toyota",
"doc_count" : 1,
"sum_price" : {
"value" : 12000.0
}
}
]
},
"total_sum" : {
"value" : 12000.0
}
},
{
"key_as_string" : "2014-09-01",
"key" : 1409529600000,
"doc_count" : 0,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
},
"total_sum" : {
"value" : 0.0
}
},
{
"key_as_string" : "2014-10-01",
"key" : 1412121600000,
"doc_count" : 1,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "honda",
"doc_count" : 1,
"sum_price" : {
"value" : 10000.0
}
}
]
},
"total_sum" : {
"value" : 10000.0
}
},
{
"key_as_string" : "2014-11-01",
"key" : 1414800000000,
"doc_count" : 2,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "honda",
"doc_count" : 2,
"sum_price" : {
"value" : 40000.0
}
}
]
},
"total_sum" : {
"value" : 40000.0
}
}
]
}
}
用elastic4s可以这样写:
val aggMonthSales= search("cartxns").aggregations(
dateHistogramAggregation("sales_per_month")
.field("sold")
.calendarInterval(DateHistogramInterval.Month)
.format("yyyy-MM-dd")
.minDocCount(1).subAggregations(
termsAgg("month_make","make.keyword").subAggregations(
sumAggregation("month_total_per_make").field("price")
),
sumAggregation("monthly_total").field("price")
)
)
println(aggMonthSales.show)
val monthSalesResult = client.execute(aggMonthSales).await
if (monthSalesResult.isSuccess)
monthSalesResult.result.aggregations.dateHistogram("sales_per_month").buckets
.foreach { sb =>
println(s"${sb.date},${sb.docCount},${sb.sum("monthly_total").value}")
sb.terms("month_make").buckets
.foreach(mb =>
println(s"${mb.key},${mb.docCount},${mb.sum("month_total_per_make").value}"))
}
else println(s"error: ${monthSalesResult.error.causedBy.getOrElse("unknown")}")
...
POST:/cartxns/_search?
StringEntity({"aggs":{"sales_per_month":{"date_histogram":{"calendar_interval":"1M","min_doc_count":1,"format":"yyyy-MM-dd","field":"sold"},"aggs":{"month_make":{"terms":{"field":"make.keyword"},"aggs":{"month_total_per_make":{"sum":{"field":"price"}}}},"monthly_total":{"sum":{"field":"price"}}}}}},Some(application/json))
2014-01-01,1,80000.0
bmw,1,80000.0
2014-02-01,1,25000.0
ford,1,25000.0
2014-05-01,1,30000.0
ford,1,30000.0
2014-07-01,1,15000.0
toyota,1,15000.0
2014-08-01,1,12000.0
toyota,1,12000.0
2014-10-01,1,10000.0
honda,1,10000.0
2014-11-01,2,40000.0
honda,2,40000.0