  • How to dynamically compute rough 99th and 99.9th percentiles

    Suppose you have written a program that acts as an HTTP client making webhook requests. You measure how long each HTTP request takes and print that to the log, so that you can inspect the program's request behavior and monitor both your own program and the remote HTTP server. Once the program is deployed on many machines, logging into each one to read these logs becomes tedious, so you decide to add metric reporting and alert when request times look abnormal. At first you might report every single request; while the request volume is low, the reporting system can carry that load, but as your program issues more and more requests, the reporting system can no longer keep up. At this point you have several options. One is to report the average time: it is very little data, but it is easily distorted by a few extremely slow requests. Another is to report counts bucketed into several time ranges: if you know the HTTP server well, it has been running for a long time, and it rarely changes, you can hard-code a series of buckets and report those. But if the HTTP server is outside your control and changes a lot, reporting the 99th and 99.9th percentiles is the better choice (a usage sketch tying the code back to this scenario follows the test below). The code below, lightly adapted from folly's TDigest, gives one implementation:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    class Centroid {
     public:
      explicit Centroid(double mean = 0.0, double weight = 1.0)
          : mean_(mean), weight_(weight) {
        assert(weight > 0);
      }

      double mean() const;
      double weight() const;

      // Adds the sum/weight to this centroid, and returns the new sum.
      double add(double sum, double weight);

      bool operator<(const Centroid& other) const;

     private:
      double mean_;
      double weight_;
    };

    inline double Centroid::mean() const { return mean_; }

    inline double Centroid::weight() const { return weight_; }

    inline double Centroid::add(double sum, double weight) {
      sum += (mean_ * weight_);
      weight_ += weight;
      mean_ = sum / weight_;
      return sum;
    }

    inline bool Centroid::operator<(const Centroid& other) const {
      return mean() < other.mean();
    }

    class TDigest {
     public:
      explicit TDigest(size_t maxSize = 100);

      /*
       * Returns a new TDigest constructed with values merged from the current
       * digest and the given sortedValues.
       */
      TDigest merge_sorted(const std::vector<double>& sortedValues) const;

      /*
       * Returns a new TDigest constructed with values merged from the current
       * digest and the given unsortedValues.
       */
      TDigest merge_unsorted(std::vector<double>* unsortedValues) const;

      /*
       * Estimates the value of the given quantile.
       */
      double estimateQuantile(double q) const;

      double mean() const { return count_ > 0 ? sum_ / count_ : 0; }
      double sum() const { return sum_; }
      double count() const { return count_; }
      double min() const { return min_; }
      double max() const { return max_; }
      bool empty() const { return centroids_.empty(); }
      const std::vector<Centroid>& getCentroids() const { return centroids_; }
      size_t maxSize() const { return maxSize_; }

     private:
      std::vector<Centroid> centroids_;
      size_t maxSize_;
      double sum_;
      double count_;
      double max_;
      double min_;
    };
    #include "tdigest.h"

    #include <algorithm>
    #include <cmath>  // for NAN

    static double k_to_q(double k, double d) {
      double k_div_d = k / d;
      if (k_div_d >= 0.5) {
        double base = 1 - k_div_d;
        return 1 - 2 * base * base;
      } else {
        return 2 * k_div_d * k_div_d;
      }
    }

    static double clamp(double v, double lo, double hi) {
      if (v > hi) {
        return hi;
      } else if (v < lo) {
        return lo;
      }
      return v;
    }

    TDigest::TDigest(size_t maxSize)
        : maxSize_(maxSize), sum_(0.0), count_(0.0), max_(NAN), min_(NAN) {}

    // Merge unsorted values by first sorting them.
    TDigest TDigest::merge_unsorted(std::vector<double>* unsortedValues) const {
      std::sort(unsortedValues->begin(), unsortedValues->end());
      return merge_sorted(*unsortedValues);
    }

    TDigest TDigest::merge_sorted(const std::vector<double>& sortedValues) const {
      if (sortedValues.empty()) {
        return *this;
      }

      TDigest result(maxSize_);
      result.count_ = count_ + sortedValues.size();

      double maybeMin = *(sortedValues.begin());
      double maybeMax = *(sortedValues.end() - 1);
      if (count_ > 0) {
        // We know that min_ and max_ are numbers.
        result.min_ = (std::min)(min_, maybeMin);
        result.max_ = (std::max)(max_, maybeMax);
      } else {
        // We know that min_ and max_ are NaN.
        result.min_ = maybeMin;
        result.max_ = maybeMax;
      }

      std::vector<Centroid> compressed;
      compressed.reserve(maxSize_);

      double k_limit = 1;
      double q_limit_times_count = k_to_q(k_limit++, maxSize_) * result.count_;

      auto it_centroids = centroids_.begin();
      auto it_sortedValues = sortedValues.begin();

      Centroid cur;
      if (it_centroids != centroids_.end() &&
          it_centroids->mean() < *it_sortedValues) {
        cur = *it_centroids++;
      } else {
        cur = Centroid(*it_sortedValues++, 1.0);
      }

      double weightSoFar = cur.weight();

      // Keep track of sums along the way to reduce expensive floating points.
      double sumsToMerge = 0;
      double weightsToMerge = 0;

      while (it_centroids != centroids_.end() ||
             it_sortedValues != sortedValues.end()) {
        Centroid next;

        if (it_centroids != centroids_.end() &&
            (it_sortedValues == sortedValues.end() ||
             it_centroids->mean() < *it_sortedValues)) {
          next = *it_centroids++;
        } else {
          next = Centroid(*it_sortedValues++, 1.0);
        }

        double nextSum = next.mean() * next.weight();
        weightSoFar += next.weight();

        if (weightSoFar <= q_limit_times_count) {
          sumsToMerge += nextSum;
          weightsToMerge += next.weight();
        } else {
          result.sum_ += cur.add(sumsToMerge, weightsToMerge);
          sumsToMerge = 0;
          weightsToMerge = 0;
          compressed.push_back(cur);
          q_limit_times_count = k_to_q(k_limit++, maxSize_) * result.count_;
          cur = next;
        }
      }
      result.sum_ += cur.add(sumsToMerge, weightsToMerge);
      compressed.push_back(cur);
      compressed.shrink_to_fit();

      // Deal with floating point precision.
      std::sort(compressed.begin(), compressed.end());

      result.centroids_ = std::move(compressed);
      return result;
    }

    double TDigest::estimateQuantile(double q) const {
      if (centroids_.empty()) {
        return 0.0;
      }
      double rank = q * count_;

      size_t pos;
      double t;
      if (q > 0.5) {
        if (q >= 1.0) {
          return max_;
        }
        pos = 0;
        t = count_;
        for (auto rit = centroids_.rbegin(); rit != centroids_.rend(); ++rit) {
          t -= rit->weight();
          if (rank >= t) {
            pos = std::distance(rit, centroids_.rend()) - 1;
            break;
          }
        }
      } else {
        if (q <= 0.0) {
          return min_;
        }
        pos = centroids_.size() - 1;
        t = 0;
        for (auto it = centroids_.begin(); it != centroids_.end(); ++it) {
          if (rank < t + it->weight()) {
            pos = std::distance(centroids_.begin(), it);
            break;
          }
          t += it->weight();
        }
      }

      double delta = 0;
      double min = min_;
      double max = max_;
      if (centroids_.size() > 1) {
        if (pos == 0) {
          delta = centroids_[pos + 1].mean() - centroids_[pos].mean();
          max = centroids_[pos + 1].mean();
        } else if (pos == centroids_.size() - 1) {
          delta = centroids_[pos].mean() - centroids_[pos - 1].mean();
          min = centroids_[pos - 1].mean();
        } else {
          delta = (centroids_[pos + 1].mean() - centroids_[pos - 1].mean()) / 2;
          min = centroids_[pos - 1].mean();
          max = centroids_[pos + 1].mean();
        }
      }
      auto value = centroids_[pos].mean() +
          ((rank - t) / centroids_[pos].weight() - 0.5) * delta;
      return clamp(value, min, max);
    }
    #include <gtest/gtest.h>

    #include "tdigest.h"

    class UnitTest : public testing::Test {
     public:
      UnitTest() {}
      ~UnitTest() {}
      void SetUp() override {}
      void TearDown() override {}

      void UnitTest_TDigestTest() {
        std::random_device rd{};
        std::mt19937 gen{rd()};

        TDigest td(1000);

        const size_t max_i = 100;
        const size_t max_j = 1000;
        const size_t all_size = max_i * max_j;
        std::vector<double> all_numbers;
        all_numbers.reserve(all_size);

        // Values drawn from a normal distribution with mean 50 and stddev 2.
        std::normal_distribution<double> d{50, 2};
        for (size_t i = 0; i != max_i; ++i) {
          std::vector<double> numbers;
          for (size_t j = 0; j != max_j; ++j) {
            auto val = d(gen);
            numbers.push_back(val);
            all_numbers.push_back(val);
          }
          // Fold each batch into the digest; merge_unsorted sorts in place.
          td = td.merge_unsorted(&numbers);
          numbers.clear();
        }

        std::sort(all_numbers.begin(), all_numbers.end());
        std::cout << "real 99%: " << all_numbers[(all_size * 99) / 100]
                  << std::endl;
        std::cout << "estimate 99%: " << td.estimateQuantile(0.99) << std::endl;
        std::cout << "real 99.9%: " << all_numbers[(all_size * 999) / 1000]
                  << std::endl;
        std::cout << "estimate 99.9%: " << td.estimateQuantile(0.999)
                  << std::endl;
      }
    };

    TEST_F(UnitTest, UnitTest_TDigestTest) {
      UnitTest_TDigestTest();
    }
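
    To tie the code back to the opening scenario, here is a minimal usage sketch. Everything in it is hypothetical: the class name LatencyReporter, the batch size of 1000, and printing to std::cout all stand in for whatever your real reporting system would do, and a multi-threaded HTTP client would also need a lock around the buffer.

    #include <iostream>
    #include <vector>

    #include "tdigest.h"

    // Hypothetical reporter: buffers per-request latencies (in milliseconds),
    // periodically folds them into a TDigest, and reports tail quantiles.
    class LatencyReporter {
     public:
      explicit LatencyReporter(size_t digestSize = 1000) : td_(digestSize) {}

      // Called after each HTTP request completes.
      void record_latency(double ms) {
        pending_.push_back(ms);
        if (pending_.size() >= 1000) {  // fold in batches to amortize sorting
          td_ = td_.merge_unsorted(&pending_);
          pending_.clear();
        }
      }

      // Called on a timer, e.g. once a minute; a real program would push the
      // quantiles to its reporting system instead of printing them.
      void flush_and_report() {
        if (!pending_.empty()) {
          td_ = td_.merge_unsorted(&pending_);
          pending_.clear();
        }
        std::cout << "p99: " << td_.estimateQuantile(0.99)
                  << " ms, p99.9: " << td_.estimateQuantile(0.999) << " ms\n";
      }

     private:
      TDigest td_;
      std::vector<double> pending_;
    };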

    The idea behind t-digest is roughly this: you have a fixed number of slots (say 1000), and a mapping function assigns each slot a different weight capacity, with slots near the two tails holding very little weight and slots in the middle holding a lot. That is what makes tail quantiles such as 99% and 99.9% come out fairly precise. The mapping used here is k_to_q; for example, with maxSize = 100, k_to_q(1, 100) = 2 * 0.01^2 = 0.0002, so the first centroid may absorb only 0.02% of the total weight, while a slot next to the median spans roughly 2% (k_to_q(51, 100) - k_to_q(50, 100) = 0.5198 - 0.5 = 0.0198). As for the following check in merge_sorted:

    weightSoFar <= q_limit_times_count

    I have an idea: instead of this fixed test, what about looking at weightSoFar both before and after the update

    weightSoFar += next.weight();

    and merging into whichever side is closer to q_limit_times_count? Would that make the result more accurate? I have not tested this yet; add a test if you need it, or try it yourself. A rough sketch of the idea follows.
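
    As an untested sketch of that idea only (this is not part of folly's TDigest; the helper name shouldMergeNearest and its parameter names are invented here purely for illustration), the decision could be factored out like this:

    // Untested sketch: decide whether `next` should still be merged into the
    // current centroid, by picking whichever side of the weight increment
    // lands closer to the q-limit boundary.
    static bool shouldMergeNearest(double weightBefore, double nextWeight,
                                   double qLimitTimesCount) {
      double weightAfter = weightBefore + nextWeight;
      if (weightAfter <= qLimitTimesCount) {
        return true;  // same outcome as the original check
      }
      // The boundary falls inside `next`: merge anyway if overshooting the
      // boundary leaves us closer to it than stopping short would.
      return (weightAfter - qLimitTimesCount) <
             (qLimitTimesCount - weightBefore);
    }

    In merge_sorted, the condition weightSoFar <= q_limit_times_count would then become shouldMergeNearest(weightSoFar - next.weight(), next.weight(), q_limit_times_count), with everything else unchanged. Whether this actually improves accuracy is exactly what the missing test would have to show.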

  • Original post: https://www.cnblogs.com/albizzia/p/14161589.html