/**
* Copyright (c) 2014-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
/*
* Stats.cpp
*
* Created on: Oct 18, 2011
* Author: ldemailly
*/
#include "wdt/util/Stats.h"
#include <math.h>
#include <fstream>
#include <iomanip>
#include <sstream>
namespace facebook {
namespace wdt {
/*static*/
const size_t Histogram::kLastIndex; // storage placeholder - value is in the .h
constexpr int32_t Histogram::kHistogramBuckets[]; // buckets, values are in .h
static const int32_t kFirstValue = Histogram::kHistogramBuckets[0];
static const int32_t kLastValue =
Histogram::kHistogramBuckets[Histogram::kLastIndex - 1];
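// Build a value -> bucket-index lookup table so record() can map a scaled
// value to its histogram bucket in O(1) instead of scanning the bucket
// bounds. For any v in [0, kLastValue), res[v] is the smallest idx such
// that v < kHistogramBuckets[idx]; e.g. with bounds {1, 2, 3, ...}:
// res[0] == 0, res[1] == 1, res[2] == 2, ...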
int32_t* initHistLookup() {
  int32_t* res = (int32_t*)calloc(kLastValue, sizeof(int32_t));
CHECK(res != nullptr);
int32_t idx = 0;
for (int i = 0; i < kLastValue; ++i) {
if (i >= Histogram::kHistogramBuckets[idx]) {
++idx;
}
res[i] = idx;
}
CHECK_EQ(idx, Histogram::kLastIndex - 1);
return res;
}
static const int32_t* kVal2Bucket = initHistLookup();
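// Note: the table is allocated once at static-initialization time and never
// freed; it is needed for the lifetime of the process.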
void Counter::record(int64_t v) {
int64_t newCount = ++count_;
sum_ += v;
if (newCount == 1) {
realMin_ = min_ = max_ = v;
}
// don't count 0s in the min - makes for a slightly more interesting min
// in most cases
if (v && ((min_ == 0) || (v < min_))) {
min_ = v;
}
// real min (can be 0):
  if (v < realMin_) {
realMin_ = v;
}
if (v > max_) {
max_ = v;
}
  int64_t v2 = 0;
  if (v > INT32_MAX || v < INT32_MIN) {
    // v * v could overflow int64; leave this sample out of the variance sum
    LOG(WARNING) << "value out of int32 range, not counted in stddev: " << v;
  } else {
    v2 = v * v;
  }
  sumOfSquares_ += v2; // atomic_add(sumOfSquares_, v2);
}
void Counter::merge(const Counter& c) {
  // merge count_, min_, max_, sum_, sumOfSquares_ (realMin_ is not merged)
  count_ += c.count_;
  // same convention as record(): min_ == 0 means no nonzero min seen yet
  if (c.min_ && ((min_ == 0) || (c.min_ < min_))) {
    min_ = c.min_;
  }
if (c.max_ > max_) {
max_ = c.max_;
}
sum_ += c.sum_;
sumOfSquares_ += c.sumOfSquares_;
}
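// Standard deviation from the running sums, via Var(X) = E[X^2] - E[X]^2:
//   sigma^2 = (sumOfSquares_ - sum_^2 / count_) / count_
// Guarded against int64 overflow of sum_^2; values outside int32 range were
// already left out of sumOfSquares_ in record(), so the result is only
// approximate when such values occur.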
double Counter::getStdDev() const {
if (count_ < 1) {
return 0.0;
}
if (sum_ > INT32_MAX || sum_ < INT32_MIN ||
(sumOfSquares_ - sum_ * sum_ / count_) < 0) {
LOG(WARNING) << "std dev wrap " << count_ << " " << sum_ << " "
<< sumOfSquares_;
return 0.0;
}
double sigma = ((double)sumOfSquares_ - sum_ * sum_ / count_) / count_;
return sqrt(sigma);
}
void Counter::print(std::ostream& os, double multiplier) const {
if (multiplier == 0.) {
multiplier = printScale_;
}
int64_t intMult = (int64_t)multiplier;
os << getCount() << "," << multiplier * getAverage() << ",";
  if (intMult == multiplier) {
    // multiplier is a whole number: keep min/max printed as exact integers
os << intMult * getMin() << "," << intMult * getMax() << ","
<< multiplier * getStdDev();
} else {
os << multiplier * getMin() << "," << multiplier * getMax() << ","
<< multiplier * getStdDev();
}
}
void Counter::printCounterHeader(std::ostream& os) const {
// should match the above
os << "count,avg,min,max,stddev";
}
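// Map a raw value into a bucket: shift by offset_, divide by divider_, then
// look the scaled value up in kVal2Bucket. Anything at or past the last
// bucket bound lands in the overflow bucket at kLastIndex.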
void Histogram::record(int64_t value) {
Counter::record(value);
int64_t scaledVal = (value - offset_);
// If you touch those next lines, rerun the benchmark mentioned in Stats.h
if (/*((divider_>>31)==0) &&*/ ((scaledVal >> 31) == 0)) {
scaledVal = (int32_t)scaledVal / (int32_t)divider_;
} else {
// 64 bits is 2x more expensive so do it only if needed
scaledVal /= divider_;
}
int32_t idx = 0;
if (scaledVal >= kLastValue) {
idx = kLastIndex;
} else if (scaledVal >= kFirstValue) {
idx = kVal2Bucket[scaledVal];
  } // else scaledVal < kFirstValue and idx stays 0
VLOG(3) << "addHist v " << value << " d " << divider_ << " o " << offset_
<< " s " << scaledVal << " -> " << idx << " (lastval " << kLastValue
<< ")" << std::endl;
++hdata_[idx]; // atomic_add(data[idx], 1);
}
void Histogram::merge(const Histogram& h) {
Counter::merge(h);
for (size_t i = 0; i <= kLastIndex; ++i) {
hdata_[i] += h.hdata_[i];
}
}
Histogram::Histogram(int32_t scale, double p1, double p2, int64_t offset)
: Counter(1),
divider_(scale),
offset_(offset),
percentile1_(p1),
percentile2_(p2) {
VLOG(100) << "New Histogram " << this;
Counter();
memset((void*)hdata_, 0, sizeof(hdata_));
}
Histogram::~Histogram() {
VLOG(100) << "~Histogram " << this;
}
void Histogram::setPercentile1(double p) {
percentile1_ = p;
}
void Histogram::setPercentile2(double p) {
percentile2_ = p;
}
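// Percentile estimation: walk the cumulative bucket counts until the target
// percentile is crossed, then interpolate linearly inside that bucket:
//   result = prev + (target - prevPerc) * (cur - prev) / (perc - prevPerc)
// where [prev, cur) are the bucket bounds and prevPerc/perc the cumulative
// percentages just before and after that bucket.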
double Histogram::calcPercentile(double percentile) const {
if (percentile >= 100) {
return getMax();
}
if (percentile <= 0) {
return getRealMin();
}
  // Initial value of prev should in theory be offset_,
  // but if the data is bad (smaller than offset, i.e. 'negative') that
  // leads to strangeness (see the one-bucket test)
int64_t prev = 0;
int64_t total = 0;
const int64_t ctrTotal = getCount();
const int64_t ctrMax = getMax();
const int64_t ctrMin = getRealMin();
double prevPerc = 0;
double perc = 0;
bool found = false;
int64_t cur = offset_;
  // last bucket is virtual/special - we'll use max if we reach it
  // we also use max if the bucket is past the max, for better accuracy
  // and for the property that target = 100 always returns max
  // (+/- rounding issues) and a target close to 100 (99.9...) stays close
  // to max when the data is not spread across several buckets
for (size_t i = 0; i < kLastIndex; ++i) {
cur = (int64_t)Histogram::kHistogramBuckets[i] * divider_ + offset_;
total += hdata_[i];
perc = 100. * (double)total / ctrTotal;
if (cur > ctrMax) {
break;
}
if (perc >= percentile) {
found = true;
break;
}
prevPerc = perc;
prev = cur;
}
  if (!found) {
    // covers the > ctrMax case
    cur = ctrMax;
    perc = 100.; // needed so the interpolation below tops out exactly at max
  }
// Fix up the min too to never return < min and increase low p accuracy
if (prev < ctrMin) {
prev = ctrMin;
}
return (prev + (percentile - prevPerc) * (cur - prev) / (perc - prevPerc));
}
// histogram print
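// Output is gnuplot-friendly: a commented summary line, then one
// "range, mid point, percentile, count" row per populated bucket, then the
// two target percentiles. Illustrative shape (values made up):
//   # count,avg,min,max,stddev,4,12.5,2,40,8.1
//   # range, mid point, percentile, count
//   < 10 , 5 , 25, 1
//   >= 10 < 20 , 15 , 75, 2
//   >= 20 < 40 , 30 , 100, 1
//   # target 50.0%,14.0
//   # target 99.9%,39.9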
void Histogram::print(std::ostream& os) const {
const int64_t multiplier = divider_;
// calculate the last bucket index
int32_t lastIdx = -1;
  for (int i = (int)kLastIndex; i >= 0; --i) {
if (hdata_[i] > 0) {
lastIdx = i;
break;
}
}
if (lastIdx == -1) {
os << "no data" << std::endl;
return;
}
// comment out header in gnuplot data file
os << "# ";
Counter::printCounterHeader(os);
os << ","; // awk -F, '/^count/ {sum+=$6} END {print "sum is", sum}'
Counter::print(os); // the base counter part
os << std::endl << "# range, mid point, percentile, count" << std::endl;
int64_t prev = 0;
int64_t total = 0;
const int64_t ctrTotal = getCount();
auto oldPrecision = os.precision(5);
  // we could combine this loop with the calcPercentile() one, but it's
  // easier to read/maintain/test when separated, and it's only two passes
  // over very little data
  // output the data of each bucket of the histogram
for (size_t i = 0; i <= static_cast<size_t>(lastIdx); ++i) {
total += hdata_[i];
// data in each row is separated by comma (",")
if (i > 0) {
os << ">= " << multiplier * prev + offset_ << " ";
}
double perc = 100. * (double)total / ctrTotal;
if (i < kLastIndex) {
int64_t cur = Histogram::kHistogramBuckets[i];
os << "< " << multiplier * cur + offset_ << " ";
double midpt = multiplier * (prev + cur) / 2 + offset_;
os << ", " << midpt << " ";
prev = cur;
} else {
os << ", " << multiplier * prev + offset_ << " ";
}
os << ", " << perc << ", " << hdata_[i] << std::endl;
}
// print the information of target percentiles
os.precision(1);
os << std::fixed << "# target " << percentile1_ << "%,"
<< calcPercentile(percentile1_) << std::endl
<< "# target " << percentile2_ << "%," << calcPercentile(percentile2_)
<< std::endl;
os.unsetf(std::ios_base::floatfield);
os.precision(oldPrecision);
}
void Histogram::reset() {
Counter::reset();
memset((void*)hdata_, 0, sizeof(hdata_));
}
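// Minimal usage sketch (ctor as defined above; offset passed explicitly in
// case the header declares no default for it):
//   facebook::wdt::Histogram h(1000, 75, 99, 0); // divider 1000, p75/p99
//   h.record(1234);
//   h.record(567890);
//   h.print(std::cout); // gnuplot-style dump including p75/p99 estimates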
} // namespace wdt
} // namespace facebook