Theoretica
A C++ numerical and automatic mathematical library
histogram.h
Go to the documentation of this file.
1 
5 
6 #ifndef THEORETICA_HISTOGRAM_H
7 #define THEORETICA_HISTOGRAM_H
8 
9 #ifndef THEORETICA_NO_PRINT
10 #include <sstream>
11 #include <ostream>
12 #endif
13 
14 #include <vector>
15 #include "../core/real_analysis.h"
16 #include "../core/dataset.h"
17 #include "./statistics.h"
18 
19 
20 namespace theoretica {
21 
22 
28  class histogram {
29  private:
30 
32  size_t N {0};
33 
35  std::vector<unsigned int> bin_counts;
36 
38  real range_max;
39 
41  real range_min;
42 
44  real value_max;
45 
47  real value_min;
48 
50  real run_average;
51 
53  real run_tss;
54 
55  public:
56 
65  histogram(unsigned int bin_count, real range_min, real range_max)
66  : N(0), value_max(-inf()), value_min(-inf()), run_average(0), run_tss(0) {
67 
68  bin_counts.resize(bin_count);
69  this->range_max = range_max;
70  this->range_min = range_min;
71  }
72 
73 
81  template<typename Dataset, enable_vector<Dataset> = true>
82  histogram(const Dataset& data, unsigned int bin_count = 0) {
83 
84  range_max = theoretica::max(data);
85  range_min = theoretica::min(data);
86  value_max = range_max;
87  value_min = range_min;
88  N = data.size();
89 
90  // Compute mean and TSS
91  run_average = stats::mean(data);
92  run_tss = stats::total_sum_squares(data);
93 
94  // Default bin count is sqrt(N)
95  bin_counts.resize(
96  bin_count ? bin_count : floor(sqrt(N))
97  );
98 
99  // The histogram contains all the data points by construction
100  for (size_t i = 0; i < N; ++i)
101  bin_counts[index(data[i])]++;
102  }
103 
104 
109  inline void insert(real x) {
110 
111  if(x < range_min || x > range_max)
112  return;
113 
114  // Update average and TSS using Welford's method
115  const real tmp = run_average;
116  run_average = tmp + (x - tmp) / (N + 1);
117  run_tss += (x - tmp) * (x - run_average);
118 
119  value_max = value_max < x ? x : value_max;
120  value_min = value_min > x ? x : value_min;
121 
122  bin_counts[index(x)]++;
123  N++;
124  }
125 
126 
135  inline unsigned int index(real x) const {
136 
137  if(abs(x - range_max) < MACH_EPSILON)
138  return bin_counts.size() - 1;
139 
140  return floor(
141  (x - range_min) / (range_max - range_min)
142  * bin_counts.size()
143  );
144  }
145 
146 
147  // Statistical functions
148 
149 
154  inline unsigned int number() const {
155  return N;
156  }
157 
158 
165  inline std::vector<unsigned int> bins() const {
166  return bin_counts;
167  }
168 
169 
173  inline real max() const {
174  return value_max;
175  }
176 
177 
181  inline real min() const {
182  return value_min;
183  }
184 
185 
189  inline real mean() const {
190  return run_average;
191  }
192 
193 
198  inline real tss() const {
199  return run_tss;
200  }
201 
202 
203  // Operators
204 
205 
211  inline real operator()(real x) {
212 
213  if (x < range_min || x > range_max)
214  return 0.0;
215 
216  return bin_counts[index(x)];
217  }
218 
219 
224  inline unsigned int operator[](unsigned int i) const {
225  return bin_counts[i];
226  }
227 
228 
230 
231 
232 #ifndef THEORETICA_NO_PRINT
233 
242  inline std::string to_string(
243  const std::string& separator = " ",
244  bool normalized = true,
245  bool lower_extreme = false) const {
246 
247  if(N == 0)
248  return "";
249 
250  std::stringstream res;
251  const real width = abs(range_max - range_min) / bin_counts.size();
252  real mult = 0.5;
253 
254  if (lower_extreme)
255  mult = 0.0;
256 
257  for (size_t i = 0; i < bin_counts.size(); ++i) {
258 
259  res << (range_min + (i + mult) * width) << separator;
260 
261  if (normalized)
262  res << (bin_counts[i] / (real) N) << std::endl;
263  else
264  res << bin_counts[i] << std::endl;
265  }
266 
267  return res.str();
268  }
269 
270 
272  inline operator std::string() {
273  return to_string();
274  }
275 
276 
279  inline friend std::ostream& operator<<(
280  std::ostream& out, const histogram& obj) {
281  return out << obj.to_string();
282  }
283 
284 #endif
285 
286  };
287 
288 
289  // Statistical functions over elements of a histogram
290 
291 
292  namespace stats {
293 
294 
296  inline real mean(const histogram& h) {
297  return h.mean();
298  }
299 
300 
302  inline real tss(const histogram& h) {
303  return h.tss();
304  }
305 
306 
308  inline real variance(const histogram& h) {
309 
310  if (h.number() <= 1) {
311  TH_MATH_ERROR("variance", h.number(), DIV_BY_ZERO);
312  return nan();
313  }
314 
315  return h.tss() / (h.number() - 1);
316  }
317 
318 
320  inline real stdev(const histogram& h) {
321  return sqrt(variance(h));
322  }
323  }
324 
325 
327  inline real max(const histogram& h) {
328  return h.max();
329  }
330 
331 
333  inline real min(const histogram& h) {
334  return h.min();
335  }
336 
337 }
338 
339 #endif
Histogram class with running statistics, can be constructed from the parameters of the bins or from a...
Definition: histogram.h:28
histogram(unsigned int bin_count, real range_min, real range_max)
Construct the histogram from the number of bins and the range.
Definition: histogram.h:65
real min() const
Get the smallest data point of the histogram.
Definition: histogram.h:181
unsigned int index(real x) const
Find the bin index corresponding to a given data point.
Definition: histogram.h:135
real max() const
Get the biggest data point of the histogram.
Definition: histogram.h:173
real operator()(real x)
Evaluate the histogram like a step function which is zero outside the range of the histogram.
Definition: histogram.h:211
real tss() const
Get the total sum of squares (TSS) computed using Welford's one-pass method.
Definition: histogram.h:198
std::vector< unsigned int > bins() const
Get a vector containing the bin counts of each bin.
Definition: histogram.h:165
unsigned int operator[](unsigned int i) const
Get the number of elements in the i-th bin.
Definition: histogram.h:224
std::string to_string(const std::string &separator=" ", bool normalized=true, bool lower_extreme=false) const
Convert the histogram to string representation (TO-DO: Cumulative Distribution Function).
Definition: histogram.h:242
unsigned int number() const
Get the number of data points inside the histogram.
Definition: histogram.h:154
histogram(const Dataset &data, unsigned int bin_count=0)
Construct the histogram from a set of data points, with the given number of bins.
Definition: histogram.h:82
void insert(real x)
Insert a new data point inside the histogram, updating the running statistics and the corresponding b...
Definition: histogram.h:109
real mean() const
Get the mean value of the histogram data.
Definition: histogram.h:189
friend std::ostream & operator<<(std::ostream &out, const histogram &obj)
Stream the histogram in string representation to an output stream (std::ostream)
Definition: histogram.h:279
#define TH_MATH_ERROR(F_NAME, VALUE, EXCEPTION)
TH_MATH_ERROR is a macro which throws exceptions or modifies errno (depending on which compiling opti...
Definition: error.h:219
real stdev(const histogram &h)
Compute the standard deviation of the values of a histogram.
Definition: histogram.h:320
real total_sum_squares(const Dataset &X)
Compute the total sum of squares (TSS) of a given dataset using Welford's one-pass method.
Definition: statistics.h:116
real variance(const histogram &h)
Compute the variance of the values of a histogram.
Definition: histogram.h:308
real mean(const histogram &h)
Compute the mean of the values of a histogram.
Definition: histogram.h:296
real tss(const histogram &h)
Compute the total sum of squares of the values of the histogram.
Definition: histogram.h:302
Main namespace of the library which contains all functions and objects.
Definition: algebra.h:27
double real
A real number, defined as a floating point type.
Definition: constants.h:198
auto min(const Vector &X)
Finds the minimum value inside a dataset.
Definition: dataset.h:351
dual2 sqrt(dual2 x)
Compute the square root of a second order dual number.
Definition: dual2_functions.h:54
real inf()
Return positive infinity in floating point representation.
Definition: error.h:76
dual2 abs(dual2 x)
Compute the absolute value of a second order dual number.
Definition: dual2_functions.h:198
auto max(const Vector &X)
Finds the maximum value inside a dataset.
Definition: dataset.h:330
constexpr real MACH_EPSILON
Machine epsilon for the real type.
Definition: constants.h:207
real nan()
Return a quiet NaN number in floating point representation.
Definition: error.h:54
TH_CONSTEXPR int floor(real x)
Compute the floor of x. Computes the maximum integer number that is smaller than x.
Definition: real_analysis.h:271
Statistical functions.