Theoretica
A C++ numerical and automatic mathematical library
statistics.h
Go to the documentation of this file.
1 
5 
6 #ifndef THEORETICA_STATISTICS_H
7 #define THEORETICA_STATISTICS_H
8 
9 #include "../core/constants.h"
10 #include "../core/real_analysis.h"
11 #include "../core/special.h"
12 #include "../calculus/integral.h"
13 #include "../calculus/gauss.h"
14 #include "../core/dataset.h"
15 
16 
17 namespace theoretica {
18 
19 
21  namespace stats {
22 
23 
29  template<typename Dataset>
30  inline real mean(const Dataset& X) {
31  return arithmetic_mean(X);
32  }
33 
34 
40  template<typename Dataset>
41  inline real range(const Dataset& X) {
42 
43  return max(X) - min(X);
44  }
45 
46 
53  template<typename Dataset>
54  inline real semidispersion(const Dataset& X) {
55 
56  return range(X) / 2.0;
57  }
58 
59 
67  template<typename Dataset>
68  inline real propagate_sum(const Dataset& sigma) {
69 
70  return sqrt(sum_squares(sigma));
71  }
72 
73 
85  template<typename Dataset1, typename Dataset2>
86  inline real propagate_product(const Dataset1& sigma, const Dataset2& mean) {
87 
88  if(sigma.size() != mean.size()) {
89  TH_MATH_ERROR("propagate_product", sigma.size(), INVALID_ARGUMENT);
90  return nan();
91  }
92 
93  // Compute sum of squares of (i_sigma / i_mean)
94  real s = 0;
95  for (unsigned int i = 0; i < sigma.size(); ++i) {
96 
97  if(mean[i] == 0) {
98  TH_MATH_ERROR("propagate_product", mean[i], DIV_BY_ZERO);
99  return nan();
100  }
101 
102  s += square(sigma[i] / abs(mean[i]));
103  }
104 
105  return sqrt(s);
106  }
107 
108 
115  template<typename Dataset>
116  inline real total_sum_squares(const Dataset& X) {
117 
118  if(!X.size()) {
119  TH_MATH_ERROR("total_sum_squares", X.size(), INVALID_ARGUMENT);
120  return nan();
121  }
122 
123  // Running average
124  real avg = X[0];
125 
126  // Total sum
127  real s = 0.0;
128 
129  for (size_t i = 1; i < X.size(); ++i) {
130 
131  const real tmp = avg;
132 
133  avg = tmp + (X[i] - tmp) / (i + 1);
134  s += (X[i] - tmp) * (X[i] - avg);
135  }
136 
137  return s;
138  }
139 
140 
150  template<typename Dataset>
151  inline real variance(const Dataset& X, unsigned int constraints = 1) {
152 
153  if(X.size() <= constraints) {
154  TH_MATH_ERROR("variance", X.size(), INVALID_ARGUMENT);
155  return nan();
156  }
157 
158  return total_sum_squares(X) / (X.size() - constraints);
159  }
160 
161 
171  template<typename Dataset>
172  inline void moments2(
173  const Dataset& X, real& out_mean,
174  real& out_variance, unsigned int constraints = 1) {
175 
176  if(X.size() <= constraints) {
177  TH_MATH_ERROR("total_sum_squares", X.size(), INVALID_ARGUMENT);
178  out_mean = nan();
179  out_variance = nan();
180  return;
181  }
182 
183  // Running average
184  real avg = X[0];
185 
186  // Total sum
187  real tss = 0.0;
188 
189  for (size_t i = 1; i < X.size(); ++i) {
190 
191  const real tmp = avg;
192 
193  avg = tmp + (X[i] - tmp) / (i + 1);
194  tss += (X[i] - tmp) * (X[i] - avg);
195  }
196 
197  out_mean = avg;
198  out_variance = tss / (X.size() - constraints);
199  }
200 
201 
211  template<typename Dataset>
212  inline real stdev(const Dataset& data, unsigned int constraints = 1) {
213  return sqrt(variance(data, constraints));
214  }
215 
216 
223  template<typename Dataset>
224  inline real stdom(const Dataset& X) {
225  return sqrt(variance(X) / X.size());
226  }
227 
228 
238  template<typename Dataset>
239  inline real standard_relative_error(const Dataset& X) {
240 
241  real x_mean = mean(X);
242 
243  if(abs(x_mean) < MACH_EPSILON) {
244  TH_MATH_ERROR("standard_relative_error", x_mean, DIV_BY_ZERO);
245  return nan();
246  }
247 
248  return stdom(X) / abs(x_mean);
249  }
250 
251 
261  template<typename Dataset1, typename Dataset2>
262  inline real covariance(
263  const Dataset1& X, const Dataset2& Y, unsigned int constraints = 1) {
264 
265  if(X.size() != Y.size() || X.size() <= constraints) {
266  TH_MATH_ERROR("covariance", X.size(), INVALID_ARGUMENT);
267  return nan();
268  }
269 
270  real s = 0;
271  real X_mean = mean(X);
272  real Y_mean = mean(Y);
273 
274  for (unsigned int i = 0; i < X.size(); ++i)
275  s += (X[i] - X_mean) * (Y[i] - Y_mean);
276 
277  return s / (X.size() - constraints);
278  }
279 
280 
289  template<typename Dataset1, typename Dataset2>
291  const Dataset1& X, const Dataset2& Y) {
292 
293  return covariance(X, Y) / (stdev(X) * stdev(Y));
294  }
295 
296 
304  template<typename Dataset>
305  inline real autocorrelation(const Dataset& X, unsigned int n = 1) {
306 
307  if(X.size() < n) {
308  TH_MATH_ERROR("autocorrelation", X.size(), INVALID_ARGUMENT);
309  return nan();
310  }
311 
312  const real mu = mean(X);
313  real num = 0;
314  real den = square(X[0] - mu);
315 
316  for (unsigned int i = n; i < X.size(); ++i) {
317 
318  const real delta = X[i] - mu;
319  num += delta * (X[i - n] - mu);
320  den += delta * delta;
321  }
322 
323  return num / den;
324  }
325 
326 
333  template<typename Dataset>
334  inline real absolute_deviation(const Dataset& X) {
335 
336  real mu = mean(X);
337  real res = 0;
338 
339  for (real x : X)
340  res += abs(x - mu);
341 
342  return res / X.size();
343  }
344 
345 
352  template<typename Dataset>
353  inline real skewness(const Dataset& X) {
354 
355  real mu, sigma;
356  real res = 0;
357 
358  moments2(X, mu, sigma);
359  sigma = sqrt(sigma);
360 
361  for (real x : X)
362  res += cube((x - mu) / sigma);
363 
364  return res / X.size();
365  }
366 
367 
374  template<typename Dataset>
375  inline real kurtosis(const Dataset& X) {
376 
377  real mu, sigma;
378  real res = 0;
379 
380  moments2(X, mu, sigma);
381  sigma = sqrt(sigma);
382 
383  for (real x : X)
384  res += pow((x - mu) / sigma, 4);
385 
386  return (res / X.size()) - 3;
387  }
388 
389 
400  template<typename RealFunction>
401  inline real gaussian_expectation(RealFunction g, real mean, real sigma) {
402 
403  return integral_hermite(
404  [=](real x) {
405  return g(SQRT2 * sigma * x + mean);
406  }
407  ) / SQRTPI;
408  }
409 
410 
419  inline real z_score(real x, real mean, real sigma) {
420 
421  return (x - mean) / sigma;
422  }
423 
424 
430  template<typename Dataset>
431  inline Dataset normalize_z_score(const Dataset& X) {
432 
433  real mu, sigma;
434  moments2(X, mu, sigma);
435  sigma = sqrt(sigma);
436 
437  return map([mu, sigma](real x) { return z_score(x, mu, sigma); }, X);
438  }
439 
440 
452  template<typename Dataset1, typename Dataset2, typename Dataset3>
453  inline real chi_square(
454  const Dataset1& O, const Dataset2& E, const Dataset3& sigma) {
455 
456  if(O.size() != E.size() || E.size() != sigma.size()) {
457  TH_MATH_ERROR("chi_square", E.size(), INVALID_ARGUMENT);
458  return nan();
459  }
460 
461  real c_sqr = 0;
462 
463  for (unsigned int i = 0; i < O.size(); ++i) {
464 
465  if(abs(sigma[i]) < MACH_EPSILON) {
466  TH_MATH_ERROR("chi_square", sigma[i], DIV_BY_ZERO);
467  return nan();
468  }
469 
470  c_sqr += square((O[i] - E[i]) / sigma[i]);
471  }
472 
473  return c_sqr;
474  }
475 
476 
489  inline real pvalue_chi_squared(real chi_sqr, unsigned int ndf) {
490 
491  if(ndf == 0) {
492  TH_MATH_ERROR("pvalue_chi_squared", ndf, INVALID_ARGUMENT);
493  return nan();
494  }
495 
496  // For ndf >= 260 use the Gaussian approximation
497  // as the coefficients are not stable
498  if(ndf >= 260) {
499 
500  const real new_x = (chi_sqr - ndf) / sqrt(2.0 * ndf);
501 
502  // For really low Chi-squared the Gaussian is
503  // below tolerance value for integration
504  if(new_x < 0) {
505 
506  if(new_x < -3)
507  return 1 - integral_inf_riemann([=](real x) {
508  return exp(-x * x / 2) / SQRTPI / SQRT2;
509  }, -new_x, 1E-16, 25);
510 
511  return 0.5 + integral_romberg_tol([=](real x) {
512  return exp(-x * x / 2) / SQRTPI / SQRT2;
513  }, new_x, 0, 1E-16);
514  } else {
515 
516  if(new_x > 3)
517  return integral_inf_riemann([=](real x) {
518  return exp(-x * x / 2) / SQRTPI / SQRT2;
519  }, new_x, 1E-16, 25);
520 
521  return 0.5 - integral_romberg_tol([=](real x) {
522  return exp(-x * x / 2) / SQRTPI / SQRT2;
523  }, 0, new_x, 1E-16);
524  }
525  }
526 
527  // Compute the coefficient using a stable equivalent formula
528  const real coeff = exp(-special::lngamma(ndf / 2.0) - chi_sqr / 2.0);
529 
530  // Use different methods when Gauss-Laguerre is not numerically stable
531  if((ndf > 70 && chi_sqr < (ndf / 2.0))) {
532 
533  // Use equivalent formula around potential singularity
534  real res = integral_romberg_tol([=](real x) {
535  return pow(sqrt(x + chi_sqr / 2), ndf - 2) * exp(-x);
536  }, 0, 1, 1E-12);
537 
538  res += integral_inf_riemann([=](real x) {
539  return exp((ndf - 2) / 2.0 * ln(x + chi_sqr / 2) - x);
540  }, 1, ndf / 2, 1E-12, 25);
541 
542  return coeff * res;
543  }
544 
545  // Approximate the integral using Gauss-Laguerre quadrature
546  return coeff * integral_gauss(
547  [=](real x) {
548  return pow(sqrt(x + chi_sqr / 2), ndf - 2);
549  }, tables::laguerre_roots_16, tables::laguerre_weights_16, 16);
550  }
551 
552 
565  template<typename Dataset1, typename Dataset2, typename Dataset3>
567  const Dataset1& X, const Dataset2& Y,
568  const Dataset3& sigma, real intercept, real slope) {
569 
570  if(X.size() != Y.size() || X.size() != sigma.size()) {
572  "chi_square_linear",
573  X.size(), INVALID_ARGUMENT);
574  return nan();
575  }
576 
577  real chi_squared = 0;
578  for (unsigned int i = 0; i < X.size(); ++i) {
579 
580  if(abs(sigma[i]) <= MACH_EPSILON) {
581  TH_MATH_ERROR("chi_square_linear", sigma[i], DIV_BY_ZERO);
582  return nan();
583  }
584 
585  chi_squared += square((Y[i] - intercept - slope * X[i]) / sigma[i]);
586  }
587 
588  return chi_squared;
589  }
590 
591 
605  template<typename Dataset1, typename Dataset2, typename Dataset3>
607  const Dataset1& X, const Dataset2& Y,
608  const Dataset3& sigma, real intercept, real slope) {
609 
610  if(Y.size() <= 2) {
611  TH_MATH_ERROR("reduced_chi_square_linear",
612  Y.size(), INVALID_ARGUMENT);
613  return nan();
614  }
615 
616  // Divide by degrees of freedom (N - 2)
617  return chi_square_linear(X, Y, sigma, intercept, slope)
618  / (real) (Y.size() - 2);
619  }
620  }
621 }
622 
623 #endif
#define TH_MATH_ERROR(F_NAME, VALUE, EXCEPTION)
TH_MATH_ERROR is a macro which throws exceptions or modifies errno (depending on which compiling opti...
Definition: error.h:219
real chi_squared(real x, unsigned int k)
Chi-squared distribution.
Definition: distributions.h:162
real lngamma(real x)
Log Gamma special function of real argument.
Definition: special.h:59
real pvalue_chi_squared(real chi_sqr, unsigned int ndf)
Compute the (right-tailed) p-value associated to a computed Chi-square value as the integral of the C...
Definition: statistics.h:489
void moments2(const Dataset &X, real &out_mean, real &out_variance, unsigned int constraints=1)
Compute the mean and the variance of a dataset in a single pass, using Welford's method,...
Definition: statistics.h:172
real semidispersion(const Dataset &X)
Computes the maximum semidispersion of a data set defined as .
Definition: statistics.h:54
real stdev(const histogram &h)
Compute the standard deviation of the values of a histogram.
Definition: histogram.h:320
Dataset normalize_z_score(const Dataset &X)
Normalize a data set using Z-score normalization.
Definition: statistics.h:431
real autocorrelation(const Dataset &X, unsigned int n=1)
Compute the lag-n autocorrelation of a dataset as .
Definition: statistics.h:305
real covariance(const Dataset1 &X, const Dataset2 &Y, unsigned int constraints=1)
Compute the covariance between two datasets with the given number of constraints.
Definition: statistics.h:262
real chi_square_linear(const Dataset1 &X, const Dataset2 &Y, const Dataset3 &sigma, real intercept, real slope)
Compute the chi-square on a linear regression, as the sum of the squares of the residuals divided by ...
Definition: statistics.h:566
real propagate_product(const Dataset1 &sigma, const Dataset2 &mean)
Propagate the error over a product of random variables under quadrature, as , where each corresponds...
Definition: statistics.h:86
real standard_relative_error(const Dataset &X)
Compute the relative error on a dataset using estimates of its mean and standard deviation,...
Definition: statistics.h:239
real total_sum_squares(const Dataset &X)
Compute the total sum of squares (TSS) of a given dataset as using Welford's one-pass method.
Definition: statistics.h:116
real variance(const histogram &h)
Compute the variance of the values of a histogram.
Definition: histogram.h:308
real absolute_deviation(const Dataset &X)
Compute the mean absolute deviation of a dataset as .
Definition: statistics.h:334
real gaussian_expectation(RealFunction g, real mean, real sigma)
Compute the expectation value of a given function with respect to a Gaussian distribution with the gi...
Definition: statistics.h:401
real stdom(const Dataset &X)
Compute the standard deviation of the mean given a dataset.
Definition: statistics.h:224
real propagate_sum(const Dataset &sigma)
Propagate the error over a sum of random variables under quadrature, as , where each corresponds to ...
Definition: statistics.h:68
real range(const Dataset &X)
Computes the range of a data set, defined as .
Definition: statistics.h:41
real mean(const histogram &h)
Compute the mean of the values of a histogram.
Definition: histogram.h:296
real skewness(const Dataset &X)
Compute the skewness of a dataset as .
Definition: statistics.h:353
real kurtosis(const Dataset &X)
Compute the normalized kurtosis of a dataset as .
Definition: statistics.h:375
real chi_square(const Dataset1 &O, const Dataset2 &E, const Dataset3 &sigma)
Compute the chi-square from the set of observed quantities, expected quantities and errors.
Definition: statistics.h:453
real tss(const histogram &h)
Compute the total sum of squares of the values of the histogram.
Definition: histogram.h:302
real correlation_coefficient(const Dataset1 &X, const Dataset2 &Y)
Compute Pearson's correlation coefficient R between two datasets.
Definition: statistics.h:290
real z_score(real x, real mean, real sigma)
Compute the Z-score of an observed value with respect to a Gaussian distribution with the given param...
Definition: statistics.h:419
real reduced_chi_square_linear(const Dataset1 &X, const Dataset2 &Y, const Dataset3 &sigma, real intercept, real slope)
Compute the reduced chi-squared on a linear regression, computed as the usual chi-square (computed by...
Definition: statistics.h:606
Main namespace of the library which contains all functions and objects.
Definition: algebra.h:27
double real
A real number, defined as a floating point type.
Definition: constants.h:188
real arithmetic_mean(const Dataset &data)
Compute the arithmetic mean of a set of values.
Definition: dataset.h:375
auto min(const Vector &X)
Finds the minimum value inside a dataset.
Definition: dataset.h:351
dual2 sqrt(dual2 x)
Compute the square root of a second order dual number.
Definition: dual2_functions.h:54
dual2 ln(dual2 x)
Compute the natural logarithm of a second order dual number.
Definition: dual2_functions.h:139
dual2 abs(dual2 x)
Compute the absolute value of a second order dual number.
Definition: dual2_functions.h:183
constexpr real SQRTPI
The square root of Pi.
Definition: constants.h:224
dual2 exp(dual2 x)
Compute the exponential of a second order dual number.
Definition: dual2_functions.h:130
real integral_inf_riemann(real_function f, real a, real step_sz=1, real tol=CALCULUS_INTEGRAL_TOL, unsigned int max_iter=100)
Integrate a function from a point up to infinity by integrating it by steps, stopping execution when ...
Definition: integral.h:477
auto max(const Vector &X)
Finds the maximum value inside a dataset.
Definition: dataset.h:330
real integral_gauss(RealFunction f, const std::vector< real > &x, const std::vector< real > &w)
Use Gaussian quadrature using the given points and weights.
Definition: integral.h:210
constexpr real MACH_EPSILON
Machine epsilon for the real type.
Definition: constants.h:197
constexpr real E
The Euler mathematical constant (e)
Definition: constants.h:227
real integral_romberg_tol(RealFunction f, real a, real b, real tolerance=CALCULUS_INTEGRAL_TOL)
Approximate the definite integral of an arbitrary function using Romberg's method to the given tolera...
Definition: integral.h:173
real integral_hermite(RealFunction f, const std::vector< real > &x)
Use Gauss-Hermite quadrature of arbitrary degree to approximate an integral over (-inf,...
Definition: integral.h:439
Vector2 & map(Function f, const Vector1 &src, Vector2 &dest)
Get a new vector obtained by applying the function element-wise.
Definition: dataset.h:266
constexpr real SQRT2
The square root of 2.
Definition: constants.h:251
dual2 square(dual2 x)
Return the square of a second order dual number.
Definition: dual2_functions.h:23
real nan()
Return a quiet NaN number in floating point representation.
Definition: error.h:54
auto sum_squares(const Vector &X)
Sum the squares of a set of values.
Definition: dataset.h:127
dual2 pow(dual2 x, int n)
Compute the n-th power of a second order dual number.
Definition: dual2_functions.h:41
dual2 cube(dual2 x)
Return the cube of a second order dual number.
Definition: dual2_functions.h:29