Theoretica
A C++ numerical and automatic mathematical library
Loading...
Searching...
No Matches
statistics.h
Go to the documentation of this file.
1
5
6#ifndef THEORETICA_STATISTICS_H
7#define THEORETICA_STATISTICS_H
8
9#include "../core/constants.h"
10#include "../core/real_analysis.h"
11#include "../core/special.h"
12#include "../calculus/integral.h"
13#include "../calculus/gauss.h"
14#include "../core/dataset.h"
15
16
17namespace theoretica {
18
19
21 namespace stats {
22
23
29 template<typename Dataset>
30 inline real mean(const Dataset& X) {
31 return arithmetic_mean(X);
32 }
33
34
40 template<typename Dataset>
41 inline real range(const Dataset& X) {
42
43 return max(X) - min(X);
44 }
45
46
53 template<typename Dataset>
54 inline real semidispersion(const Dataset& X) {
55
56 return range(X) / 2.0;
57 }
58
59
67 template<typename Dataset>
69
70 return sqrt(sum_squares(sigma));
71 }
72
73
85 template<typename Dataset1, typename Dataset2>
87
88 if(sigma.size() != mean.size()) {
89 TH_MATH_ERROR("propagate_product", sigma.size(), INVALID_ARGUMENT);
90 return nan();
91 }
92
93 // Compute sum of squares of (i_sigma / i_mean)
94 real s = 0;
95 for (unsigned int i = 0; i < sigma.size(); ++i) {
96
97 if(mean[i] == 0) {
98 TH_MATH_ERROR("propagate_product", mean[i], DIV_BY_ZERO);
99 return nan();
100 }
101
102 s += square(sigma[i] / abs(mean[i]));
103 }
104
105 return sqrt(s);
106 }
107
108
115 template<typename Dataset>
117
118 if(!X.size()) {
119 TH_MATH_ERROR("total_sum_squares", X.size(), INVALID_ARGUMENT);
120 return nan();
121 }
122
123 // Running average
124 real avg = X[0];
125
126 // Total sum
127 real s = 0.0;
128
129 for (size_t i = 1; i < X.size(); ++i) {
130
131 const real tmp = avg;
132
133 avg = tmp + (X[i] - tmp) / (i + 1);
134 s += (X[i] - tmp) * (X[i] - avg);
135 }
136
137 return s;
138 }
139
140
150 template<typename Dataset>
151 inline real variance(const Dataset& X, unsigned int constraints = 1) {
152
153 if(X.size() <= constraints) {
154 TH_MATH_ERROR("variance", X.size(), INVALID_ARGUMENT);
155 return nan();
156 }
157
158 return total_sum_squares(X) / (X.size() - constraints);
159 }
160
161
171 template<typename Dataset>
172 inline void moments2(
173 const Dataset& X, real& out_mean,
174 real& out_variance, unsigned int constraints = 1) {
175
176 if(X.size() <= constraints) {
177 TH_MATH_ERROR("total_sum_squares", X.size(), INVALID_ARGUMENT);
178 out_mean = nan();
179 out_variance = nan();
180 return;
181 }
182
183 // Running average
184 real avg = X[0];
185
186 // Total sum
187 real tss = 0.0;
188
189 for (size_t i = 1; i < X.size(); ++i) {
190
191 const real tmp = avg;
192
193 avg = tmp + (X[i] - tmp) / (i + 1);
194 tss += (X[i] - tmp) * (X[i] - avg);
195 }
196
197 out_mean = avg;
198 out_variance = tss / (X.size() - constraints);
199 }
200
201
211 template<typename Dataset>
212 inline real stdev(const Dataset& data, unsigned int constraints = 1) {
213 return sqrt(variance(data, constraints));
214 }
215
216
223 template<typename Dataset>
224 inline real stdom(const Dataset& X) {
225 return sqrt(variance(X) / X.size());
226 }
227
228
238 template<typename Dataset>
240
241 real x_mean = mean(X);
242
243 if(abs(x_mean) < MACH_EPSILON) {
244 TH_MATH_ERROR("standard_relative_error", x_mean, DIV_BY_ZERO);
245 return nan();
246 }
247
248 return stdom(X) / abs(x_mean);
249 }
250
251
261 template<typename Dataset1, typename Dataset2>
263 const Dataset1& X, const Dataset2& Y, unsigned int constraints = 1) {
264
265 if(X.size() != Y.size() || X.size() <= constraints) {
266 TH_MATH_ERROR("covariance", X.size(), INVALID_ARGUMENT);
267 return nan();
268 }
269
270 real s = 0;
271 real X_mean = mean(X);
272 real Y_mean = mean(Y);
273
274 for (unsigned int i = 0; i < X.size(); ++i)
275 s += (X[i] - X_mean) * (Y[i] - Y_mean);
276
277 return s / (X.size() - constraints);
278 }
279
280
289 template<typename Dataset1, typename Dataset2>
291 const Dataset1& X, const Dataset2& Y) {
292
293 return covariance(X, Y) / (stdev(X) * stdev(Y));
294 }
295
296
304 template<typename Dataset>
305 inline real autocorrelation(const Dataset& X, unsigned int n = 1) {
306
307 if(X.size() < n) {
308 TH_MATH_ERROR("autocorrelation", X.size(), INVALID_ARGUMENT);
309 return nan();
310 }
311
312 const real mu = mean(X);
313 real num = 0;
314 real den = square(X[0] - mu);
315
316 for (unsigned int i = n; i < X.size(); ++i) {
317
318 const real delta = X[i] - mu;
319 num += delta * (X[i - n] - mu);
320 den += delta * delta;
321 }
322
323 return num / den;
324 }
325
326
333 template<typename Dataset>
335
336 real mu = mean(X);
337 real res = 0;
338
339 for (real x : X)
340 res += abs(x - mu);
341
342 return res / X.size();
343 }
344
345
352 template<typename Dataset>
353 inline real skewness(const Dataset& X) {
354
355 real mu, sigma;
356 real res = 0;
357
358 moments2(X, mu, sigma);
359 sigma = sqrt(sigma);
360
361 for (real x : X)
362 res += cube((x - mu) / sigma);
363
364 return res / X.size();
365 }
366
367
374 template<typename Dataset>
375 inline real kurtosis(const Dataset& X) {
376
377 real mu, sigma;
378 real res = 0;
379
380 moments2(X, mu, sigma);
381 sigma = sqrt(sigma);
382
383 for (real x : X)
384 res += pow((x - mu) / sigma, 4);
385
386 return (res / X.size()) - 3;
387 }
388
389
400 template<typename RealFunction>
402
403 return integral_hermite(
404 [=](real x) {
405 return g(SQRT2 * sigma * x + mean);
406 }
407 ) / SQRTPI;
408 }
409
410
420
421 return (x - mean) / sigma;
422 }
423
424
430 template<typename Dataset>
432
433 real mu, sigma;
434 moments2(X, mu, sigma);
435 sigma = sqrt(sigma);
436
437 return map([mu, sigma](real x) { return z_score(x, mu, sigma); }, X);
438 }
439
440
452 template<typename Dataset1, typename Dataset2, typename Dataset3>
454 const Dataset1& O, const Dataset2& E, const Dataset3& sigma) {
455
456 if(O.size() != E.size() || E.size() != sigma.size()) {
457 TH_MATH_ERROR("chi_square", E.size(), INVALID_ARGUMENT);
458 return nan();
459 }
460
461 real c_sqr = 0;
462
463 for (unsigned int i = 0; i < O.size(); ++i) {
464
465 if(abs(sigma[i]) < MACH_EPSILON) {
466 TH_MATH_ERROR("chi_square", sigma[i], DIV_BY_ZERO);
467 return nan();
468 }
469
470 c_sqr += square((O[i] - E[i]) / sigma[i]);
471 }
472
473 return c_sqr;
474 }
475
476
489 inline real pvalue_chi_squared(real chi_sqr, unsigned int ndf) {
490
491 if(ndf == 0) {
492 TH_MATH_ERROR("pvalue_chi_squared", ndf, INVALID_ARGUMENT);
493 return nan();
494 }
495
496 // For ndf >= 260 use the Gaussian approximation
497 // as the coefficients are not stable
498 if(ndf >= 260) {
499
500 const real new_x = (chi_sqr - ndf) / sqrt(2.0 * ndf);
501
502 // For really low Chi-squared the Gaussian is
503 // below tolerance value for integration
504 if(new_x < 0) {
505
506 if(new_x < -3)
507 return 1 - integral_inf_riemann([=](real x) {
508 return exp(-x * x / 2) / SQRTPI / SQRT2;
509 }, -new_x, 1E-16, 25);
510
511 return 0.5 + integral_romberg_tol([=](real x) {
512 return exp(-x * x / 2) / SQRTPI / SQRT2;
513 }, new_x, 0, 1E-16);
514 } else {
515
516 if(new_x > 3)
517 return integral_inf_riemann([=](real x) {
518 return exp(-x * x / 2) / SQRTPI / SQRT2;
519 }, new_x, 1E-16, 25);
520
521 return 0.5 - integral_romberg_tol([=](real x) {
522 return exp(-x * x / 2) / SQRTPI / SQRT2;
523 }, 0, new_x, 1E-16);
524 }
525 }
526
527 // Compute the coefficient using a stable equivalent formula
528 const real coeff = exp(-special::lngamma(ndf / 2.0) - chi_sqr / 2.0);
529
530 // Use different methods when Gauss-Laguerre is not numerically stable
531 if((ndf > 70 && chi_sqr < (ndf / 2.0))) {
532
533 // Use equivalent formula around potential singularity
535 return pow(sqrt(x + chi_sqr / 2), ndf - 2) * exp(-x);
536 }, 0, 1, 1E-12);
537
538 res += integral_inf_riemann([=](real x) {
539 return exp((ndf - 2) / 2.0 * ln(x + chi_sqr / 2) - x);
540 }, 1, ndf / 2, 1E-12, 25);
541
542 return coeff * res;
543 }
544
545 // Approximate the integral using Gauss-Laguerre quadrature
546 return coeff * integral_gauss(
547 [=](real x) {
548 return pow(sqrt(x + chi_sqr / 2), ndf - 2);
549 }, tables::laguerre_roots_16, tables::laguerre_weights_16, 16);
550 }
551
552
565 template<typename Dataset1, typename Dataset2, typename Dataset3>
567 const Dataset1& X, const Dataset2& Y,
569
570 if(X.size() != Y.size() || X.size() != sigma.size()) {
572 "chi_square_linear",
573 X.size(), INVALID_ARGUMENT);
574 return nan();
575 }
576
577 real chi_squared = 0;
578 for (unsigned int i = 0; i < X.size(); ++i) {
579
580 if(abs(sigma[i]) <= MACH_EPSILON) {
581 TH_MATH_ERROR("chi_square_linear", sigma[i], DIV_BY_ZERO);
582 return nan();
583 }
584
585 chi_squared += square((Y[i] - intercept - slope * X[i]) / sigma[i]);
586 }
587
588 return chi_squared;
589 }
590
591
605 template<typename Dataset1, typename Dataset2, typename Dataset3>
607 const Dataset1& X, const Dataset2& Y,
609
610 if(Y.size() <= 2) {
611 TH_MATH_ERROR("reduced_chi_square_linear",
612 Y.size(), INVALID_ARGUMENT);
613 return nan();
614 }
615
616 // Divide by degrees of freedom (N - 2)
618 / (real) (Y.size() - 2);
619 }
620 }
621}
622
623#endif
#define TH_MATH_ERROR(F_NAME, VALUE, EXCEPTION)
TH_MATH_ERROR is a macro which throws exceptions or modifies errno (depending on which compiling opti...
Definition error.h:219
real lngamma(real x)
Log Gamma special function of real argument.
Definition special.h:59
real pvalue_chi_squared(real chi_sqr, unsigned int ndf)
Compute the (right-tailed) p-value associated to a computed Chi-square value as the integral of the C...
Definition statistics.h:489
void moments2(const Dataset &X, real &out_mean, real &out_variance, unsigned int constraints=1)
Compute the mean and the variance of a dataset in a single pass, using Welford's method,...
Definition statistics.h:172
real semidispersion(const Dataset &X)
Computes the maximum semidispersion of a data set defined as .
Definition statistics.h:54
real stdev(const histogram &h)
Compute the standard deviation of the values of a histogram.
Definition histogram.h:320
Dataset normalize_z_score(const Dataset &X)
Normalize a data set using Z-score normalization.
Definition statistics.h:431
real autocorrelation(const Dataset &X, unsigned int n=1)
Compute the lag-n autocorrelation of a dataset as .
Definition statistics.h:305
real covariance(const Dataset1 &X, const Dataset2 &Y, unsigned int constraints=1)
Compute the covariance between two datasets with the given number of constraints.
Definition statistics.h:262
real chi_square_linear(const Dataset1 &X, const Dataset2 &Y, const Dataset3 &sigma, real intercept, real slope)
Compute the chi-square on a linear regression, as the sum of the squares of the residuals divided by ...
Definition statistics.h:566
real propagate_product(const Dataset1 &sigma, const Dataset2 &mean)
Propagate the error over a product of random variables under quadrature, as , where each corresponds...
Definition statistics.h:86
real standard_relative_error(const Dataset &X)
Compute the relative error on a dataset using estimates of its mean and standard deviation,...
Definition statistics.h:239
real total_sum_squares(const Dataset &X)
Compute the total sum of squares (TSS) of a given dataset as using Welford's one-pass method.
Definition statistics.h:116
real variance(const histogram &h)
Compute the variance of the values of a histogram.
Definition histogram.h:308
real absolute_deviation(const Dataset &X)
Compute the mean absolute deviation of a dataset as .
Definition statistics.h:334
real gaussian_expectation(RealFunction g, real mean, real sigma)
Compute the expectation value of a given function with respect to a Gaussian distribution with the gi...
Definition statistics.h:401
real stdom(const Dataset &X)
Compute the standard deviation of the mean given a dataset.
Definition statistics.h:224
real propagate_sum(const Dataset &sigma)
Propagate the error over a sum of random variables under quadrature, as , where each corresponds to ...
Definition statistics.h:68
real range(const Dataset &X)
Computes the range of a data set, defined as .
Definition statistics.h:41
real mean(const histogram &h)
Compute the mean of the values of a histogram.
Definition histogram.h:296
real skewness(const Dataset &X)
Compute the skewness of a dataset as .
Definition statistics.h:353
real kurtosis(const Dataset &X)
Compute the normalized kurtosis of a dataset as .
Definition statistics.h:375
real chi_square(const Dataset1 &O, const Dataset2 &E, const Dataset3 &sigma)
Compute the chi-square from the set of observed quantities, expected quantities and errors.
Definition statistics.h:453
real tss(const histogram &h)
Compute the total sum of squares of the values of the histogram.
Definition histogram.h:302
real correlation_coefficient(const Dataset1 &X, const Dataset2 &Y)
Compute Pearson's correlation coefficient R between two datasets.
Definition statistics.h:290
real z_score(real x, real mean, real sigma)
Compute the Z-score of an observed value with respect to a Gaussian distribution with the given param...
Definition statistics.h:419
real reduced_chi_square_linear(const Dataset1 &X, const Dataset2 &Y, const Dataset3 &sigma, real intercept, real slope)
Compute the reduced chi-squared on a linear regression, computed as the usual chi-square (computed by...
Definition statistics.h:606
Main namespace of the library which contains all functions and objects.
Definition algebra.h:27
double real
A real number, defined as a floating point type.
Definition constants.h:198
real arithmetic_mean(const Dataset &data)
Compute the arithmetic mean of a set of values.
Definition dataset.h:375
auto min(const Vector &X)
Finds the minimum value inside a dataset.
Definition dataset.h:351
dual2 sqrt(dual2 x)
Compute the square root of a second order dual number.
Definition dual2_functions.h:54
Vector2 & map(Function f, const Vector1 &src, Vector2 &dest)
Get a new vector obtained by applying the function element-wise.
Definition dataset.h:266
dual2 ln(dual2 x)
Compute the natural logarithm of a second order dual number.
Definition dual2_functions.h:151
dual2 abs(dual2 x)
Compute the absolute value of a second order dual number.
Definition dual2_functions.h:198
std::remove_reference_t< decltype(std::declval< Structure >()[0])> vector_element_t
Extract the type of a vector (or any indexable container) from its operator[].
Definition core_traits.h:134
constexpr real SQRTPI
The square root of Pi.
Definition constants.h:234
dual2 exp(dual2 x)
Compute the exponential of a second order dual number.
Definition dual2_functions.h:138
real integral_inf_riemann(real_function f, real a, real step_sz=1, real tol=CALCULUS_INTEGRAL_TOL, unsigned int max_iter=100)
Integrate a function from a point up to infinity by integrating it by steps, stopping execution when ...
Definition integral.h:501
auto max(const Vector &X)
Finds the maximum value inside a dataset.
Definition dataset.h:330
real integral_gauss(RealFunction f, const std::vector< real > &x, const std::vector< real > &w)
Use Gaussian quadrature using the given points and weights.
Definition integral.h:234
constexpr real MACH_EPSILON
Machine epsilon for the real type.
Definition constants.h:207
constexpr real E
The Euler mathematical constant (e)
Definition constants.h:237
real integral_romberg_tol(RealFunction f, real a, real b, real tolerance=CALCULUS_INTEGRAL_TOL)
Approximate the definite integral of an arbitrary function using Romberg's method to the given tolera...
Definition integral.h:197
real integral_hermite(RealFunction f, const std::vector< real > &x)
Use Gauss-Hermite quadrature of arbitrary degree to approximate an integral over (-inf,...
Definition integral.h:463
constexpr real SQRT2
The square root of 2.
Definition constants.h:261
dual2 square(dual2 x)
Return the square of a second order dual number.
Definition dual2_functions.h:23
real nan()
Return a quiet NaN number in floating point representation.
Definition error.h:54
auto sum_squares(const Vector &X)
Sum the squares of a set of values.
Definition dataset.h:127
dual2 pow(dual2 x, int n)
Compute the n-th power of a second order dual number.
Definition dual2_functions.h:41
dual2 cube(dual2 x)
Return the cube of a second order dual number.
Definition dual2_functions.h:29