theoretica/statistics_8h_source.html

#ifndef THEORETICA_STATISTICS_H

#define THEORETICA_STATISTICS_H


#include "../core/constants.h"

#include "../core/real_analysis.h"

#include "../core/special.h"

#include "../calculus/integral.h"

#include "../calculus/gauss.h"

#include "../core/dataset.h"


namespace theoretica {


    namespace stats {


        /// Compute the mean of a dataset.
        ///
        /// This is a thin forwarder to arithmetic_mean().
        ///
        /// @param X The dataset to average
        /// @return The value computed by arithmetic_mean(X)
        template<typename Dataset>
        inline real mean(const Dataset& X) {

            const real average = arithmetic_mean(X);
            return average;
        }


        /// Compute the range of a dataset, i.e. the difference
        /// between its maximum and its minimum element.
        ///
        /// @param X The dataset to inspect
        /// @return max(X) - min(X)
        template<typename Dataset>
        inline real range(const Dataset& X) {

            const auto highest = max(X);
            const auto lowest = min(X);

            return highest - lowest;
        }


        /// Compute the maximum semidispersion of a dataset,
        /// defined here as half of its range.
        ///
        /// @param X The dataset to inspect
        /// @return range(X) / 2
        template<typename Dataset>
        inline real semidispersion(const Dataset& X) {

            const real full_spread = range(X);
            return full_spread / 2.0;
        }


        /// Propagate the error over a sum of quantities in quadrature,
        /// as the square root of the sum of the squared errors.
        ///
        /// @param sigma The errors of the individual terms
        /// @return sqrt(sum_squares(sigma))
        template<typename Dataset>
        inline real propagate_sum(const Dataset& sigma) {

            const auto squared_total = sum_squares(sigma);
            return sqrt(squared_total);
        }


        /// Propagate the error over a product of quantities in quadrature,
        /// as the square root of the sum of the squared ratios
        /// sigma[i] / |mean[i]|.
        ///
        /// @param sigma The errors of the individual factors
        /// @param mean The values of the individual factors
        /// @return The propagated quantity, or NaN when the two datasets
        /// differ in size (INVALID_ARGUMENT) or an element of `mean`
        /// is exactly zero (DIV_BY_ZERO); both cases are reported
        /// through TH_MATH_ERROR.
        template<typename Dataset1, typename Dataset2>
        inline real propagate_product(const Dataset1& sigma, const Dataset2& mean) {

            // Every error needs a matching central value
            if(sigma.size() != mean.size()) {
                TH_MATH_ERROR("propagate_product", sigma.size(), INVALID_ARGUMENT);
                return nan();
            }

            // Accumulate the squared ratios sigma[k] / |mean[k]|
            real quad_sum = 0;

            for (unsigned int k = 0; k < sigma.size(); ++k) {

                // The ratio is undefined for a vanishing central value
                if(mean[k] == 0) {
                    TH_MATH_ERROR("propagate_product", mean[k], DIV_BY_ZERO);
                    return nan();
                }

                const auto ratio = sigma[k] / abs(mean[k]);
                quad_sum += square(ratio);
            }

            return sqrt(quad_sum);
        }


        /// Compute the total sum of squares (TSS) of a dataset, i.e. the
        /// sum of the squared deviations from the mean, in a single pass
        /// by updating a running mean alongside the running sum.
        ///
        /// @param X The dataset to process
        /// @return The total sum of squares, or NaN (after reporting
        /// INVALID_ARGUMENT through TH_MATH_ERROR) for an empty dataset
        template<typename Dataset>
        inline real total_sum_squares(const Dataset& X) {

            if(X.size() == 0) {
                TH_MATH_ERROR("total_sum_squares", X.size(), INVALID_ARGUMENT);
                return nan();
            }

            // Mean of the elements visited so far
            real running_mean = X[0];

            // Sum of squared deviations accumulated so far
            real accum = 0.0;

            for (size_t k = 1; k < X.size(); ++k) {

                const real previous_mean = running_mean;

                // Deviation of the new element from the previous mean
                const auto delta = X[k] - previous_mean;

                running_mean = previous_mean + delta / (k + 1);

                // Mix the deviations from the old and the updated mean
                accum += delta * (X[k] - running_mean);
            }

            return accum;
        }


        /// Compute the variance of a dataset as its total sum of squares
        /// divided by the number of degrees of freedom (N - constraints).
        ///
        /// @param X The dataset to process
        /// @param constraints The number of constraints subtracted from
        /// the dataset size (defaults to 1)
        /// @return The variance, or NaN (after reporting INVALID_ARGUMENT
        /// through TH_MATH_ERROR) when X.size() <= constraints
        template<typename Dataset>
        inline real variance(const Dataset& X, unsigned int constraints = 1) {

            // There must be at least one degree of freedom left
            if(X.size() <= constraints) {
                TH_MATH_ERROR("variance", X.size(), INVALID_ARGUMENT);
                return nan();
            }

            const real tss = total_sum_squares(X);
            const auto degrees_of_freedom = X.size() - constraints;

            return tss / degrees_of_freedom;
        }


        /// Compute the mean and the variance of a dataset in a single pass,
        /// updating a running mean and a running sum of squared deviations.
        ///
        /// @param X The dataset to process
        /// @param out_mean Overwritten with the mean of the dataset
        /// @param out_variance Overwritten with the total sum of squares
        /// divided by (N - constraints)
        /// @param constraints The number of constraints subtracted from
        /// the dataset size (defaults to 1)
        ///
        /// When X.size() <= constraints, INVALID_ARGUMENT is reported
        /// through TH_MATH_ERROR and both outputs are set to NaN.
        template<typename Dataset>
        inline void moments2(
            const Dataset& X, real& out_mean,
            real& out_variance, unsigned int constraints = 1) {

            if(X.size() <= constraints) {

                // Report the error under this function's own name
                TH_MATH_ERROR("moments2", X.size(), INVALID_ARGUMENT);
                out_mean = nan();
                out_variance = nan();
                return;
            }

            // Running average
            real avg = X[0];

            // Total sum
            real tss = 0.0;

            for (size_t i = 1; i < X.size(); ++i) {

                const real tmp = avg;

                // Update the mean, then accumulate the product of the
                // deviations from the previous and the updated mean
                avg = tmp + (X[i] - tmp) / (i + 1);
                tss += (X[i] - tmp) * (X[i] - avg);
            }

            out_mean = avg;
            out_variance = tss / (X.size() - constraints);
        }


        /// Compute the standard deviation of a dataset as the
        /// square root of its variance.
        ///
        /// @param data The dataset to process
        /// @param constraints The number of constraints forwarded
        /// to variance() (defaults to 1)
        /// @return sqrt(variance(data, constraints))
        template<typename Dataset>
        inline real stdev(const Dataset& data, unsigned int constraints = 1) {

            const real var = variance(data, constraints);
            return sqrt(var);
        }


        /// Compute the standard deviation of the mean of a dataset,
        /// as the square root of the variance divided by the dataset size.
        ///
        /// @param X The dataset to process
        /// @return sqrt(variance(X) / N)
        template<typename Dataset>
        inline real stdom(const Dataset& X) {

            const real variance_of_mean = variance(X) / X.size();
            return sqrt(variance_of_mean);
        }


        /// Compute the relative error on a dataset as the standard
        /// deviation of the mean divided by the absolute value of the mean.
        ///
        /// @param X The dataset to process
        /// @return stdom(X) / |mean(X)|, or NaN (after reporting
        /// DIV_BY_ZERO through TH_MATH_ERROR) when the absolute value
        /// of the mean is below MACH_EPSILON
        template<typename Dataset>
        inline real standard_relative_error(const Dataset& X) {

            real center = mean(X);

            // The ratio is meaningless for a vanishing mean
            if(abs(center) < MACH_EPSILON) {
                TH_MATH_ERROR("standard_relative_error", center, DIV_BY_ZERO);
                return nan();
            }

            const real mean_error = stdom(X);
            return mean_error / abs(center);
        }


        /// Compute the covariance between two datasets as the sum of the
        /// products of the deviations from the respective means,
        /// divided by (N - constraints).
        ///
        /// @param X The first dataset
        /// @param Y The second dataset
        /// @param constraints The number of constraints subtracted from
        /// the dataset size (defaults to 1)
        /// @return The covariance, or NaN (after reporting INVALID_ARGUMENT
        /// through TH_MATH_ERROR) when the sizes differ or
        /// X.size() <= constraints
        template<typename Dataset1, typename Dataset2>
        inline real covariance(
            const Dataset1& X, const Dataset2& Y, unsigned int constraints = 1) {

            const bool size_mismatch = (X.size() != Y.size());

            if(size_mismatch || X.size() <= constraints) {
                TH_MATH_ERROR("covariance", X.size(), INVALID_ARGUMENT);
                return nan();
            }

            const real center_x = mean(X);
            const real center_y = mean(Y);

            // Sum of the products of the paired deviations
            real cross_sum = 0;

            for (unsigned int k = 0; k < X.size(); ++k) {
                cross_sum += (X[k] - center_x) * (Y[k] - center_y);
            }

            return cross_sum / (X.size() - constraints);
        }


        /// Compute the correlation coefficient between two datasets as
        /// their covariance divided by the product of their
        /// standard deviations.
        ///
        /// @param X The first dataset
        /// @param Y The second dataset
        /// @return covariance(X, Y) / (stdev(X) * stdev(Y))
        template<typename Dataset1, typename Dataset2>
        inline real correlation_coefficient(
            const Dataset1& X, const Dataset2& Y) {

            const real cov_xy = covariance(X, Y);
            const real spread_x = stdev(X);
            const real spread_y = stdev(Y);

            return cov_xy / (spread_x * spread_y);
        }


        /// Compute the lag-n autocorrelation of a dataset as the sum of
        /// the products (x_i - mu)(x_{i-n} - mu) for i >= n, divided by
        /// the sum of the squared deviations (x_i - mu)^2 over the
        /// whole dataset, where mu is the mean of the dataset.
        ///
        /// @param X The dataset to process
        /// @param n The lag (defaults to 1)
        /// @return The lag-n autocorrelation, or NaN (after reporting
        /// INVALID_ARGUMENT through TH_MATH_ERROR) when the dataset
        /// is empty or has fewer than n elements
        ///
        /// @note A dataset whose elements are all equal has a vanishing
        /// denominator, so the returned ratio is 0 / 0.
        template<typename Dataset>
        inline real autocorrelation(const Dataset& X, unsigned int n = 1) {

            // The explicit emptiness check covers n == 0, for which
            // X.size() < n alone would let an empty dataset through
            if(X.size() == 0 || X.size() < n) {
                TH_MATH_ERROR("autocorrelation", X.size(), INVALID_ARGUMENT);
                return nan();
            }

            const real mu = mean(X);
            real num = 0;
            real den = 0;

            for (size_t i = 0; i < X.size(); ++i) {

                const real delta = X[i] - mu;

                // The normalization runs over every element,
                // regardless of the lag
                den += delta * delta;

                // Lagged products only exist from index n onwards
                if(i >= n)
                    num += delta * (X[i - n] - mu);
            }

            return num / den;
        }


        /// Compute the mean absolute deviation of a dataset, as the
        /// average of the absolute deviations from the mean.
        ///
        /// @param X The dataset to process
        /// @return The sum of |x - mean(X)| divided by the dataset size
        template<typename Dataset>
        inline real absolute_deviation(const Dataset& X) {

            const real center = mean(X);

            // Sum of the absolute deviations from the mean
            real total = 0;

            for (const real value : X) {
                total += abs(value - center);
            }

            return total / X.size();
        }


        /// Compute the skewness of a dataset as the average of the
        /// cubed standardized deviations ((x - mu) / sigma)^3, with
        /// mu and sigma^2 estimated by moments2().
        ///
        /// @param X The dataset to process
        /// @return The sum of the cubed standardized deviations
        /// divided by the dataset size
        template<typename Dataset>
        inline real skewness(const Dataset& X) {

            real center, var;
            moments2(X, center, var);

            // Standard deviation from the estimated variance
            const real spread = sqrt(var);

            real total = 0;

            for (const real value : X) {
                total += cube((value - center) / spread);
            }

            return total / X.size();
        }


        /// Compute the normalized kurtosis of a dataset as the average
        /// of the standardized deviations ((x - mu) / sigma) raised to
        /// the fourth power, minus 3, with mu and sigma^2 estimated
        /// by moments2().
        ///
        /// @param X The dataset to process
        /// @return The average fourth power of the standardized
        /// deviations, minus 3
        template<typename Dataset>
        inline real kurtosis(const Dataset& X) {

            real center, var;
            moments2(X, center, var);

            // Standard deviation from the estimated variance
            const real spread = sqrt(var);

            real total = 0;

            for (const real value : X) {
                total += pow((value - center) / spread, 4);
            }

            const real fourth_moment = total / X.size();
            return fourth_moment - 3;
        }


        /// Compute the expectation value of a function with respect to
        /// a Gaussian distribution with the given parameters.
        ///
        /// The function is evaluated at sqrt(2) * sigma * x + mean, the
        /// result is integrated with integral_hermite() and then
        /// divided by sqrt(pi).
        ///
        /// @param g The function whose expectation is computed
        /// @param mean The mean of the Gaussian distribution
        /// @param sigma The standard deviation of the Gaussian distribution
        /// @return The Gaussian expectation value of g
        template<typename RealFunction>
        inline real gaussian_expectation(RealFunction g, real mean, real sigma) {

            // g composed with the change of variable to the standard form
            const auto rescaled = [=](real x) {
                return g(SQRT2 * sigma * x + mean);
            };

            return integral_hermite(rescaled) / SQRTPI;
        }


        /// Compute the Z-score of a value, i.e. its deviation from
        /// the mean in units of sigma.
        ///
        /// @param x The observed value
        /// @param mean The mean of the distribution
        /// @param sigma The standard deviation of the distribution
        /// @return (x - mean) / sigma
        inline real z_score(real x, real mean, real sigma) {

            const real deviation = x - mean;
            return deviation / sigma;
        }


        template<typename Dataset>


        inline Dataset normalize_z_score(const Dataset& X) {


            real mu, sigma;

            moments2(X, mu, sigma);

            sigma = sqrt(sigma);


            return map([mu, sigma](real x) { return z_score(x, mu, sigma); }, X);

        }


        /// Compute the chi-square from observed quantities, expected
        /// quantities and errors, as the sum of the squared ratios
        /// (O[i] - E[i]) / sigma[i].
        ///
        /// @param O The observed quantities
        /// @param E The expected quantities
        /// @param sigma The errors on the observations
        /// @return The chi-square, or NaN when the three datasets differ
        /// in size (INVALID_ARGUMENT) or the absolute value of an error
        /// is below MACH_EPSILON (DIV_BY_ZERO); both cases are reported
        /// through TH_MATH_ERROR.
        template<typename Dataset1, typename Dataset2, typename Dataset3>
        inline real chi_square(
            const Dataset1& O, const Dataset2& E, const Dataset3& sigma) {

            const bool same_length =
                (O.size() == E.size()) && (E.size() == sigma.size());

            if(!same_length) {
                TH_MATH_ERROR("chi_square", E.size(), INVALID_ARGUMENT);
                return nan();
            }

            real total = 0;

            for (unsigned int k = 0; k < O.size(); ++k) {

                // A vanishing error would make the ratio blow up
                if(abs(sigma[k]) < MACH_EPSILON) {
                    TH_MATH_ERROR("chi_square", sigma[k], DIV_BY_ZERO);
                    return nan();
                }

                const auto pull = (O[k] - E[k]) / sigma[k];
                total += square(pull);
            }

            return total;
        }


        /// Compute the (right-tailed) p-value associated to a Chi-square
        /// value with the given number of degrees of freedom.
        ///
        /// Three regimes are used:
        /// - ndf >= 260: the Chi-square variable is standardized as
        ///   (chi_sqr - ndf) / sqrt(2 ndf) and the tail of a standard
        ///   Gaussian is integrated instead;
        /// - ndf > 70 and chi_sqr < ndf / 2: the integral is split into
        ///   [0, 1] (Romberg) and [1, +inf) (stepwise integration);
        /// - otherwise: 16-point Gauss-Laguerre quadrature.
        ///
        /// @param chi_sqr The computed Chi-square value
        /// @param ndf The number of degrees of freedom
        /// @return The p-value, or NaN (after reporting INVALID_ARGUMENT
        /// through TH_MATH_ERROR) when ndf is zero
        inline real pvalue_chi_squared(real chi_sqr, unsigned int ndf) {

            if(ndf == 0) {
                TH_MATH_ERROR("pvalue_chi_squared", ndf, INVALID_ARGUMENT);
                return nan();
            }

            // For ndf >= 260 use the Gaussian approximation
            // as the coefficients are not stable
            if(ndf >= 260) {

                // Probability density of the standard Gaussian
                const auto std_gaussian = [](real x) {
                    return exp(-x * x / 2) / SQRTPI / SQRT2;
                };

                const real new_x = (chi_sqr - ndf) / sqrt(2.0 * ndf);

                // integral_inf_riemann takes (f, a, step_sz, tol, max_iter):
                // the step size is passed explicitly so that 1E-16 is the
                // tolerance and 25 the iteration cap, instead of being
                // bound to step_sz and tol respectively.

                // For really low Chi-squared the Gaussian is
                // below tolerance value for integration
                if(new_x < 0) {

                    // Far left: complement of the (symmetric) right tail
                    if(new_x < -3)
                        return 1 - integral_inf_riemann(
                            std_gaussian, -new_x, 1, 1E-16, 25);

                    // Half of the distribution plus the area in [new_x, 0]
                    return 0.5 + integral_romberg_tol(
                        std_gaussian, new_x, 0, 1E-16);

                } else {

                    // Far right: integrate the tail directly
                    if(new_x > 3)
                        return integral_inf_riemann(
                            std_gaussian, new_x, 1, 1E-16, 25);

                    // Half of the distribution minus the area in [0, new_x]
                    return 0.5 - integral_romberg_tol(
                        std_gaussian, 0, new_x, 1E-16);
                }
            }

            // Compute the coefficient using a stable equivalent formula
            const real coeff = exp(-special::lngamma(ndf / 2.0) - chi_sqr / 2.0);

            // NOTE(review): ndf is unsigned, so `ndf - 2` wraps around for
            // ndf == 1 before reaching pow(); confirm that the selected
            // pow overload turns it into the intended exponent of -1.

            // Use different methods when Gauss-Laguerre is not numerically stable
            if((ndf > 70 && chi_sqr < (ndf / 2.0))) {

                // Use equivalent formula around potential singularity
                real res = integral_romberg_tol([=](real x) {
                    return pow(sqrt(x + chi_sqr / 2), ndf - 2) * exp(-x);
                }, 0, 1, 1E-12);

                // Same integrand written through a logarithm, integrated
                // from 1 onwards in steps of ndf / 2 (integer division)
                res += integral_inf_riemann([=](real x) {
                    return exp((ndf - 2) / 2.0 * ln(x + chi_sqr / 2) - x);
                }, 1, ndf / 2, 1E-12, 25);

                return coeff * res;
            }

            // Approximate the integral using Gauss-Laguerre quadrature
            return coeff * integral_gauss(
                [=](real x) {
                    return pow(sqrt(x + chi_sqr / 2), ndf - 2);
            }, tables::laguerre_roots_16, tables::laguerre_weights_16, 16);
        }


        /// Compute the chi-square on a linear regression, as the sum of
        /// the squared residuals (Y[i] - intercept - slope * X[i])
        /// divided by the corresponding errors sigma[i].
        ///
        /// @param X The independent variable
        /// @param Y The dependent variable
        /// @param sigma The errors on the dependent variable
        /// @param intercept The intercept of the fitted line
        /// @param slope The slope of the fitted line
        /// @return The chi-square, or NaN when the three datasets differ
        /// in size (INVALID_ARGUMENT) or the absolute value of an error
        /// does not exceed MACH_EPSILON (DIV_BY_ZERO); both cases are
        /// reported through TH_MATH_ERROR.
        template<typename Dataset1, typename Dataset2, typename Dataset3>
        inline real chi_square_linear(
            const Dataset1& X, const Dataset2& Y,
            const Dataset3& sigma, real intercept, real slope) {

            const bool same_length =
                (X.size() == Y.size()) && (X.size() == sigma.size());

            if(!same_length) {
                TH_MATH_ERROR(
                    "chi_square_linear",
                    X.size(), INVALID_ARGUMENT);
                return nan();
            }

            real total = 0;

            for (unsigned int k = 0; k < X.size(); ++k) {

                // A vanishing error would make the ratio blow up
                if(abs(sigma[k]) <= MACH_EPSILON) {
                    TH_MATH_ERROR("chi_square_linear", sigma[k], DIV_BY_ZERO);
                    return nan();
                }

                // Distance of the point from the line
                const auto residual = Y[k] - intercept - slope * X[k];

                total += square(residual / sigma[k]);
            }

            return total;
        }


        /// Compute the reduced chi-square on a linear regression, as the
        /// value of chi_square_linear() divided by the number of
        /// degrees of freedom (N - 2).
        ///
        /// @param X The independent variable
        /// @param Y The dependent variable
        /// @param sigma The errors on the dependent variable
        /// @param intercept The intercept of the fitted line
        /// @param slope The slope of the fitted line
        /// @return The reduced chi-square, or NaN (after reporting
        /// INVALID_ARGUMENT through TH_MATH_ERROR) when Y has
        /// two elements or fewer
        template<typename Dataset1, typename Dataset2, typename Dataset3>
        inline real reduced_chi_square_linear(
            const Dataset1& X, const Dataset2& Y,
            const Dataset3& sigma, real intercept, real slope) {

            // With two points or fewer no degree of freedom is left
            if(Y.size() <= 2) {
                TH_MATH_ERROR("reduced_chi_square_linear",
                    Y.size(), INVALID_ARGUMENT);
                return nan();
            }

            const real chi_sqr = chi_square_linear(X, Y, sigma, intercept, slope);

            // Degrees of freedom (N - 2)
            const real degrees_of_freedom = static_cast<real>(Y.size() - 2);

            return chi_sqr / degrees_of_freedom;
        }


    }

}


#endif

TH_MATH_ERROR
#define TH_MATH_ERROR(F_NAME, VALUE, EXCEPTION)
TH_MATH_ERROR is a macro which throws exceptions or modifies errno (depending on which compiling opti...
Definition error.h:225

theoretica::special::lngamma
real lngamma(real x)
Log Gamma special function of real argument.
Definition special.h:59

theoretica::stats::pvalue_chi_squared
real pvalue_chi_squared(real chi_sqr, unsigned int ndf)
Compute the (right-tailed) p-value associated to a computed Chi-square value as the integral of the C...
Definition statistics.h:489

theoretica::stats::moments2
void moments2(const Dataset &X, real &out_mean, real &out_variance, unsigned int constraints=1)
Compute the mean and the variance of a dataset in a single pass, using Welford's method,...
Definition statistics.h:172

theoretica::stats::semidispersion
real semidispersion(const Dataset &X)
Computes the maximum semidispersion of a data set, defined as $(x_{max} - x_{min}) / 2$.
Definition statistics.h:54

theoretica::stats::stdev
real stdev(const histogram &h)
Compute the standard deviation of the values of a histogram.
Definition histogram.h:320

theoretica::stats::normalize_z_score
Dataset normalize_z_score(const Dataset &X)
Normalize a data set using Z-score normalization.
Definition statistics.h:431

theoretica::stats::autocorrelation
real autocorrelation(const Dataset &X, unsigned int n=1)
Compute the lag-n autocorrelation of a dataset as $\frac{\sum_{i \geq n} (x_i - \hat{\mu})(x_{i-n} - \hat{\mu})}{\sum_i (x_i - \hat{\mu})^2}$.
Definition statistics.h:305

theoretica::stats::covariance
real covariance(const Dataset1 &X, const Dataset2 &Y, unsigned int constraints=1)
Compute the covariance between two datasets with the given number of constraints.
Definition statistics.h:262

theoretica::stats::chi_square_linear
real chi_square_linear(const Dataset1 &X, const Dataset2 &Y, const Dataset3 &sigma, real intercept, real slope)
Compute the chi-square on a linear regression, as the sum of the squares of the residuals divided by ...
Definition statistics.h:566

theoretica::stats::propagate_product
real propagate_product(const Dataset1 &sigma, const Dataset2 &mean)
Propagate the error over a product of random variables under quadrature, as $\sqrt{\sum_i (\sigma_i / \mu_i)^2}$, where each $\sigma_i$ corresponds...
Definition statistics.h:86

theoretica::stats::standard_relative_error
real standard_relative_error(const Dataset &X)
Compute the relative error on a dataset using estimates of its mean and standard deviation,...
Definition statistics.h:239

theoretica::stats::total_sum_squares
real total_sum_squares(const Dataset &X)
Compute the total sum of squares (TSS) of a given dataset as $\sum_i (x_i - \hat{\mu})^2$ using Welford's one-pass method.
Definition statistics.h:116

theoretica::stats::variance
real variance(const histogram &h)
Compute the variance of the values of a histogram.
Definition histogram.h:308

theoretica::stats::absolute_deviation
real absolute_deviation(const Dataset &X)
Compute the mean absolute deviation of a dataset as $\frac{1}{N} \sum_i |x_i - \hat{\mu}|$.
Definition statistics.h:334

theoretica::stats::gaussian_expectation
real gaussian_expectation(RealFunction g, real mean, real sigma)
Compute the expectation value of a given function with respect to a Gaussian distribution with the gi...
Definition statistics.h:401

theoretica::stats::stdom
real stdom(const Dataset &X)
Compute the standard deviation of the mean given a dataset.
Definition statistics.h:224

theoretica::stats::propagate_sum
real propagate_sum(const Dataset &sigma)
Propagate the error over a sum of random variables under quadrature, as $\sqrt{\sum_i \sigma_i^2}$, where each $\sigma_i$ corresponds to ...
Definition statistics.h:68

theoretica::stats::range
real range(const Dataset &X)
Computes the range of a data set, defined as $x_{max} - x_{min}$.
Definition statistics.h:41

theoretica::stats::mean
real mean(const histogram &h)
Compute the mean of the values of a histogram.
Definition histogram.h:296

theoretica::stats::skewness
real skewness(const Dataset &X)
Compute the skewness of a dataset as $\frac{1}{N} \sum_i \left( \frac{x_i - \hat{\mu}}{\hat{\sigma}} \right)^3$.
Definition statistics.h:353

theoretica::stats::kurtosis
real kurtosis(const Dataset &X)
Compute the normalized kurtosis of a dataset as $\frac{1}{N} \sum_i \left( \frac{x_i - \hat{\mu}}{\hat{\sigma}} \right)^4 - 3$.
Definition statistics.h:375

theoretica::stats::chi_square
real chi_square(const Dataset1 &O, const Dataset2 &E, const Dataset3 &sigma)
Compute the chi-square from the set of observed quantities, expected quantities and errors.
Definition statistics.h:453

theoretica::stats::tss
real tss(const histogram &h)
Compute the total sum of squares of the values of the histogram.
Definition histogram.h:302

theoretica::stats::correlation_coefficient
real correlation_coefficient(const Dataset1 &X, const Dataset2 &Y)
Compute Pearson's correlation coefficient R between two datasets.
Definition statistics.h:290

theoretica::stats::z_score
real z_score(real x, real mean, real sigma)
Compute the Z-score of an observed value with respect to a Gaussian distribution with the given param...
Definition statistics.h:419

theoretica::stats::reduced_chi_square_linear
real reduced_chi_square_linear(const Dataset1 &X, const Dataset2 &Y, const Dataset3 &sigma, real intercept, real slope)
Compute the reduced chi-squared on a linear regression, computed as the usual chi-square (computed by...
Definition statistics.h:606

theoretica
Main namespace of the library which contains all functions and objects.
Definition algebra.h:27

theoretica::real
double real
A real number, defined as a floating point type.
Definition constants.h:198

theoretica::arithmetic_mean
real arithmetic_mean(const Dataset &data)
Compute the arithmetic mean of a set of values.
Definition dataset.h:375

theoretica::min
auto min(const Vector &X)
Finds the minimum value inside a dataset.
Definition dataset.h:351

theoretica::sqrt
dual2 sqrt(dual2 x)
Compute the square root of a second order dual number.
Definition dual2_functions.h:54

theoretica::map
Vector2 & map(Function f, const Vector1 &src, Vector2 &dest)
Get a new vector obtained by applying the function element-wise.
Definition dataset.h:266

theoretica::ln
dual2 ln(dual2 x)
Compute the natural logarithm of a second order dual number.
Definition dual2_functions.h:151

theoretica::abs
dual2 abs(dual2 x)
Compute the absolute value of a second order dual number.
Definition dual2_functions.h:198

theoretica::SQRTPI
constexpr real SQRTPI
The square root of Pi.
Definition constants.h:234

theoretica::exp
dual2 exp(dual2 x)
Compute the exponential of a second order dual number.
Definition dual2_functions.h:138

theoretica::integral_inf_riemann
real integral_inf_riemann(real_function f, real a, real step_sz=1, real tol=CALCULUS_INTEGRAL_TOL, unsigned int max_iter=100)
Integrate a function from a point up to infinity by integrating it by steps, stopping execution when ...
Definition integral.h:501

theoretica::max
auto max(const Vector &X)
Finds the maximum value inside a dataset.
Definition dataset.h:330

theoretica::nan
TH_CONSTEXPR real nan()
Return a quiet NaN number in floating point representation.
Definition error.h:54

theoretica::integral_gauss
real integral_gauss(RealFunction f, const std::vector< real > &x, const std::vector< real > &w)
Use Gaussian quadrature using the given points and weights.
Definition integral.h:234

theoretica::MACH_EPSILON
constexpr real MACH_EPSILON
Machine epsilon for the real type.
Definition constants.h:207

theoretica::E
constexpr real E
The Euler mathematical constant (e)
Definition constants.h:237

theoretica::integral_romberg_tol
real integral_romberg_tol(RealFunction f, real a, real b, real tolerance=CALCULUS_INTEGRAL_TOL)
Approximate the definite integral of an arbitrary function using Romberg's method to the given tolera...
Definition integral.h:197

theoretica::integral_hermite
real integral_hermite(RealFunction f, const std::vector< real > &x)
Use Gauss-Hermite quadrature of arbitrary degree to approximate an integral over (-inf,...
Definition integral.h:463

theoretica::SQRT2
constexpr real SQRT2
The square root of 2.
Definition constants.h:261

theoretica::square
dual2 square(dual2 x)
Return the square of a second order dual number.
Definition dual2_functions.h:23

theoretica::sum_squares
auto sum_squares(const Vector &X)
Sum the squares of a set of values.
Definition dataset.h:127

theoretica::pow
dual2 pow(dual2 x, int n)
Compute the n-th power of a second order dual number.
Definition dual2_functions.h:41

theoretica::cube
dual2 cube(dual2 x)
Return the cube of a second order dual number.
Definition dual2_functions.h:29