// ================================================================
// Some simple statistics routines.
//
// John Kerl
// kerl.john.r@gmail.com
// 2007-06-28
//
// Ported from Python to Java 2010-02-18.
//
// ================================================================
// This software is released under the terms of the GNU GPL.
// Please see LICENSE.txt in the same directory as this file.
// ================================================================

import java.lang.Math;

public class Stats {

	// ----------------------------------------------------------------
	// Scalar mean
	public static double find_mean(double[] xs, int n) {
		double sum = 0.0;
		for (int i = 0; i < n; i++)
			sum += xs[i];
		return sum / n;
	}

	// ----------------------------------------------------------------
	// Scalar standard deviation
	public static double find_stddev(double[] xs, int n, double mean) {
		double deviant;
		double sum = 0.0;
		for (int i = 0; i < n; i++) {
			deviant = xs[i] - mean;
			sum += deviant * deviant;
		}
		return Math.sqrt(sum / (n - 1));
	}

	// ----------------------------------------------------------------
	// Standard error, i.e. standard deviation of the sample mean.
	public static double find_stderr(double[] xs, int n, double mean) {
		double s = find_stddev(xs, n, mean);
		return s / Math.sqrt(n);
	}

	// ----------------------------------------------------------------
	// Population variance
	public static double find_popvaraux(double[] xs, int n, double mean) {
		double sum = 0.0;
		double deviant;
		for (int i = 0; i < n; i++) {
			deviant = xs[i] - mean;
			sum += deviant * deviant;
		}
		return sum / n;
	}

	public static double find_popvar(double[] xs, int n) {
		return find_popvaraux(xs, n, find_mean(xs, n));
	}

	// ----------------------------------------------------------------
	// Sample variance
	public static double find_sampvaraux(double[] xs, int n, double mean) {
		double sum = 0.0;
		double deviant;
		for (int i = 0; i < n; i++) {
			deviant = xs[i] - mean;
			sum += deviant * deviant;
		}
		return sum / (n - 1);
	}

	public static double find_sampvar(double[] xs, int n) {
		return find_sampvaraux(xs, n, find_mean(xs, n));
	}

	public static double find_var_of_sample_mean(double[] xs, int n) {
		return find_sampvar(xs, n) / n;
	}

	// ----------------------------------------------------------------
	// Vector mean:  xs is a list of nx vectors, each having dim elements.
	public static void find_vector_means(
		double[][] xs,    // Input.  Dimensions nx by dim.
		int        nx,
		int        dim,
		double[]   means) // Output.  Dimension nx.
	{
		for (int i = 0; i < nx; i++)
			means[i] = find_mean(xs[i], dim);
	}

	// ----------------------------------------------------------------
	// Vector stddev:  xs is a list of nx vectors, each having dim elements.
	public static void find_vector_stddevs(
		double[][] xs,      // Input.   Dimensions nx by dim.
		int        nx,
		int        dim,
		double[]   means,   // Input.   Dimension nx.
		double[]   stddevs) // Output.  Dimension nx.
	{
		for (int i = 0; i < nx; i++)
			stddevs[i] = find_stddev(xs[i], dim, means[i]);
	}

	// ----------------------------------------------------------------
	// Sample covariance of two lists.
	public static double find_sample_covariance(
		double[] xs, double[] ys, int n)
	{
		double mean_x = find_mean(xs, n);
		double mean_y = find_mean(ys, n);
		double sum = 0.0;

		for (int i = 0; i < n; i++)
			sum += (xs[i] - mean_x) * (ys[i] - mean_y);

		return sum / (n - 1);
	}

	// ----------------------------------------------------------------
	// Sample correlation of two lists:
	// Corr(X,Y) = Cov(X,Y) / sigma_X sigma_Y.
	public static double find_sample_correlation(
		double[] xs, double[] ys, int n)
	{
		double cov = find_sample_covariance(xs, ys, n);
		double mean_x   = find_mean(xs, n);
		double mean_y   = find_mean(ys, n);
		double stddev_x = find_stddev(xs, n, mean_x);
		double stddev_y = find_stddev(ys, n, mean_y);
		return cov / (stddev_x * stddev_y);
	}

	// ----------------------------------------------------------------
	// find_sample_covariance_matrix
	// ----------------------------------------------------------------
	//
	// There are nx vectors x, each having dim elements.
	// The sample covariance matrix Q is n x n with elements
	//   q[i][j] = (1/(N-1)) sum_{k=1}^N (x[k][i] - mu[i])(x[k][j] - mu[j])
	// where k indexes the set of x's and i,j index elements within the k'th
	// vector.  I.e. the ij'th entry of Q is the (scalar) sample covariance of
	// xi and xj.  In particular, the diagonal entries are the variances of the
	// xi.
	//
	// Example:
	// N = 3 and n = 2.
	//
	// x0 = [ 1 ]  x1 = [ 2 ]  x2 = [ 3 ]
	//      [-4 ]       [-5 ]       [-6 ]
	//
	// mu = [ 2 ]
	//      [-5 ]
	//
	// q[0][0] = (x[0][0] - mu[0]) * (x[0][0] - mu[0])
	//         + (x[1][0] - mu[0]) * (x[1][0] - mu[0])
	//         + (x[2][0] - mu[0]) * (x[2][0] - mu[0])
	//
	//         = ( 1 - 2) * ( 1 - 2)
	//         + ( 2 - 2) * ( 2 - 2)
	//         + ( 3 - 2) * ( 3 - 2)
	//
	//         = 1 + 0 + 1 =  2
	//
	// q[0][1] = (x[0][0] - mu[0]) * (x[0][1] - mu[1])
	//         + (x[1][0] - mu[0]) * (x[1][1] - mu[1])
	//         + (x[2][0] - mu[0]) * (x[2][1] - mu[1])
	//
	//         = ( 1 - 2) * (-4 + 5)
	//         + ( 2 - 2) * (-5 + 5)
	//         + ( 3 - 2) * (-6 + 5)
	//
	//         =-1 + 0 - 1 = -2
	//
	// q[1][0] = (x[0][1] - mu[1]) * (x[0][0] - mu[0])
	//         + (x[1][1] - mu[1]) * (x[1][0] - mu[0])
	//         + (x[2][1] - mu[1]) * (x[2][0] - mu[0])
	//
	//         = (-4 + 5) * ( 1 - 2)
	//         + (-5 + 5) * ( 2 - 2)
	//         + (-6 + 5) * ( 3 - 2)
	//
	//         =-1 + 0 - 1 = -2
	//
	// q[1][1] = (x[0][1] - mu[1]) * (x[0][1] - mu[1])
	//         + (x[1][1] - mu[1]) * (x[1][1] - mu[1])
	//         + (x[2][1] - mu[1]) * (x[2][1] - mu[1])
	//
	//         = (-4 + 5) * (-4 + 5)
	//         + (-5 + 5) * (-5 + 5)
	//         + (-6 + 5) * (-6 + 5)
	//
	//         = 1 + 0 + 1 =  2
	//
	// Q = [ 2 -2 ] * (1/2)
	//     [-2  2 ]
	//
	//   = [ 1 -1 ]
	//     [-1  1 ]

	public static void find_sample_covariance_matrix(
		double[][] xs, // Input.   Dimensions nx by dim.
		int        nx,
		int        dim,
		double[][] Q)  // Output.  Dimensions dim by dim.
	{
		double sum;
		// Might be better passed in as workspace ...
		double means[] = new double[nx];

		find_vector_means(xs, nx, dim, means);
		for (int i = 0; i < dim; i++) {
			for (int j = 0; j < dim; j++) {
				sum = 0.0;
				for (int k = 0; k < nx; k++)
					sum += (xs[k][i] - means[i]) * (xs[k][j] - means[j]);
				Q[i][j] = sum / (nx-1.0);
			}
		}
	}

	// ----------------------------------------------------------------
	// Corr(X,Y)   = Cov(X,Y)   / sigma_X  sigma_Y
	// Corr(Xi,Xj) = Cov(Xi,Xj) / sigma_Xi sigma_Xj
	// The ijth entry of the correlation matrix is the correlation of Xi and Xj.

	public static void find_sample_correlation_matrix(
		double[][] xs, // Input.   Dimensions nx by dim.
		int        nx,
		int        dim,
		double[][] Q)  // Output.  Dimensions dim by dim.
	{
		// Might be better passed in as workspace ...
		double means[]   = new double[nx];
		// Might be better passed in as workspace ...
		double stddevs[] = new double[nx];

		find_sample_covariance_matrix(xs, nx, dim, Q);
		find_vector_means  (xs, nx, dim, means);
		find_vector_stddevs(xs, nx, dim, means, stddevs);

		for (int i = 0; i < nx; i++)
			for (int j = 0; j < nx; j++)
				Q[i][j] /= stddevs[i] * stddevs[j];
	}

	// ----------------------------------------------------------------
	public static double find_min(double[] xs, int n) {
		double lo = xs[0];
		for (int i = 1; i < n; i++)
			if (xs[i] < lo)
				lo = xs[i];
		return lo;
	}

	// ----------------------------------------------------------------
	public static double find_max(double[] xs, int n) {
		double hi = xs[0];
		for (int i = 1; i < n; i++)
			if (xs[i] > hi)
				hi = xs[i];
		return hi;
	}

	// ----------------------------------------------------------------
	// lohi[] must be an array of length 2.  The min is placed into slot 0
	// and the max is placed into slot 1.
	public static void find_bounds(double[] xs, int n, double[] lohi) {
		double lo = xs[0];
		double hi = lo;
		for (int i = 1; i < n; i++) {
			if (xs[i] < lo)
				lo = xs[i];
			if (xs[i] > hi)
				hi = xs[i];
		}
		lohi[0] = lo;
		lohi[1] = hi;
	}

	// ----------------------------------------------------------------
	public static void make_histogram(double[] xs, int n, double lo, double hi,
		double bins[], int num_bins)
	{
		// This is loop-invariant; hoist it out.
		double mul = num_bins / (hi - lo);

		for (int i = 0; i < num_bins; i++)
			bins[i] = 0.0;

		for (int i = 0; i < n; i++) {
			double element = xs[i];
			if ((element >= lo) && (element < hi)) {
				int idx = (int)((element-lo) * mul);
				bins[idx] ++;
			}
		}
	}

	// ----------------------------------------------------------------
	public static void make_histogram_labels(
		double   lo,
		double   hi,
		int      num_bins,
		boolean  do_center,
		double[] labels) // Dimension num_bins.
	{
		double delta = (hi - lo) / num_bins;
		for (int i = 0; i < num_bins; i++) {
			if (do_center)
				labels[i] = lo + (i + 0.5) *delta;
			else
				labels[i] = lo + i * delta;
		}
	}

	// ----------------------------------------------------------------
	// numk == 0 is equivalent to numk == n.
	public static void find_sample_tauint(
		double[] xs,      // Input.  Dimension n.
		int      n,
		double[] tauints, // Output.   Dimension numk.
		int      numk)
	{
		if (numk == 0)
			numk = n;
		// Re-use the tauints[] array as autocorr[] workspace.
		find_sample_autocorr(xs, n, tauints, numk);
		double sum = 1.0;
		tauints[0] = sum;
		for (int k = 1; k < numk; k++) {
			sum += 2 * tauints[k];
			tauints[k] = sum;
		}
	}

	// ----------------------------------------------------------------
	// Computing running sum of estimated integrated autocorrelation time
	// tau_int.  Stop after a flat spot has been found, detected via a turning
	// point.  Be clever about re-using previous sums.
	//
	// See my dissertation for details.  Or, Berg's _Markov Chain Monte Carlo
	// Simulations and Their Statistical Analysis_.
	//
	public static double find_sample_tauint_flat_spot(
		double[] xs, // Input.  Dimension N.
		int      N)
	{
		int winsz, winszm1;
		double x, xim, xjp, sumi, sumj, sumi2, sumj2, sumij, denom;
		double meani, meanj, stdi, stdj;
		int i, j, k, t;
		double autocorr_t, tauint;

		// Within-window sums sumi, sumi2, sumj, and sumj2 contain terms which
		// can be re-used across k.  The cross-sums sumij are different for
		// each k and must be recomputed.
		//
		// Here, compute the full-window sums.
		sumi = 0.0; sumi2 = 0.0;
		for (k = 0; k < N; k++) {
			x = xs[k];
			sumi  += x;
			sumi2 += x*x;
		}
		sumj = sumi; sumj2 = sumi2;

		tauint = 1.0;
		// Go up only to t=N-2.  The autocorr estimator for t=N-1 doesn't work
		// (only one sample), and if we haven't found a flat spot of tauint by
		// then, we aren't going to.
		for (t = 1; t < N-1; t++) {
			winsz = N - t;
			winszm1 = winsz - 1;
			denom = winszm1;
			if (winszm1 == 0)
				denom = 1;
			i = t; j = N-t-1;

			// Update the within-window sums.
			xim = xs[i-1];  xjp = xs[j+1];
			sumi -= xim; sumi2 -= xim*xim;
			sumj -= xjp; sumj2 -= xjp*xjp;

			// Compute the cross-sum.
			sumij = 0.0;
			for (k = 0; k < winsz; k++)
				sumij += xs[k] * xs[t+k];

			// Compute the autocorrelation term for this t.
			meani = sumi / winsz; meanj = sumj / winsz;
			stdi  = Math.sqrt((sumi2 - (sumi*sumi / winsz)) / denom);
			stdj  = Math.sqrt((sumj2 - (sumj*sumj / winsz)) / denom);

			autocorr_t = 0.0;
			if ((stdi != 0.0) && (stdj != 0.0))
				autocorr_t = (sumij / winsz - (meani*meanj)) / (stdi*stdj);
			tauint += 2.0 * autocorr_t;

			if (autocorr_t < 0.0)
				return tauint;
		}

		return tauint;
	}

	// ----------------------------------------------------------------
	// Autocorrelation of (X_0, ..., X_{N-1}):
	// E[X_i X_j] - mu^2
	// -----------------.
	//     sigma^2
	//
	// An efficient autocorrelation: the cross-window sums must be computed for
	// each k, because they differ for each k.  But the within-window sums may
	// be accumulated, as long as we start with k at the end and work our way
	// backward to the beginning.
	//
	// Example with N = 8:
	//
	// [ 0               ]         k = 7
	// [ o               ]
	// [               o ]
	// [               7 ]
	//
	// [ 0 1             ]         k = 6
	// [ o o             ]
	// [             o o ]
	// [             6 7 ]
	//
	// ...
	//
	// [ 0 1 2 3 4 5 6   ]         k = 1
	// [ o o o o o o o   ]
	// [   o o o o o o o ]
	// [   1 2 3 4 5 6 7 ]
	//
	// [ 0 1 2 3 4 5 6 7 ]         k = 0
	// [ o o o o o o o o ]
	// [ o o o o o o o o ]
	// [ 0 1 2 3 4 5 6 7 ]

	// ----------------------------------------------------------------
	// Pass numk=0 or numk=N to compute all possible autocorrelations.
	// Pass numk = something smaller to compute only the first numk
	// autocorrelations.
	public static void find_sample_autocorr(
		double[] xs,       // Input.  Dimension N.
		int      N,
		double[] autocorr, // Output.  Dimension numk.
		int      numk)
	{
		int i, j, k;
		int winsz, winszm1;
		double xi, xj;

		if (numk == 0)
			numk = N;
		for (k = 0; k < numk; k++)
			autocorr[k] = 0.0;

		if ((numk <= 1) || (numk > N)) {
			System.err.println("find_sample_autocorr: numk must be > 1 and <= N="
				+ N + "; got " + numk + ".");
			return;
		}

		// Sums over first and second windows
		double sumi  = 0.0; double sumi2 = 0.0;
		double sumj  = 0.0; double sumj2 = 0.0;

		// k = N-1: accumulate sums, but set autocorr to zero.  Sample sizes
		// are 1; standard deviations would get a division by zero.
		for (i = 0, j = N-1, k = N-1; k >= numk; k--, i++, j--) {
			xi     = xs[i]; xj     = xs[j];
			sumi  += xi;    sumi2 += xi*xi;
			sumj  += xj;    sumj2 += xj*xj;
		}

		// k = N-2 down to 1.
		for (k = numk - 1; k >= 1; k--, i++, j--) {
			xi     = xs[i]; xj     = xs[j];
			sumi  += xi;    sumi2 += xi*xi;
			sumj  += xj;    sumj2 += xj*xj;
			winsz   = N-k;  winszm1 = winsz - 1;
			if (k == N-1) // Leave autocorr[N-1] = 0.0 to avoid division by zero.
				continue;

			double sumij = 0.0;
			for (int ell = 0; ell < winsz; ell++)
				sumij += xs[ell] * xs[ell+k];

			double meani = sumi / winsz;
			double meanj = sumj / winsz;
			double stdi  = Math.sqrt((sumi2 - (sumi*sumi / winsz)) / winszm1);
			double stdj  = Math.sqrt((sumj2 - (sumj*sumj / winsz)) / winszm1);

			if ((stdi == 0.0) || (stdj == 0.0))
				autocorr[k] = 0.0;
			else
				autocorr[k] = (sumij / winsz - (meani*meanj)) / (stdi*stdj);
		}

		// k = 0: sumij, sumi2, and sumj2 are all the same.
		k       = 0;
		winsz   = N; winszm1 = N-1;
		xi      = xs[N-1];
		sumi   += xi;
		sumi2  += xi*xi;
		double meani   = sumi / N;
		double stdi    = Math.sqrt((sumi2 - (sumi*sumi / winsz)) / winszm1);

		autocorr[0]  = (sumi2 / N - (meani*meani)) / (stdi*stdi);
	}

	// ----------------------------------------------------------------
	// Univariate linear regression
	// ----------------------------------------------------------------
	// There are N (xi, yi) pairs.
	//
	// E = sum (yi - m xi - b)^2
	//
	// DE/Dm = sum 2 (yi - m xi - b) (-xi) = 0
	// DE/Db = sum 2 (yi - m xi - b) (-1)  = 0
	//
	// sum (yi - m xi - b) (xi) = 0
	// sum (yi - m xi - b)      = 0
	//
	// sum (xi yi - m xi^2 - b xi) = 0
	// sum (yi - m xi - b)         = 0
	//
	// m sum(xi^2) + b sum(xi) = sum(xi yi)
	// m sum(xi)   + b N       = sum(yi)
	//
	// [ sum(xi^2)   sum(xi) ] [ m ] = [ sum(xi yi) ]
	// [ sum(xi)     N       ] [ b ] = [ sum(yi)    ]
	//
	// [ m ] = [ sum(xi^2) sum(xi) ]^-1  [ sum(xi yi) ]
	// [ b ]   [ sum(xi)   N       ]     [ sum(yi)    ]
	//
	//       = [ N         -sum(xi)  ]  [ sum(xi yi) ] * 1/D
	//         [ -sum(xi)   sum(xi^2)]  [ sum(yi)    ]
	//
	// where
	//
	//   D = N sum(xi^2) - sum(xi)^2.
	//
	// So
	//
	//      N sum(xi yi) - sum(xi) sum(yi)
	// m = --------------------------------
	//
	//      -sum(xi)sum(xi yi) + sum(xi^2) sum(yi)
	// b = ----------------------------------------
	//                   D

	public static void linear_regression(
		double[] xs,
		double[] ys,
		int      n,
		double[] m_b_sm_sb)
	{
		double sumxi   = 0.0;
		double sumyi   = 0.0;
		double sumxiyi = 0.0;
		double sumxi2  = 0.0;

		for (int i = 0; i < n; i++) {
			double x = xs[i];
			double y = ys[i];
			sumxi   += x;
			sumyi   += y;
			sumxiyi += x*y;
			sumxi2  += x*x;
		}

		double D =  n * Math.pow(sumxi2 - sumxi, 2);
		double m = (n * sumxiyi - sumxi * sumyi) / D;
		double b = (-sumxi * sumxiyi + sumxi2 * sumyi) / D;

		// Young 1962, pp. 122-124.  Compute sample variance of linear
		// approximations, then variances of m and b.
		double var_z = 0.0;
		for (int i = 0; i < n; i++)
			var_z += Math.pow(m * xs[i] + b - ys[i], 2);
		var_z /= n;

		double var_m = (n * var_z) / D;
		double var_b = (var_z * sumxi2) / D;

		m_b_sm_sb[0] = m;
		m_b_sm_sb[1] = b;
		m_b_sm_sb[2] = Math.sqrt(var_m);
		m_b_sm_sb[3] = Math.sqrt(var_b);
	}

	// ----------------------------------------------------------------
	public static double get_corr_coeff(double[] xs, double[] ys, int n) {
		double sumxi   = 0.0;
		double sumyi   = 0.0;
		double sumxiyi = 0.0;
		double sumxi2  = 0.0;
		double sumyi2  = 0.0;

		for (int i = 0; i < n; i++) {
			double x = xs[i];
			double y = ys[i];
			sumxi   += x;
			sumyi   += y;
			sumxiyi += x*y;
			sumxi2  += x*x;
			sumyi2  += y*y;
		}

		// Young 1962, p. 130.
		double a = n*sumxiyi - sumxi*sumyi;
		double b = n*sumxi2  - sumxi*sumxi;
		double c = n*sumyi2  - sumyi*sumyi;
		double corr_coeff = a / Math.sqrt(b*c);
		return corr_coeff;
	}

} // end class Stats
