//bayes_estimate.c (gcc bayes_estimate.c -o bayes_estimate -lm)
//estimates minimum (bayes) error for two-class two-dimensional classifier of gaussians
//estimates gaussian parameters from sample mean/variance
//Bryan Topp <betopp@cs.unm.edu>

#include <stdio.h>
#include <math.h>
#include <stdlib.h>

#define EST_RANGE 4.0 //multiplier, applied in main to a spread derived from the diagonal terms of each covariance; sets how far past the class means the integration grid extends
#define EST_RESOLUTION 2048 //how many divisions of the integration range to use along each axis (grid is EST_RESOLUTION * EST_RESOLUTION cells)


//Computes Gaussian PDF for x given mean/covariance.
double gauss2d(double covar[2][2], double mean[2], double x[2])
{
	/*double exp_term = 0.0;
	exp_term += covar[0][0] * (x[0] - mean[0]) * (x[0] - mean[0]);
	exp_term += covar[0][1] * (x[0] - mean[0]) * (x[1] - mean[1]);
	exp_term += covar[1][0] * (x[1] - mean[1]) * (x[0] - mean[0]);
	exp_term += covar[1][1] * (x[1] - mean[1]) * (x[1] - mean[1]);
	
	double covar_det = (covar[0][0] * covar[1][1]) - (covar[0][1] * covar[1][0]);
	
	return exp(-exp_term) / (2.0 * M_PI * sqrt(covar_det));*/
	double correlation = covar[1][0] / ( sqrt(covar[0][0]) * sqrt(covar[1][1]) );
	
	double normalization_term = 2.0 * M_PI * sqrt(covar[0][0]) * sqrt(covar[1][1]) * sqrt(1.0 - (correlation*correlation));
	
	double exp_term = 0.0;
	exp_term += (x[0] - mean[0]) * (x[0] - mean[0]) / covar[0][0];
	exp_term += (x[1] - mean[1]) * (x[1] - mean[1]) / covar[1][1];
	exp_term += 2.0 * correlation * (x[0] - mean[0]) * (x[1] - mean[1]) / (covar[1][0] / correlation);
	exp_term /= 2.0 * (1 - (correlation * correlation));
	
	return exp(-exp_term) / normalization_term;
	
}

//Opens a training dataset and estimates its sample mean and covariance.
//  dataset_filename - text file of whitespace-separated records "<int> <x> <y>";
//                     reading stops at the first record that fails to parse
//  covar            - receives the sample covariance (normalized by n-1)
//  mean             - receives the sample mean
//Prints how many points were read. Prints an error and exits with status -1 if
//the file cannot be opened or holds fewer than two points (the n-1 normalization
//needs at least two).
void find_sample_parms(const char *dataset_filename, double covar[2][2], double mean[2])
{
	FILE *datafile = fopen(dataset_filename, "r");
	if(!datafile)
	{
		printf("Error opening dataset %s.\n", dataset_filename);
		exit(-1);
	}

	int point; //leading integer of each record; parsed but otherwise unused
	double x;
	double y;

	//First pass: accumulate coordinate sums for the mean.
	double sum_x = 0.0;
	double sum_y = 0.0;
	int points_read = 0;

	while(fscanf(datafile, " %d %lf %lf ", &point, &x, &y) == 3)
	{
		points_read++;
		sum_x += x;
		sum_y += y;
	}

	printf("Read %d points in dataset %s\n", points_read, dataset_filename);

	//Fewer than two points would divide by zero below.
	if(points_read < 2)
	{
		printf("Dataset %s needs at least 2 points to estimate covariance.\n", dataset_filename);
		fclose(datafile);
		exit(-1);
	}

	mean[0] = sum_x / (double)points_read;
	mean[1] = sum_y / (double)points_read;

	//Second pass: accumulate products of deviations from the mean.
	double sum_xx = 0.0;
	double sum_xy = 0.0;
	double sum_yy = 0.0;

	rewind(datafile);

	while(fscanf(datafile, " %d %lf %lf ", &point, &x, &y) == 3)
	{
		double dx = x - mean[0];
		double dy = y - mean[1];

		sum_xx += dx * dx;
		sum_xy += dx * dy;
		sum_yy += dy * dy;
	}

	//Done with the file; release the descriptor.
	fclose(datafile);

	double normalizer = (double)(points_read - 1);
	covar[0][0] = sum_xx / normalizer;
	covar[0][1] = sum_xy / normalizer;
	covar[1][0] = sum_xy / normalizer;
	covar[1][1] = sum_yy / normalizer;
}

//Estimates the Bayes error of a two-class classifier whose classes are 2D Gaussians.
//Usage: <program> <dataset1> <dataset2>
//Fits a mean/covariance to each dataset, then numerically integrates both PDFs over
//one shared rectangular grid. In every grid cell the class with the lower density is
//the one that would be misclassified there, so its probability mass is added to that
//class's error. Returns 0 on success; exits with status -1 on a wrong argument count.
int main(int argc, const char **argv)
{
	if(argc != 3)
	{
		printf("Usage: %s <dataset1> <dataset2>\n", argv[0]);
		exit(-1);
	}

	//Estimate gaussian parameters from datasets
	double covar_c1[2][2];
	double mean_c1[2];

	double covar_c2[2][2];
	double mean_c2[2];

	find_sample_parms(argv[1], covar_c1, mean_c1);
	find_sample_parms(argv[2], covar_c2, mean_c2);

	printf("Dataset 1 mean:\n\t%lf\t%lf\n", mean_c1[0], mean_c1[1]);
	printf("Dataset 1 covariance:\n\t%lf\t%lf\n\t%lf\t%lf\n", covar_c1[0][0], covar_c1[0][1], covar_c1[1][0], covar_c1[1][1]);
	printf("Dataset 2 mean:\n\t%lf, %lf\n", mean_c2[0], mean_c2[1]);
	printf("Dataset 2 covariance:\n\t%lf\t%lf\n\t%lf\t%lf\n", covar_c2[0][0], covar_c2[0][1], covar_c2[1][0], covar_c2[1][1]);

	//Pick bounds for estimation - EST_RANGE standard deviations either side of each
	//class mean. The square root of the covariance diagonal is used so the margin has
	//the same units as the data; a margin taken from the raw variance would cut off
	//most of the distribution whenever the variance is small.
	double est_min[2];
	double est_max[2];

	int axis;
	for(axis = 0; axis < 2; axis++)
	{
		double margin_c1 = EST_RANGE * sqrt(covar_c1[axis][axis]);
		double margin_c2 = EST_RANGE * sqrt(covar_c2[axis][axis]);

		est_min[axis] = fmin(mean_c1[axis] - margin_c1, mean_c2[axis] - margin_c2);
		est_max[axis] = fmax(mean_c1[axis] + margin_c1, mean_c2[axis] + margin_c2);
	}

	printf("Estimating over bounds %lf, %lf to %lf, %lf\n", est_min[0], est_min[1], est_max[0], est_max[1]);
	printf("Using %d * %d total grid points = %d\n", EST_RESOLUTION, EST_RESOLUTION, EST_RESOLUTION*EST_RESOLUTION);

	//Integration in squares

	//Find dimensions and area of each "square"
	double cell_width = (est_max[0] - est_min[0]) / (double)EST_RESOLUTION;
	double cell_height = (est_max[1] - est_min[1]) / (double)EST_RESOLUTION;
	double subdiv_area = cell_width * cell_height;

	//initialize "integral" counts; these hold plain sums of densities and are
	//scaled by the cell area once, after the loop
	double class_1_as_2_sum = 0.0;
	double class_2_as_1_sum = 0.0;

	double class_1_total_sum = 0.0;
	double class_2_total_sum = 0.0;

	//Integrate, sampling each PDF at the centre of its square (midpoint rule)
	int step_y;
	for(step_y = 0; step_y < EST_RESOLUTION; step_y++)
	{
		double evaluation_point[2];
		evaluation_point[1] = est_min[1] + (((double)step_y + 0.5) * cell_height);

		int step_x;
		for(step_x = 0; step_x < EST_RESOLUTION; step_x++)
		{
			evaluation_point[0] = est_min[0] + (((double)step_x + 0.5) * cell_width);

			double class_1_prob = gauss2d(covar_c1, mean_c1, evaluation_point);
			double class_2_prob = gauss2d(covar_c2, mean_c2, evaluation_point);

			if(class_1_prob > class_2_prob)
			{
				//we would decide class 1 here, so class 2's density is error
				class_2_as_1_sum += class_2_prob;
			}
			else
			{
				//vice-versa
				class_1_as_2_sum += class_1_prob;
			}

			class_1_total_sum += class_1_prob;
			class_2_total_sum += class_2_prob;
		}
	}

	double class_1_as_2_error = subdiv_area * class_1_as_2_sum;
	double class_2_as_1_error = subdiv_area * class_2_as_1_sum;

	//these should turn out to be close to 1 if we integrated properly and precisely
	//(PDFs integrate to 1)
	double class_1_total_area = subdiv_area * class_1_total_sum;
	double class_2_total_area = subdiv_area * class_2_total_sum;

	printf("Estimated area from classifying 1 as 2: %lf\n", class_1_as_2_error);
	printf("Estimated area from classifying 2 as 1: %lf\n", class_2_as_1_error);
	printf("Estimated Bayes error assuming equal priors: %lf\n", (class_1_as_2_error + class_2_as_1_error) / 2.0);
	printf("Estimates of probability density functions have total area-under-curve %lf and %lf\n", class_1_total_area, class_2_total_area);

	return 0;
}