//nn.c (gcc nn.c -o nn -lm)
//nonparametric classifier based on nearest-neighbor technique, 2-dimensional
//Assumes equal priors and equally-sized training data.
//Bryan Topp <betopp@cs.unm.edu>

#include <stdio.h>
#include <stdlib.h>
#include <math.h>


//One 2-dimensional sample.
//The layout (x first, then y, both doubles) must not change: main fills arrays of
//these from flat arrays of doubles produced by read_dataset.
typedef struct
{
	double x, y;
} datapoint_t;

//Reads a dataset from a text file of whitespace-separated records, each made of an
//integer followed by two floating-point values. The integer is parsed and discarded;
//the two doubles of record i are stored at array[2*i] and array[2*i+1].
//
//filename:  path of the file to read.
//array_out: receives a newly allocated array holding 2*(*size_out) doubles. The caller
//           owns it and releases it with free(). It is never NULL on return, even when
//           the file holds no records.
//size_out:  receives the number of records stored.
//
//Prints a message and terminates the process with exit(-1) if the file cannot be
//opened, if parsing stops before the end of the file is reached, or if the array
//cannot be allocated.
void read_dataset(const char *filename, double **array_out, int *size_out)
{
	FILE *datafile = fopen(filename, "r");
	if(!datafile)
	{
		printf("Could not open %s\n", filename);
		exit(-1);
	}
	
	//First pass: count the records so the array can be allocated in one piece.
	int num_points = 0;
	int label;
	double first, second;
	while(fscanf(datafile, " %d %lf %lf ", &label, &first, &second) == 3)
		num_points++;
	
	if(!feof(datafile)) //We should only finish at the end of the file
	{
		printf("error parsing file %s\n", filename);
		fclose(datafile);
		exit(-1);
	}
	
	printf("Read %d data points from file %s\n", num_points, filename);
	
	//calloc guards the count*size multiplication against overflow. At least one slot is
	//requested so an empty file still yields a valid pointer instead of the
	//implementation-defined result of a zero-sized allocation.
	size_t num_values = (size_t)num_points * 2;
	double *dataset = calloc(num_values ? num_values : 1, sizeof *dataset);
	if(!dataset)
	{
		printf("Could not allocate memory for %d data points from file %s\n", num_points, filename);
		fclose(datafile);
		exit(-1);
	}
	
	//Second pass: actually store the points. The loop is capped at the counted number
	//of records so a file that grew between the passes cannot overrun the array.
	rewind(datafile);
	int stored = 0;
	while(stored < num_points
		&& fscanf(datafile, " %d %lf %lf ", &label, &(dataset[stored*2]), &(dataset[(stored*2)+1])) == 3)
	{
		stored++;
	}
	
	fclose(datafile);
	
	*size_out = stored;
	*array_out = dataset;
}


//Squared Euclidean distance between points a and b (no square root is taken).
double datadist(datapoint_t *a, datapoint_t *b)
{
	const double dx = a->x - b->x;
	const double dy = a->y - b->y;
	
	return (dx*dx)+(dy*dy);
}


//Returns the smallest value of datadist (a squared distance) between eval_pt and any
//of the first num_training_pts entries of training_pts.
//
//eval_pt:          point to measure from.
//training_pts:     array of candidate points; not read when num_training_pts <= 0.
//num_training_pts: number of entries in training_pts.
//
//With no training points there is no nearest neighbor, so INFINITY is returned
//rather than reading training_pts[0] out of bounds.
double smallest_distancesq(datapoint_t *eval_pt, datapoint_t *training_pts, int num_training_pts)
{
	if(num_training_pts <= 0)
		return INFINITY;
	
	//Seed with the first point, then keep whichever later point is strictly closer.
	double smallest_dist = datadist(eval_pt, &(training_pts[0]));
	int i;
	for(i=1;i<num_training_pts;i++)
	{
		double this_dist = datadist(eval_pt, &(training_pts[i]));
		if(this_dist < smallest_dist)
			smallest_dist = this_dist;
	}
	
	return smallest_dist;
}

int main(int argc, const char **argv)
{
	if(argc != 5)
	{
		printf("usage: %s <training 1> <training 2> <testing 1> <testing 2>\n", argv[0]);
		exit(-1);
	}
	
	//Read datasets.
	datapoint_t *c1_training;
	int c1_training_size;
	
	datapoint_t *c2_training;
	int c2_training_size;
	
	datapoint_t *c1_testing;
	int c1_testing_size;
	
	datapoint_t *c2_testing;
	int c2_testing_size;
	
	read_dataset(argv[1], (double**)(&c1_training), &c1_training_size);
	read_dataset(argv[2], (double**)(&c2_training), &c2_training_size);
	read_dataset(argv[3], (double**)(&c1_testing), &c1_testing_size);
	read_dataset(argv[4], (double**)(&c2_testing), &c2_testing_size);
	

	printf("Looking for nearest neighbor\n");
	
	//Evaluate.
	int errors_classifying_c1 = 0;
	int errors_classifying_c2 = 0;
	
	//Classify each testing point in C1
	int testpt;
	for(testpt = 0; testpt < c1_testing_size; testpt++)
	{
		
		double c1_normalized = smallest_distancesq(&(c1_testing[testpt]), c1_training, c1_training_size) * (double)c1_training_size;
		double c2_normalized = smallest_distancesq(&(c1_testing[testpt]), c2_training, c2_training_size) * (double)c2_training_size;
		
		//assume equal priors.
		//look for the smaller radius!
		if(c2_normalized < c1_normalized)
			errors_classifying_c1++;
	}
	
	//Classify each testing point in C2
	for(testpt = 0; testpt < c2_testing_size; testpt++)
	{
		double c1_normalized = smallest_distancesq(&(c2_testing[testpt]), c1_training, c1_training_size) * (double)c1_training_size;
		double c2_normalized = smallest_distancesq(&(c2_testing[testpt]), c2_training, c2_training_size) * (double)c2_training_size;
		
		//assume equal priors.
		//look for the smaller radius!
		if(c2_normalized >= c1_normalized)
			errors_classifying_c2++;
	}
	
	printf("Errors classifying C1 testing set: %d / %d (%lf)\n", errors_classifying_c1, c1_training_size, (double)errors_classifying_c1 / (double)c1_training_size);
	printf("Errors classifying C2 testing set: %d / %d (%lf)\n", errors_classifying_c2, c2_training_size, (double)errors_classifying_c2 / (double)c2_training_size);
	printf("Overall error rate: %d / %d (%lf)\n", errors_classifying_c1+errors_classifying_c2, c1_training_size+c2_training_size,  (double)(errors_classifying_c1+errors_classifying_c2) / (double)(c1_training_size+c2_training_size));	
	
	
	#ifdef GRID_OUTPUT
	FILE *gridout = fopen("nn.grid", "w");
	double gx, gy;
	for(gy=-8.0;gy<8.0;gy += .03125)
	{
		for(gx=-8.0;gx<8.0;gx += .03125)
		{
			datapoint_t gridpoint;
			gridpoint.x = gx;
			gridpoint.y = gy;
	
			double c1_normalized = smallest_distancesq(&(gridpoint), c1_training, c1_training_size) * (double)c1_training_size;
			double c2_normalized = smallest_distancesq(&(gridpoint), c2_training, c2_training_size) * (double)c2_training_size;
		
			
			//assume equal priors.
			//look for the smaller radius!
			if(c2_normalized < c1_normalized)
				fprintf(gridout, "%lf %lf %d\n", gx, gy, 2);
			else if(c2_normalized > c1_normalized)
				fprintf(gridout, "%lf %lf %d\n", gx, gy, 1);
			else 
				fprintf(gridout, "%lf %lf %d\n", gx, gy, 0);
		}
	}
	#endif
	
	return 0;
}