/*=================================================================
 *
 *  nvtspmex.c
 *  Author: Andrew Magis
 *  Calculate TSP scores on the GPU
 *  Inputs: Class 1 data, Class 2 data (noncomplex single matrices, genes x samples)
 *  Outputs: TSP primary scores, TSP secondary scores, TSP votes
 *
 *=================================================================*/

#include <math.h>
#include "mex.h"
#include <vector>

//#define DEBUG 

/*
 * Print a short summary of a CUDA device: its name, its total global memory
 * in KB and its maximum number of threads per block, framed by dashed
 * separator lines. If the properties cannot be queried, the CUDA error
 * string is printed instead.
 *
 * device: ordinal of the CUDA device to query.
 */
void DisplayDeviceProperties(int device) {

	cudaDeviceProp deviceProp;
	memset(&deviceProp, 0, sizeof (deviceProp));

	printf("-----\n");

	//Keep the status of the query itself so that the message printed on
	//failure belongs to this call and not to some other pending error
	const cudaError_t status = cudaGetDeviceProperties(&deviceProp, device);

	if (cudaSuccess == status) {
		printf("Device Name\t\t\t\t%s\n", deviceProp.name);
		//totalGlobalMem is a size_t; cast explicitly so that the format
		//specifier matches the argument width on every platform
		printf("Total Global Memory\t\t\t%llu KB\n", (unsigned long long)(deviceProp.totalGlobalMem / 1024));
		printf("Maximum threads per block\t\t%d\n", deviceProp.maxThreadsPerBlock);

	} else {
		printf("\n%s", cudaGetErrorString(status));
		//Reset the runtime's last-error state, as the original query of
		//cudaGetLastError() did
		cudaGetLastError();
	}

	printf("------\n");

}

//Edge length of the square thread block used to launch the kernel; the gene
//count is also padded up to a multiple of this value on the host.
#define THREADS 16
//Absolute value of X. X is evaluated more than once, so it must be free of
//side effects.
#define ABSMACRO(X) ( ((X)<0) ? (-(X)) : (X) )
//Binary sign indicator: 0 when X is negative, 1 otherwise (including X == 0).
#define ABSBINARYMACRO(X) ( ((X)<0) ? (0) : (1) )
//Smaller / larger of two values. Arguments are evaluated more than once.
//In the visible part of this file these two are referenced only by the
//commented-out legacy kernel.
#define MINMACRO(X,Y) ( ((X)<(Y)) ? (X) : (Y) )
#define MAXMACRO(X,Y) ( ((X)>(Y)) ? (X) : (Y) )

/*
 * Compute the pairwise TSP statistics for one pair of genes per thread.
 *
 * Launch layout: a 2D grid of 2D blocks. The global x index selects gene
 * "gene_x" and the global y index selects gene "gene_y". Threads whose
 * indices fall outside [0, m) do nothing.
 *
 * Data layout: d_class1 and d_class2 hold one run of m floats per sample
 * (n1 and n2 samples respectively), i.e. the value of gene g in sample s is
 * at index s*m + g. The three outputs are m x m arrays and the result for the
 * pair (gene_x, gene_y) is stored at index gene_x*m + gene_y.
 *
 * For pairs with gene_x > gene_y:
 *   primary   = |f1 - f2|, where fk is the fraction of class-k samples in
 *               which the value of gene_x is below the value of gene_y
 *   secondary = |d1 - d2|, where dk is the mean of (gene_x - gene_y) over
 *               the class-k samples
 *   vote      = 0 if f1 - f2 is negative, 1 otherwise
 * For all other pairs (diagonal and gene_x < gene_y) the scores stay zero,
 * so primary = 0, secondary = 0 and vote = 1 are written.
 *
 * n1 and n2 must be nonzero: the averages are formed by multiplying with
 * 1/n1 and 1/n2.
 */
__global__ void tspKernel(float *d_class1, float *d_class2, unsigned int n1, unsigned int n2, unsigned int m, float *primary, float *secondary, int *vote) {

	//Genes handled by this thread
	const unsigned int gene_x = blockIdx.x*blockDim.x + threadIdx.x;
	const unsigned int gene_y = blockIdx.y*blockDim.y + threadIdx.y;

	//Never touch memory outside the m x m problem, whatever the launch
	//configuration is
	if (gene_x >= m || gene_y >= m) {
		return;
	}

	float class1_score = 0.f;
	float class2_score = 0.f;
	float class1_rank = 0.f;
	float class2_rank = 0.f;
	float n1_invert = __fdividef(1.f, (float)n1);
	float n2_invert = __fdividef(1.f, (float)n2);

	//64-bit stride and limits: n*m and gene_x*m can exceed 32 bits
	const size_t stride = m;

	//We are only building a triangular matrix here, so skip the computation
	//if this pair lies on the diagonal or on the other side of it
	if (gene_x > gene_y) {

		//Pointers to correct memory location for class1
		const float *data1 = &d_class1[gene_x];
		const float *data2 = &d_class1[gene_y];

		const size_t class1_end = (size_t)n1 * stride;
		for (size_t i = 0; i < class1_end; i += stride) {
			const float diff = data1[i]-data2[i];
			//signbit() is only guaranteed to return "nonzero" for negative
			//values, so map it to exactly one count
			class1_score += signbit(diff) ? 1.f : 0.f;
			class1_rank += diff;
		}
		class1_score = class1_score * n1_invert;
		class1_rank = class1_rank * n1_invert;

		//Pointers to correct memory location for class2
		data1 = &d_class2[gene_x];
		data2 = &d_class2[gene_y];

		const size_t class2_end = (size_t)n2 * stride;
		for (size_t i = 0; i < class2_end; i += stride) {
			const float diff = data1[i]-data2[i];
			class2_score += signbit(diff) ? 1.f : 0.f;
			class2_rank += diff;
		}
		class2_score = class2_score * n2_invert;
		class2_rank = class2_rank * n2_invert;

	}

	//Write the result to global memory
	const size_t out = (size_t)gene_x * stride + gene_y;
	primary[out] = ABSMACRO(class1_score-class2_score);
	secondary[out] = ABSMACRO(class1_rank-class2_rank);
	vote[out] = ABSBINARYMACRO(class1_score-class2_score);

}

//Device and pinned-host buffers owned by one mexFunction call. Every member
//starts out NULL so that the cleanup helper can run at any point.
struct TspBuffers {
	float *d_class1, *d_class2;       //padded input data on the device
	float *d_primary, *d_secondary;   //padded m x m float results on the device
	int *d_vote;                      //padded m x m vote results on the device
	float *h_class1, *h_class2;       //padded input data on the host
	float *h_primary, *h_secondary;   //padded m x m float results on the host
	int *h_vote;                      //padded m x m vote results on the host
};

//Release every buffer that has been allocated so far and reset the pointers.
static void tspReleaseBuffers(TspBuffers *buf) {

	//cudaFree ignores NULL pointers
	cudaFree(buf->d_class1);
	cudaFree(buf->d_class2);
	cudaFree(buf->d_primary);
	cudaFree(buf->d_secondary);
	cudaFree(buf->d_vote);

	if (buf->h_class1) cudaFreeHost(buf->h_class1);
	if (buf->h_class2) cudaFreeHost(buf->h_class2);
	if (buf->h_primary) cudaFreeHost(buf->h_primary);
	if (buf->h_secondary) cudaFreeHost(buf->h_secondary);
	if (buf->h_vote) cudaFreeHost(buf->h_vote);

	memset(buf, 0, sizeof(*buf));
}

//Free all buffers, then raise a MATLAB error. mexErrMsgTxt does not return to
//the caller, so anything still allocated at that point would be leaked.
static void tspFail(TspBuffers *buf, const char *message) {
	tspReleaseBuffers(buf);
	mexErrMsgTxt(message);
}

/*
 * MEX gateway: compute TSP scores for all gene pairs on the GPU.
 *
 * Inputs (exactly two):
 *   prhs[0]  class 1 data, noncomplex single, genes (rows) x samples (columns)
 *   prhs[1]  class 2 data, noncomplex single, same number of rows as prhs[0]
 *
 * Outputs (exactly three), each genes x genes:
 *   plhs[0]  primary scores (single)
 *   plhs[1]  secondary scores (single)
 *   plhs[2]  votes (int32)
 * Element (r, c) of each output holds the result for the gene pair handled
 * by the kernel thread with x index c and y index r, so the computed scores
 * occupy the strictly upper triangle (r < c). Elsewhere primary and
 * secondary are 0 and vote is 1.
 *
 * The gene dimension is zero-padded to a multiple of THREADS before the data
 * is sent to the GPU; the padding is stripped again from the results.
 *
 * Raises a MATLAB error for a wrong argument count or type, mismatching gene
 * counts, a class without samples, problem sizes that do not fit a single
 * kernel launch, and any failing CUDA call. All CUDA buffers are released
 * before an error is raised.
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray*prhs[]) { 

#ifdef DEBUG
	DisplayDeviceProperties(0);

	//Time the execution of this function
	cudaEvent_t start_event, stop_event;
	cudaEventCreate(&start_event);
	cudaEventCreate(&stop_event);
	cudaEventRecord(start_event, 0);
	cudaEventSynchronize(start_event);
	float time_run;
#endif

	//Error check
	if (nrhs != 2) {
		mexErrMsgTxt("Two inputs required (class 1 ranks, class 2 ranks)");
	}
	if (nlhs != 3) {
		mexErrMsgTxt("Three outputs required (TSP primary scores, TSP secondary scores, vote)");
	}
	// The input must be a noncomplex single.
	if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS || mxIsComplex(prhs[0])) {
		mexErrMsgTxt("Class1 Input must be a noncomplex single.");
	}
	if (mxGetClassID(prhs[1]) != mxSINGLE_CLASS || mxIsComplex(prhs[1])) {
		mexErrMsgTxt("Class2 Input must be a noncomplex single.");
	}

	//m is the number of rows (genes)
	//n is the number of chips (samples)
	const size_t m1 = mxGetM(prhs[0]);
	const size_t n1 = mxGetN(prhs[0]);
	const size_t m2 = mxGetM(prhs[1]);
	const size_t n2 = mxGetN(prhs[1]);
	if (m1 != m2) {
		mexErrMsgTxt("Number of genes for class 1 != class 2\n");
	}

	//Create a padded m which is multiple of THREADS
	const size_t m = ((m1 + THREADS - 1) / THREADS) * THREADS;

	//The kernel receives its sizes as unsigned int and the grid needs
	//m/THREADS blocks per dimension; 65535 is the CUDA limit for gridDim.y
	const size_t max_grid_dim = 65535;
	if (m / THREADS > max_grid_dim) {
		mexErrMsgTxt("Too many genes for a single kernel launch.");
	}
	if (n1 != (size_t)(unsigned int)n1 || n2 != (size_t)(unsigned int)n2) {
		mexErrMsgTxt("Too many samples.");
	}

#ifdef DEBUG
	printf("Class1 Ranks: [%u, %u] Class2 Ranks: [%u, %u]\n", (unsigned int)m1, (unsigned int)n1, (unsigned int)m2, (unsigned int)n2);
	printf("Thread Dimension: %d Padded length: %u\n", THREADS, (unsigned int)m);
#endif

	// Create an mxArray for the output data - this is automatically zeroed out
	plhs[0] = mxCreateNumericMatrix(m1, m1, mxSINGLE_CLASS, mxREAL);
	plhs[1] = mxCreateNumericMatrix(m1, m1, mxSINGLE_CLASS, mxREAL);
	plhs[2] = mxCreateNumericMatrix(m1, m1, mxINT32_CLASS, mxREAL);

	//Nothing to compute without genes: the empty outputs are the result.
	//(A zero-sized grid would be an invalid launch configuration.)
	if (m1 == 0) {
#ifdef DEBUG
		cudaEventDestroy(start_event);
		cudaEventDestroy(stop_event);
#endif
		return;
	}

	//The kernel averages over the samples of each class
	if (n1 == 0 || n2 == 0) {
		mexErrMsgTxt("Each class must contain at least one sample.");
	}

	//All sizes are computed in size_t so that m*m cannot wrap at 32 bits
	const size_t class1_size = m * n1 * sizeof(float);
	const size_t class2_size = m * n2 * sizeof(float);
	const size_t score_size = m * m * sizeof(float);
	const size_t vote_size = m * m * sizeof(int);

	TspBuffers buf;
	memset(&buf, 0, sizeof(buf));

	//Allocate space on the GPU to store the input data
	if ( cudaMalloc( (void**)&buf.d_class1, class1_size ) != cudaSuccess )
		tspFail(&buf, "Memory allocating failure on the GPU.");
	if ( cudaMalloc( (void**)&buf.d_class2, class2_size ) != cudaSuccess )
		tspFail(&buf, "Memory allocating failure on the GPU.");

	//Allocate space on the GPU to store the output data
	if ( cudaMalloc( (void**)&buf.d_primary, score_size ) != cudaSuccess )
		tspFail(&buf, "Memory allocating failure on the GPU.");
	if ( cudaMalloc( (void**)&buf.d_secondary, score_size ) != cudaSuccess )
		tspFail(&buf, "Memory allocating failure on the GPU.");
	if ( cudaMalloc( (void**)&buf.d_vote, vote_size ) != cudaSuccess )
		tspFail(&buf, "Memory allocating failure on the GPU.");

	//Reallocate space for the data on the host with zeroed out padding
	if (cudaMallocHost((void**)&buf.h_class1, class1_size) != cudaSuccess)
		tspFail(&buf, "Memory allocating failure on the host.");
	if (cudaMallocHost((void**)&buf.h_class2, class2_size) != cudaSuccess)
		tspFail(&buf, "Memory allocating failure on the host.");
	if (cudaMallocHost((void**)&buf.h_primary, score_size) != cudaSuccess)
		tspFail(&buf, "Memory allocating failure on the host.");
	if (cudaMallocHost((void**)&buf.h_secondary, score_size) != cudaSuccess)
		tspFail(&buf, "Memory allocating failure on the host.");
	if (cudaMallocHost((void**)&buf.h_vote, vote_size) != cudaSuccess)
		tspFail(&buf, "Memory allocating failure on the host.");

	//Zero out the memory on the host
	memset(buf.h_class1, 0, class1_size);
	memset(buf.h_class2, 0, class2_size);

	//Copy over data to new padded array location on host: each MATLAB column
	//(one sample, m1 values) becomes one run of m values
	const float *class1_in = (const float*)mxGetData(prhs[0]);
	for (size_t s = 0; s < n1; s++) {
		memcpy(buf.h_class1 + s*m, class1_in + s*m1, m1*sizeof(float));
	}
	const float *class2_in = (const float*)mxGetData(prhs[1]);
	for (size_t s = 0; s < n2; s++) {
		memcpy(buf.h_class2 + s*m, class2_in + s*m1, m1*sizeof(float));
	}

	//Copy data to the GPU
	if (cudaMemcpy(buf.d_class1, buf.h_class1, class1_size, cudaMemcpyHostToDevice) != cudaSuccess)
		tspFail(&buf, "Error copying memory to the GPU.");
	if (cudaMemcpy(buf.d_class2, buf.h_class2, class2_size, cudaMemcpyHostToDevice) != cudaSuccess)
		tspFail(&buf, "Error copying memory to the GPU.");

	//Set the dimension of the blocks and grids
	const unsigned int blocks_per_dim = (unsigned int)(m / THREADS);
	dim3 dimBlock(THREADS, THREADS);
	dim3 dimGrid(blocks_per_dim, blocks_per_dim);

#ifdef DEBUG
	printf("Scheduling [%d %d] threads in [%d %d] blocks\n", THREADS, THREADS, (int)blocks_per_dim, (int)blocks_per_dim);
#endif

	tspKernel<<<dimGrid, dimBlock>>>(buf.d_class1, buf.d_class2, (unsigned int)n1, (unsigned int)n2, (unsigned int)m, buf.d_primary, buf.d_secondary, buf.d_vote);

	//A kernel launch returns no status itself: a bad launch configuration is
	//reported by cudaGetLastError, a fault during execution by the next
	//synchronizing call
	if (cudaGetLastError() != cudaSuccess)
		tspFail(&buf, "Error launching the TSP kernel on the GPU.");
	if (cudaDeviceSynchronize() != cudaSuccess)
		tspFail(&buf, "Error executing the TSP kernel on the GPU.");

	//Copy the memory back
	if (cudaMemcpy(buf.h_primary, buf.d_primary, score_size, cudaMemcpyDeviceToHost) != cudaSuccess)
		tspFail(&buf, "Error copying memory from the GPU.");
	if (cudaMemcpy(buf.h_secondary, buf.d_secondary, score_size, cudaMemcpyDeviceToHost) != cudaSuccess)
		tspFail(&buf, "Error copying memory from the GPU.");
	if (cudaMemcpy(buf.h_vote, buf.d_vote, vote_size, cudaMemcpyDeviceToHost) != cudaSuccess)
		tspFail(&buf, "Error copying memory from the GPU.");

	float *primary_out = (float*) mxGetData(plhs[0]);
	float *secondary_out = (float*) mxGetData(plhs[1]);
	int *vote_out = (int*) mxGetData(plhs[2]);

	//Finally, copy the padded array data into the output matrix: the first
	//m1 entries of each padded run of m become one output column
	for (size_t c = 0; c < m1; c++) {
		memcpy(primary_out + c*m1, buf.h_primary + c*m, m1*sizeof(float));
		memcpy(secondary_out + c*m1, buf.h_secondary + c*m, m1*sizeof(float));
		memcpy(vote_out + c*m1, buf.h_vote + c*m, m1*sizeof(int));
	}

#ifdef DEBUG
	cudaEventRecord(stop_event, 0);
	cudaEventSynchronize(stop_event); // block until the event is actually recorded
	cudaEventElapsedTime(&time_run, start_event, stop_event);
	printf("Finished running nvTSP in %.6f seconds\n", time_run / 1000.0);
	cudaEventDestroy(start_event);
	cudaEventDestroy(stop_event);
#endif

	//Clear up memory on the device and on the host
	tspReleaseBuffers(&buf);

}

//Old kernel to calculate upper and lower bounds for pruning algorithm.
/* 
__global__ void tspKernel(float *d_class1, float *d_class2, unsigned int n1, unsigned int n2, unsigned int m, unsigned int n, float *primary, float *secondary, float *lower, float *upper, int *vote) {
	
    float class1_score = 0.f;
	float class2_score = 0.f;
	float class1_rank = 0.f;
	float class2_rank = 0.f;	
	float minimum = 0.f;
	float maximum = 0.f;
	
	float a = 0.f, b = 0.f, c = 0.f, d = 0.f;
	
	//We are only building a diagonal matrix here, so return if I am part of the diagonal
	//or below the diagonal
	if ((blockIdx.x*blockDim.x+threadIdx.x) > (blockIdx.y*blockDim.y+threadIdx.y)) {
	
		//Pointers to correct memory location for class1
		float *data1 = &d_class1[(blockIdx.x*blockDim.x + threadIdx.x)];
		float *data2 = &d_class1[(blockIdx.y*blockDim.y + threadIdx.y)];

		for (int i = 0; i < n1*m; i+=m) {
			if (data1[i] < data2[i]) {
				a += 1.f;
			} else {
				b += 1.f;
			}
			class1_rank += (float)(data1[i]-data2[i]);
		}
		
		class1_score = a / (a+b);
		class1_rank = class1_rank / (a+b);

		//Pointers to correct memory location for class2
		data1 =  &d_class2[(blockIdx.x*blockDim.x + threadIdx.x)];
		data2 =  &d_class2[(blockIdx.y*blockDim.y + threadIdx.y)];
	
		for (int i = 0; i < n2*m; i+=m) {
			if (data1[i] < data2[i]) {
				c += 1.f;
			} else {
				d += 1.f;
			}			
			class2_rank += (float)(data1[i]-data2[i]);
		}
		
		class2_score = c / (c+d);
		class2_rank = class2_rank / (c+d);
		
		float min1, min2, max1, max2;

		//Calculate the min and max for the cross-validation pruning algorithm
		if (class1_score >= class2_score) {
		
			if ((a<n) || (d<n)) {
				minimum = 0.f;
			} else {
				min1 = ((a-n)/(a-n+b)) - class2_score;
				min2 = class1_score - (c/(c+d-n));
				minimum = MINMACRO(min1, min2);
			}
			if ((b<n) || (c<n)) {
				maximum = 1.f;
			} else {
				max1 = (a/(a+b-n)) - class2_score;
				max2 = class1_score - ((c-n)/(c-n+d));
				maximum = MAXMACRO(max1, max2);
			}

		} else {
			if ((b<n) || (c<n)) {
				minimum = 0.f;
			} else {
				min1 = ((b-n)/(a+b-n)) - (1.f-class2_score);
				min2 = (1.f-class1_score) - (d/(c-n+d));
				minimum = MINMACRO(min1, min2);
			}
			if ((a<n) || (d<n)) {
				maximum = 1.f;
			} else {
				max1 = (b/(a-n+b))-(1.f-class2_score);
				max2 = (1.f-class1_score) - ((d-n)/(c+d-n));
				maximum = MAXMACRO(max1, max2);
			}
		}
	}
	
	//Write the result to global memory
	primary[(blockIdx.x*blockDim.x + threadIdx.x)*m + (blockIdx.y*blockDim.y + threadIdx.y)] = ABSMACRO(class1_score-class2_score);
	secondary[(blockIdx.x*blockDim.x + threadIdx.x)*m + (blockIdx.y*blockDim.y + threadIdx.y)] = ABSMACRO(class1_rank-class2_rank);
	lower[(blockIdx.x*blockDim.x + threadIdx.x)*m + (blockIdx.y*blockDim.y + threadIdx.y)] = minimum;
	upper[(blockIdx.x*blockDim.x + threadIdx.x)*m + (blockIdx.y*blockDim.y + threadIdx.y)] = maximum;
	vote[(blockIdx.x*blockDim.x + threadIdx.x)*m + (blockIdx.y*blockDim.y + threadIdx.y)] = ABSBINARYMACRO(class1_score-class2_score);

}
*/

