/*=================================================================
 *
 *  TST.cu
 *  Author: Andrew Magis
 *  Calculate TST scores on the GPU, does not return TST matrix
 *  Inputs: Class 1 data, Class 2 data,
 *  Outputs: sorted disjoint pairs, index i, index j, index k
 *
 *=================================================================*/

#ifndef _TST_H
#define	_TST_H 
 
#include <math.h>
#include <vector>
#include <algorithm>

//#define DEBUG

//Ordering predicate for sorting the vector of TST scores: "left" goes before
//"right" only when its primary score is strictly larger, which yields a
//descending order.  Only the primary score is compared here; the secondary
//score is not consulted.
bool sort_pred_tst(const ScoreElement& left, const ScoreElement& right) {
	return left.primary > right.primary;
}

//Owns every device and pinned-host buffer that TST() allocates.  All pointers
//start out null and the destructor releases whatever has been allocated, so a
//GeneralException thrown part-way through TST() does not leak GPU or pinned
//host memory.
struct TSTBuffers {
	//Device buffers
	float *d_class1, *d_class2, *d_s1;
	float *d_maxValues, *d_maxValue;
	unsigned int *d_maxIndices, *d_maxIndex;
	float *d_baddata, *d_baddata_single;

	//Pinned host buffers
	float *h_class1, *h_class2;
	float *h_maxValues, *h_maxValue;
	unsigned int *h_maxIndices, *h_maxIndex;
	float *h_baddata, *h_baddata_single;

	TSTBuffers() :
		d_class1(0), d_class2(0), d_s1(0),
		d_maxValues(0), d_maxValue(0),
		d_maxIndices(0), d_maxIndex(0),
		d_baddata(0), d_baddata_single(0),
		h_class1(0), h_class2(0),
		h_maxValues(0), h_maxValue(0),
		h_maxIndices(0), h_maxIndex(0),
		h_baddata(0), h_baddata_single(0) {}

	~TSTBuffers() {
		//Clear up memory on the device
		releaseDevice(d_class1);
		releaseDevice(d_class2);
		releaseDevice(d_s1);
		releaseDevice(d_maxValues);
		releaseDevice(d_maxValue);
		releaseDevice(d_maxIndex);
		releaseDevice(d_maxIndices);
		releaseDevice(d_baddata);
		releaseDevice(d_baddata_single);

		//Clear up memory on the host
		releaseHost(h_class1);
		releaseHost(h_class2);
		releaseHost(h_maxValues);
		releaseHost(h_maxValue);
		releaseHost(h_maxIndices);
		releaseHost(h_maxIndex);
		releaseHost(h_baddata);
		releaseHost(h_baddata_single);
	}

private:
	//Copying would lead to double frees, so it is disabled
	TSTBuffers(const TSTBuffers&);
	TSTBuffers& operator=(const TSTBuffers&);

	static void releaseDevice(void *p) { if (p) cudaFree(p); }
	static void releaseHost(void *p) { if (p) cudaFreeHost(p); }
};

//Allocate "bytes" of device memory into *ptr; throws GeneralException on failure
static void tstAllocDevice(void **ptr, size_t bytes) {
	if (cudaMalloc(ptr, bytes) != cudaSuccess)
		throw GeneralException("Memory allocating failure on the GPU.", "TST");
}

//Allocate "bytes" of pinned host memory into *ptr; throws GeneralException on failure
static void tstAllocHost(void **ptr, size_t bytes) {
	if (cudaMallocHost(ptr, bytes) != cudaSuccess)
		throw GeneralException("Memory allocating failure on the host.", "TST");
}

//Blocking host-to-device copy; throws GeneralException on failure
static void tstCopyToDevice(void *dst, const void *src, size_t bytes) {
	if (cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice) != cudaSuccess)
		throw GeneralException("Error copying memory to the GPU.", "TST");
}

//Blocking device-to-host copy; throws GeneralException on failure
static void tstCopyToHost(void *dst, const void *src, size_t bytes) {
	if (cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost) != cudaSuccess)
		throw GeneralException("Error copying memory from the GPU.", "TST");
}

//Wait for all previously launched GPU work and throw if any of it failed.
//NOTE(review): cudaThreadSynchronize() is the deprecated spelling of
//cudaDeviceSynchronize(); it is kept so this block builds against the same
//toolkit as before.
static void tstSynchronize() {
	if (cudaThreadSynchronize() != cudaSuccess)
		throw GeneralException("Error executing kernel on the GPU.", "TST");
}

//Calculate TST scores on the GPU.
//
//  class1, class2 : host arrays holding n1 (resp. n2) consecutive runs of m1
//                   floats each, one run per sample
//  m1, m2         : number of rows (genes) in each class; must be equal
//  n1, n2         : number of chips (samples) in each class
//
//For every z in [0, m1) the tstKernel fills an m x m score matrix on the
//device (m is m1 padded up to a multiple of TST_THREADS) and up to three
//maxima are extracted from it with maxKernel/clearKernel.  Each maximum is
//recorded as a ScoreElement (row, col, z, primary score, secondary = -1).
//
//Returns the collected elements sorted by descending primary score.
//Throws GeneralException if m1 != m2 or if any CUDA allocation, copy or
//synchronization fails; all buffers are released in either case.
std::vector<ScoreElement> TST(float *class1, float *class2, unsigned int m1, unsigned int n1, unsigned int m2, unsigned int n2) { 
	
#ifdef DEBUG	
	DisplayDeviceProperties(0);
	
	//Time the execution of this function
	cudaEvent_t start_event, stop_event;
	cudaEventCreate(&start_event);
	cudaEventCreate(&stop_event);
	cudaEventRecord(start_event, 0);
	cudaEventSynchronize(start_event);
	float time_run;
#endif		
			
	//m is the number of rows (genes)
	//n is the number of chips (samples)
	if (m1 != m2) {
		throw GeneralException("Number of genes for class 1 != class 2\n", "TST");
	}	
		
	//Create a padded m which is multiple of TST_THREADS
	unsigned int m = m1;
	if (m1 % TST_THREADS != 0) {
		m = (m1 / TST_THREADS + 1) * TST_THREADS;
	}
	
#ifdef DEBUG	
	printf("Class1 Ranks: [%d, %d] Class2 Ranks: [%d, %d]\n", m1, n1, m2, n2);
	printf("Thread Dimension: %d Padded length: %d\n", TST_THREADS, m);
#endif
	
	//All byte counts are computed in size_t so that products such as m*m
	//cannot wrap around in 32-bit arithmetic before being widened
	const size_t class1_size = (size_t)m * n1 * sizeof(float);
	const size_t class2_size = (size_t)m * n2 * sizeof(float);
	const size_t result_size_gpu = (size_t)m * m * sizeof(float);
	const size_t values_size = (size_t)m * sizeof(float);
	const size_t indices_size = (size_t)m * sizeof(unsigned int);
	const size_t baddata_size = (size_t)m1 * sizeof(float);
	
	//Every buffer below is released by the destructor of buf, both on normal
	//return and when an exception propagates out of this function
	TSTBuffers buf;
	
	//Allocate space on the GPU to store the input data and the score matrix
	tstAllocDevice((void**)&buf.d_class1, class1_size);
	tstAllocDevice((void**)&buf.d_class2, class2_size);
	tstAllocDevice((void**)&buf.d_s1, result_size_gpu);
	
	//Reallocate space for the data with zeroed out padding.  No host copy of
	//the m x m score matrix is allocated: it was never read or transferred.
	tstAllocHost((void**)&buf.h_class1, class1_size);
	tstAllocHost((void**)&buf.h_class2, class2_size);
						
	//Zero out this memory
	memset(buf.h_class1, 0, class1_size);
	memset(buf.h_class2, 0, class2_size);
	
	//Copy over data to new padded array location: each sample occupies m1
	//floats in the input and m floats (zero padded) in the staging buffer
	for (unsigned int i = 0; i < n1; i++) {
		memcpy(buf.h_class1 + (size_t)i * m, class1 + (size_t)i * m1, m1*sizeof(float));
	}	
	for (unsigned int i = 0; i < n2; i++) {
		memcpy(buf.h_class2 + (size_t)i * m, class2 + (size_t)i * m1, m1*sizeof(float));
	}		
									
	//Copy data to the GPU
	tstCopyToDevice(buf.d_class1, buf.h_class1, class1_size);
	tstCopyToDevice(buf.d_class2, buf.h_class2, class2_size);
	
	//Allocate space for the maximum value calculations
	tstAllocDevice((void**)&buf.d_maxValues, values_size);
	tstAllocDevice((void**)&buf.d_maxValue, sizeof(float));
	tstAllocDevice((void**)&buf.d_maxIndices, indices_size);
	tstAllocDevice((void**)&buf.d_maxIndex, sizeof(unsigned int));
	tstAllocHost((void**)&buf.h_maxValues, values_size);
	tstAllocHost((void**)&buf.h_maxValue, sizeof(float));
	tstAllocHost((void**)&buf.h_maxIndices, indices_size);
	tstAllocHost((void**)&buf.h_maxIndex, sizeof(unsigned int));
	
	//Allocate space for the maximum value calculation speedup arrays
	tstAllocDevice((void**)&buf.d_baddata, baddata_size);
	tstAllocDevice((void**)&buf.d_baddata_single, sizeof(float));
	tstAllocHost((void**)&buf.h_baddata, baddata_size);
	tstAllocHost((void**)&buf.h_baddata_single, sizeof(float));
	
	//The host bad data arrays are only ever used as a source of zeros for
	//resetting the device copies, so they are cleared once up front
	memset(buf.h_baddata, 0, baddata_size);
	memset(buf.h_baddata_single, 0, sizeof(float));
		
	//Set the dimension of the blocks and grids
	dim3 dimBlock(TST_THREADS, TST_THREADS);
	dim3 dimGrid(m/TST_THREADS, m/TST_THREADS);	
	
	//Set the dimension of the parallel reduction blocks and grids
	dim3 dimBlockMax(REDUCTION_THREADS, 1, 1);
	dim3 dimGridMax(m, 1, 1);	
	
#ifdef DEBUG	
	printf("Scheduling [%d %d] TST_THREADS in [%d %d] blocks for %d executions\n", TST_THREADS, TST_THREADS, m/TST_THREADS, m/TST_THREADS, m1);
#endif

	std::vector<ScoreElement> scores;
	
	//Get top three for each 2D matrix.  For the current purposes this is enough, 
	//since we are only interested in the maximum value at this time.  If a kTST is
	//developed, we would need a more rigorous method to choose disjoint triples.
	const int stop = 3;

	//No streams here
	for (unsigned int zcoord = 0; zcoord < m1; zcoord++) {

		//Call the TST kernel
		tstKernel<<<dimGrid, dimBlock>>>(buf.d_class1, buf.d_class2, n1, n2, m, zcoord, buf.d_s1);
		tstSynchronize();
	
		//Reset the bad data arrays on the GPU	
		tstCopyToDevice(buf.d_baddata, buf.h_baddata, baddata_size);
		tstCopyToDevice(buf.d_baddata_single, buf.h_baddata_single, sizeof(float));
		
		//Find the set of maximum scores
		buf.h_maxValue[0] = 1.f;
		for (int i = 0; i < stop; i++) {
	
			//First pass: one reduction block per grid entry of dimGridMax
			maxKernel<<<dimGridMax, dimBlockMax>>>(buf.d_s1, m, m1, buf.d_maxValues, buf.d_maxIndices, buf.d_baddata);
			tstSynchronize();
			
			tstCopyToHost(buf.h_maxValues, buf.d_maxValues, values_size);
			tstCopyToHost(buf.h_maxIndices, buf.d_maxIndices, indices_size);
			
			//Second pass: reduce the per-block maxima to a single entry.  The
			//blocking copies below wait for this launch to finish.
			maxKernel<<<1, dimBlockMax>>>(buf.d_maxValues, m, m1, buf.d_maxValue, buf.d_maxIndex, buf.d_baddata_single);
		
			tstCopyToHost(buf.h_maxValue, buf.d_maxValue, sizeof(float));
			tstCopyToHost(buf.h_maxIndex, buf.d_maxIndex, sizeof(unsigned int));
			
			//If we can no longer find scores, exit the loop
			const unsigned int best = buf.h_maxIndex[0];
			if (buf.h_maxValues[best] <= 0) {
				break;
			}
			
			//Convert index into row/column indices.
			//NOTE(review): the flat index is split using m1 rather than the
			//padded length m; whether that is right depends on how maxKernel
			//encodes its indices - confirm against the kernel.
			const unsigned int index = buf.h_maxIndices[best];
			int col = (int)(index / m1);
			int row = (int)(index % m1);
		
			//Add these scores to the vector of scores
			ScoreElement score;
			score.row = row;
			score.col = col;
			score.z = zcoord;
			score.primary = buf.h_maxValues[best];
			score.secondary = -1;
			scores.push_back(score);

			//Clear this row and column
			clearKernel<<<1, dimBlockMax>>>(buf.d_s1, m, row, col, buf.d_baddata);	
			clearKernel<<<1, dimBlockMax>>>(buf.d_s1, m, col, row, buf.d_baddata);			
		
		}		

		//Make sure all GPU work for this z is complete before continuing
		tstSynchronize();
	
	}

	//Resort the scores,in case there are ties of the primary score
	std::sort(scores.begin(), scores.end(), sort_pred_tst);
	
#ifdef DEBUG	
	cudaEventRecord(stop_event, 0);
	cudaEventSynchronize(stop_event); // block until the event is actually recorded
	cudaEventElapsedTime(&time_run, start_event, stop_event);
	printf("Finished running nvTST in %.6f seconds\n", time_run / 1000.0);
	cudaEventDestroy(start_event);
	cudaEventDestroy(stop_event);
#endif		
		
	//Return the scores; device and host buffers are freed by buf's destructor
	return scores;
		
}

//Accumulate, over all samples of one class, how often each of the six possible
//orderings of the three values (row, col, z) occurs, and scale the counts by
//the number of samples.
//
//  data     : class data, read at data[s*stride + gene] for sample s
//  stride   : distance in floats between consecutive samples
//  nsamples : number of samples in the class
//  freq     : output, six entries in the order
//             [0] row<=col<=z  [1] row<=z<=col  [2] col<=row<=z
//             [3] col<=z<=row  [4] z<=row<=col  [5] z<=col<=row
//
//When values tie, several orderings match at once and the sample's unit of
//weight is split evenly between them.
static void tstOrderingFrequencies(const float *data, unsigned int stride, unsigned int nsamples,
		unsigned int row, unsigned int col, unsigned int z, float freq[6]) {

	for (int i = 0; i < 6; i++) {
		freq[i] = 0.f;
	}

	//With a zero stride there are no rows to read, so no sample is visited
	const unsigned int count = (stride == 0) ? 0 : nsamples;

	for (unsigned int s = 0; s < count; s++) {

		//size_t offset so that s*stride cannot wrap in 32-bit arithmetic
		const size_t base = (size_t)s * stride;
		const float a = data[base + row];
		const float b = data[base + col];
		const float c = data[base + z];

		const bool hit[6] = {
			(a <= b) && (b <= c),
			(a <= c) && (c <= b),
			(b <= a) && (a <= c),
			(b <= c) && (c <= a),
			(c <= a) && (a <= b),
			(c <= b) && (b <= a)
		};

		int icount = 0;
		for (int i = 0; i < 6; i++) {
			if (hit[i]) icount = icount + 1;
		}

		//No ordering matched (all comparisons false): nothing to add
		if (icount == 0) continue;

		// Divide if there is a tie
		const float weight = 1.f / (float)icount;
		for (int i = 0; i < 6; i++) {
			if (hit[i]) freq[i] += weight;
		}
	}

	// Scale the permutation scores by the number of elements
	for (int i = 0; i < 6; i++) {
		freq[i] /= (float)nsamples;
	}
}

//Calculate the permutation probabilities for a particular TST and verify.
//
//  class1, class2 : host arrays laid out as consecutive samples of m1 floats
//  m1             : number of rows (genes); used as the sample stride for
//                   both classes
//  n1, n2         : number of samples in class 1 / class 2
//  m2             : not used by this function
//  row, col, z    : the three gene indices forming the triplet
//  class1_probs,
//  class2_probs   : six ordering probabilities are appended to each vector
//
//Returns (sum over the six orderings of |p1 - p2| + 2) / 4.
float CalculateTSTProbabilities(float *class1, float *class2, unsigned int m1, unsigned int n1, unsigned int m2, unsigned int n2, unsigned int row, unsigned int col, unsigned int z, std::vector<float> &class1_probs, std::vector<float> &class2_probs) {

	(void)m2;

	// Ordering probabilities of the triplet in class 1
	float class1_scores[6];
	tstOrderingFrequencies(class1, m1, n1, row, col, z, class1_scores);
	for (int i = 0; i < 6; i++) {
		class1_probs.push_back(class1_scores[i]);
	}

	// Ordering probabilities of the triplet in class 2
	float class2_scores[6];
	tstOrderingFrequencies(class2, m1, n2, row, col, z, class2_scores);
	for (int i = 0; i < 6; i++) {
		class2_probs.push_back(class2_scores[i]);
	}

	// Now calculate the score.  
	float score = 0.f;
	for (int i = 0; i < 6; i++) {
		score += fabsf(class1_scores[i] - class2_scores[i]);
	}
	score = (score + 2.f) / 4.f;
	return score;
}

#endif



