/*=================================================================
 *
 *  Kernels.h
 *  Author: Andrew Magis
 *  All of the GPU kernels for the tsp_cuda program
 *
 *=================================================================*/

 #ifndef _KERNELS_H
 #define _KERNELS_H
 
/*
 * Print a short summary of one CUDA device to stdout, framed by dashed
 * separator lines: the device name, its total global memory in KB and the
 * maximum number of threads per block.
 *
 * device: ordinal passed straight to cudaGetDeviceProperties().
 *
 * If the query fails, the CUDA error string is printed instead of the
 * summary; the function never aborts.
 */
void DisplayDeviceProperties(int device) {

    cudaDeviceProp deviceProp;
    memset(&deviceProp, 0, sizeof (deviceProp));

    printf("-----\n");

    // Keep the status returned by the query itself so the message below
    // describes this call and not some unrelated earlier failure.
    const cudaError_t status = cudaGetDeviceProperties(&deviceProp, device);

    if (cudaSuccess == status) {
        printf("Device Name\t\t\t\t%s\n", deviceProp.name);
        // totalGlobalMem is a size_t; print it through an explicit
        // unsigned long long so the format matches on every platform
        // (%ld is wrong where long is 32 bits and for unsigned values).
        printf("Total Global Memory\t\t\t%llu KB\n",
               (unsigned long long)(deviceProp.totalGlobalMem / 1024));
        printf("Maximum threads per block\t\t%d\n", deviceProp.maxThreadsPerBlock);
    } else {
        printf("\n%s", cudaGetErrorString(status));
        // Reset the runtime's "last error" state, as the original
        // cudaGetLastError() call did.
        (void)cudaGetLastError();
    }

    printf("------\n");

}

/*
void availableMemory(int dev, unsigned long int necessary_gpu_memory){

    size_t available_gpu_memory, total_gpu_memory;
    CUcontext ctx;
    cuInit(dev);

    cuCtxCreate(&ctx, 0, dev);
    cuMemGetInfo(&available_gpu_memory, &total_gpu_memory);
    cuCtxDetach(ctx);
    if(necessary_gpu_memory >available_gpu_memory   ){
        printf("Not enough memory on the GPU to run this calculation\n");
        printf("Necessary: %ld Available: %ld\n", necessary_gpu_memory, available_gpu_memory);
        exit(1);
    }
}*/

// Compile-time thread counts.
//   TSP_THREADS       - not referenced by the kernels in this header;
//                       presumably the block edge used when launching
//                       tspKernel (confirm against the host code).
//   TST_THREADS       - block edge assumed by tstKernel; it sizes that
//                       kernel's per-thread shared-memory slots.
//   REDUCTION_THREADS - number of shared-memory slots used by maxKernel.
#define TSP_THREADS        16
#define TST_THREADS        8
#define REDUCTION_THREADS  128

// Small arithmetic helpers. These are function-like macros, so every
// argument may be evaluated more than once: do not pass expressions
// with side effects.
//   ABSMACRO(X)       - X if X is not below zero, otherwise -X.
//   ABSBINARYMACRO(X) - 0 if X is below zero, otherwise 1.
//   MINMACRO(X,Y)     - X if X < Y, otherwise Y.
//   MAXMACRO(X,Y)     - X if X > Y, otherwise Y.
#define ABSMACRO(X)        ( ((X) < 0) ? -(X) : (X) )
#define ABSBINARYMACRO(X)  ( ((X) < 0) ? 0 : 1 )
#define MINMACRO(X,Y)      ( ((X) < (Y)) ? (X) : (Y) )
#define MAXMACRO(X,Y)      ( ((X) > (Y)) ? (X) : (Y) )

/*
 * tspKernel: fill three m x m matrices with pairwise column statistics.
 *
 * Launch layout: a 2D grid of 2D blocks. Each thread owns one pair (gx, gy)
 * where gx = blockIdx.x*blockDim.x + threadIdx.x and
 *       gy = blockIdx.y*blockDim.y + threadIdx.y.
 * Threads whose gx or gy is >= m do nothing, so the grid may be rounded up.
 *
 * Inputs
 *   d_class1, d_class2 : row-major arrays read as n1 (resp. n2) rows of m
 *                        floats; element (row r, column c) is at r*m + c.
 *   n1, n2             : number of rows in each class array.
 *   m                  : number of columns and edge length of the outputs.
 *
 * Outputs (all indexed at gx*m + gy)
 *   primary   : |score1 - score2|, where score_k is the fraction of rows of
 *               class k in which (column gx - column gy) has its sign bit set,
 *               i.e. column gx holds the smaller value.
 *   secondary : |rank1 - rank2|, where rank_k is the mean over the rows of
 *               class k of (column gx - column gy).
 *   vote      : 0 if score1 - score2 is below zero, otherwise 1.
 *
 * Only pairs with gx > gy are actually evaluated; every other in-range pair
 * gets primary = 0, secondary = 0 and vote = 1.
 */
__global__ void tspKernel(float *d_class1, float *d_class2, unsigned int n1, unsigned int n2, unsigned int m, float *primary, float *secondary, int *vote) {

    // Global coordinates of the pair handled by this thread
    const unsigned int gx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int gy = blockIdx.y*blockDim.y + threadIdx.y;

    // Threads outside the m x m matrix must neither read nor write: gx*m + gy
    // would alias another entry of the matrix or fall off its end.
    if (gx >= m || gy >= m) {
        return;
    }

    float class1_score = 0.f;
    float class2_score = 0.f;
    float class1_rank = 0.f;
    float class2_rank = 0.f;
    const float n1_invert = __fdividef(1.f, (float)n1);
    const float n2_invert = __fdividef(1.f, (float)n2);

    // Only one triangle of the matrix is computed; the diagonal and the other
    // triangle keep the zero-initialised values above.
    if (gx > gy) {

        // Column gx and column gy of class 1
        const float *data1 = d_class1 + gx;
        const float *data2 = d_class1 + gy;

        // Walk the rows by index and widen the offset to size_t: the original
        // signed counter compared against the 32-bit product n1*m could
        // overflow for large inputs.
        for (unsigned int r = 0; r < n1; r++) {
            const size_t off = (size_t)r * m;
            const float diff = data1[off] - data2[off];
            // signbit() is only documented to return "nonzero" when the sign
            // bit is set, so normalise it to exactly 1 before accumulating.
            class1_score += signbit(diff) ? 1.f : 0.f;
            class1_rank += diff;
        }
        class1_score = class1_score * n1_invert;
        class1_rank = class1_rank * n1_invert;

        // Column gx and column gy of class 2
        data1 = d_class2 + gx;
        data2 = d_class2 + gy;

        for (unsigned int r = 0; r < n2; r++) {
            const size_t off = (size_t)r * m;
            const float diff = data1[off] - data2[off];
            class2_score += signbit(diff) ? 1.f : 0.f;
            class2_rank += diff;
        }
        class2_score = class2_score * n2_invert;
        class2_rank = class2_rank * n2_invert;
    }

    // Write the result to global memory
    const size_t out = (size_t)gx * m + gy;
    primary[out] = ABSMACRO(class1_score-class2_score);
    secondary[out] = ABSMACRO(class1_rank-class2_rank);
    vote[out] = ABSBINARYMACRO(class1_score-class2_score);

}

/*
 * Helper for tstKernel: accumulate, over n rows, how often each of the six
 * possible orderings of three columns occurs, then divide by n.
 *
 *   data1, data2, data3 : pointers to the first-row element of the three
 *                         columns; consecutive rows are m floats apart.
 *   n                   : number of rows to visit.
 *   m                   : row stride in floats.
 *   scores              : six accumulators, updated in place. The caller must
 *                         have zeroed them.
 *
 * Orderings, by slot: 0: d1<=d2<=d3, 1: d1<=d3<=d2, 2: d2<=d1<=d3,
 *                     3: d2<=d3<=d1, 4: d3<=d1<=d2, 5: d3<=d2<=d1.
 * When equal values make several orderings match in one row, that row's unit
 * of weight is split evenly between the matching slots.
 */
static __device__ __forceinline__ void tstAccumulateOrderings(const float *data1, const float *data2, const float *data3,
                                                              unsigned int n, unsigned int m, float *scores) {

    for (unsigned int r = 0; r < n; r++) {

        // Copy all the data into registers first
        const size_t off = (size_t)r * m;
        const float v1 = data1[off];
        const float v2 = data2[off];
        const float v3 = data3[off];

        // One indicator per ordering (per-thread scratch, kept out of
        // shared memory)
        float hit[6];
        hit[0] = ((v1 <= v2) && (v2 <= v3)) ? 1.f : 0.f;
        hit[1] = ((v1 <= v3) && (v3 <= v2)) ? 1.f : 0.f;
        hit[2] = ((v2 <= v1) && (v1 <= v3)) ? 1.f : 0.f;
        hit[3] = ((v2 <= v3) && (v3 <= v1)) ? 1.f : 0.f;
        hit[4] = ((v3 <= v1) && (v1 <= v2)) ? 1.f : 0.f;
        hit[5] = ((v3 <= v2) && (v2 <= v1)) ? 1.f : 0.f;

        // Number of orderings that matched this row
        float icount = 0.f;
        #pragma unroll
        for (int j = 0; j < 6; j++) {
            icount += hit[j];
        }

        // If there was a tie, share the weight (won't happen very often)
        if (icount > 1.f) {
            #pragma unroll
            for (int j = 0; j < 6; j++) {
                hit[j] = __fdividef(hit[j], icount);
            }
        }

        // Now add the result for each case to the running scores
        #pragma unroll
        for (int j = 0; j < 6; j++) {
            scores[j] += hit[j];
        }
    }

    // At the end, scale the scores by the number of rows
    const float ninverse = 1.f / (float)n;
    #pragma unroll
    for (int j = 0; j < 6; j++) {
        scores[j] *= ninverse;
    }
}

/*
 * tstKernel: for a fixed third column zcoord, score every column pair
 * (gx, gy) by how differently the three columns are ordered in the two
 * classes.
 *
 * Launch layout: a 2D grid of 2D blocks with blockDim.x and blockDim.y both
 * at most TST_THREADS (each thread owns the shared-memory slot
 * threadIdx.x*TST_THREADS + threadIdx.y). gx and gy are the usual
 * blockIdx*blockDim + threadIdx coordinates; threads with gx or gy >= m do
 * nothing, so the grid may be rounded up.
 *
 * Inputs
 *   d_class1, d_class2 : row-major arrays read as n1 (resp. n2) rows of m
 *                        floats.
 *   n1, n2             : number of rows in each class array.
 *   m                  : number of columns and edge length of d_s1.
 *   zcoord             : index of the third column.
 *
 * Output
 *   d_s1[gx*m + gy] : sum over the six orderings of
 *                     |frequency in class 1 - frequency in class 2|.
 *                     Only triples with gx > gy > zcoord are evaluated; every
 *                     other in-range entry is written as 0.
 */
__global__ void tstKernel(float *d_class1, float *d_class2, unsigned int n1, unsigned int n2, unsigned int m, unsigned int zcoord, float *d_s1) {

    // Per-thread score accumulators live in shared memory, six per thread
    __shared__ float sclass1_scores[6*TST_THREADS*TST_THREADS];
    __shared__ float sclass2_scores[6*TST_THREADS*TST_THREADS];

    // Global coordinates of the pair handled by this thread
    const unsigned int gx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int gy = blockIdx.y*blockDim.y + threadIdx.y;

    // Threads outside the m x m matrix must neither read nor write: gx*m + gy
    // would alias another entry of the matrix or fall off its end. No block
    // barrier is used in this kernel, so leaving early is safe.
    if (gx >= m || gy >= m) {
        return;
    }

    float *class1_scores = &sclass1_scores[6*(threadIdx.x*TST_THREADS+threadIdx.y)];
    float *class2_scores = &sclass2_scores[6*(threadIdx.x*TST_THREADS+threadIdx.y)];

    // Shared memory starts uninitialised: zero this thread's slots
    #pragma unroll
    for (int i = 0; i < 6; i++) {
        class1_scores[i] = 0.f;
        class2_scores[i] = 0.f;
    }

    // Only strictly ordered triples gx > gy > zcoord are evaluated; all
    // other threads keep their zeroed scores.
    if ((gx > gy) && (gy > zcoord)) {
        tstAccumulateOrderings(d_class1 + gx, d_class1 + gy, d_class1 + zcoord, n1, m, class1_scores);
        tstAccumulateOrderings(d_class2 + gx, d_class2 + gy, d_class2 + zcoord, n2, m, class2_scores);
    }

    // Finally, sum the result
    float sum = 0.f;
    #pragma unroll
    for (int i = 0; i < 6; i++) {
        sum += (float)ABSMACRO(class1_scores[i]-class2_scores[i]);
    }

    // Write the result to global memory
    d_s1[(size_t)gx * m + gy] = sum;
}

/*
 * maxKernel: per-row maximum of an m-column matrix, with its position.
 *
 * Launch layout: one block per row (blockIdx.x selects the row) of exactly
 * REDUCTION_THREADS threads in x. The in-block reduction halves blockDim.x
 * repeatedly, so that size must be a power of two, and the shared arrays
 * hold REDUCTION_THREADS entries.
 *
 * Inputs
 *   d_tsp     : matrix data; row b starts at d_tsp + m*b.
 *   m         : number of elements scanned per row.
 *   m1        : row stride used only to build the reported index.
 *   d_baddata : one flag per row; a non-zero flag makes the block skip the
 *               row and report value 0 / index 0.
 *
 * Outputs (one entry per block)
 *   maxValue[b] : largest element found in row b. The running maximum starts
 *                 at -1e-6f, so that value is reported if nothing exceeds it.
 *   maxIndex[b] : m1*b + column of that element.
 *
 * The row is processed in chunks of REDUCTION_THREADS elements; unused slots
 * of the last chunk are padded with value 0 and index 0. Comparisons are
 * strict, so the first occurrence wins among equal values.
 */
__global__ void maxKernel(float *d_tsp, unsigned int m, unsigned int m1, float *maxValue, unsigned int *maxIndex, float *d_baddata) {

    __shared__ float sdata[REDUCTION_THREADS];
    // Indices are kept as integers: a float cannot represent every value
    // above 2^24, which silently corrupted large indices before.
    __shared__ unsigned int sIndex[REDUCTION_THREADS];
    float s_maxValue = -1e-6f;
    unsigned int s_index = 0;

    // The flag depends only on blockIdx.x, so the whole block leaves together
    // and no thread is left waiting at a barrier below.
    if (d_baddata[blockIdx.x] != 0) {
        if (threadIdx.x == 0) {
            maxValue[blockIdx.x] = 0.f;
            maxIndex[blockIdx.x] = 0;
        }
        return;
    }

    // Start of this block's row (offset widened to avoid 32-bit overflow)
    const float *row_start = d_tsp + (size_t)m * blockIdx.x;

    for (unsigned int i = 0; i < m; i+=REDUCTION_THREADS) {

        // Check to see if we will overshoot the actual data
        const unsigned int remaining = m - i;
        const unsigned int WA = remaining > REDUCTION_THREADS ? REDUCTION_THREADS : remaining;

        // Load one chunk; slots past the end of the row are padded
        if (threadIdx.x < WA) {
            sdata[threadIdx.x] = row_start[i + threadIdx.x];
            sIndex[threadIdx.x] = m1*blockIdx.x + i + threadIdx.x;
        } else {
            sdata[threadIdx.x] = 0.f;
            sIndex[threadIdx.x] = 0;
        }
        __syncthreads();

        // do reduction in shared mem
        for (unsigned int s = blockDim.x/2; s > 0; s >>= 1) {
            if (threadIdx.x < s) {
                if (sdata[threadIdx.x + s] > sdata[threadIdx.x]) {
                    sdata[threadIdx.x] = sdata[threadIdx.x + s];
                    sIndex[threadIdx.x] = sIndex[threadIdx.x + s];
                }
            }
            __syncthreads();
        }

        // Keep track of largest element of this round. Thread 0 only reads
        // its own slot here, so the next iteration's reloads cannot race
        // with it.
        if (threadIdx.x == 0) {
            if (sdata[0] > s_maxValue) {
                s_maxValue = sdata[0];
                s_index = sIndex[0];
            }
        }
    }

    if (threadIdx.x == 0) {
        maxValue[blockIdx.x] = s_maxValue;
        maxIndex[blockIdx.x] = s_index;
    }
}

/*
 * clearKernel: zero one line of an m x m matrix in each direction and flag
 * the cleared line as bad.
 *
 * For every k in [0, m) the kernel writes
 *   d_tsp[m*col + k] = 0   (the m contiguous elements starting at m*col)
 *   d_tsp[m*k + row] = 0   (the elements at stride m starting at row)
 * and then sets d_baddata[col] = 1.
 *
 * Launch layout: any 1D configuration. The loop strides over k by the total
 * number of threads in x, so a single block of REDUCTION_THREADS threads
 * behaves as before, while smaller or larger launches still cover every
 * element.
 */
__global__ void clearKernel(float *d_tsp, unsigned int m, unsigned int row, unsigned int col, float *d_baddata) {

    const size_t stride = (size_t)gridDim.x * blockDim.x;
    const size_t first = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    // No shared memory is involved, so no barrier is needed between
    // iterations. Addresses are only formed for in-range k.
    for (size_t k = first; k < m; k += stride) {
        d_tsp[(size_t)m * col + k] = 0.f;
        d_tsp[(size_t)m * k + row] = 0.f;
    }

    // One writer per block is enough; every writer stores the same value.
    if (threadIdx.x == 0) {
        d_baddata[col] = 1.f;
    }
}

#endif
