/*=================================================================
 *
 *  TSP.cu
 *  Author: Andrew Magis
 *  Execute the TSP algorithm on the GPU
 *
 *
 *=================================================================*/

#ifndef _TSP_H
#define _TSP_H
 
/*
 * TSP
 *
 * Host-side driver for tspKernel: pads both input matrices to a multiple of
 * TSP_THREADS rows, copies them to the GPU, launches the kernel on an
 * (m/TSP_THREADS x m/TSP_THREADS) grid of (TSP_THREADS x TSP_THREADS) blocks,
 * and copies the un-padded m1 x m1 results back to freshly allocated arrays.
 *
 * Parameters
 *   class1  host pointer to m1*n1 floats, read as n1 consecutive runs of m1
 *           values (one run per sample)
 *   class2  host pointer to m2*n2 floats with the same layout
 *   m1, m2  number of rows (genes) in each class; must be equal
 *   n1, n2  number of chips (samples) in each class
 *
 * Returns
 *   A vector of three pointers, in order the host copies of the buffers
 *   passed to tspKernel as d_s1, d_s2 and d_s5. Each array holds m1*m1
 *   floats and is allocated with new[]; this function never frees them, so
 *   the caller must delete[] each one.
 *   NOTE(review): the third buffer is handed to the kernel as int*, so its
 *   bytes may actually encode integers - confirm against tspKernel.
 *
 * Throws
 *   GeneralException when m1 != m2, when any CUDA allocation or copy fails,
 *   or when the kernel fails to launch or execute. All temporary buffers are
 *   released before the exception propagates.
 */
std::vector<float*> TSP(float *class1, float *class2, unsigned int m1, unsigned int n1, unsigned int m2, unsigned int n2) {

	//Owns every buffer used by this call. The destructor releases whatever is
	//still held, so an exception thrown part-way through cannot leak memory.
	struct Buffers {
		float *d_class1, *d_class2, *d_s1, *d_s2, *d_s5;
		float *h_class1, *h_class2, *h_s1, *h_s2, *h_s5;
		float *output1, *output2, *output5;

		Buffers()
			: d_class1(0), d_class2(0), d_s1(0), d_s2(0), d_s5(0),
			  h_class1(0), h_class2(0), h_s1(0), h_s2(0), h_s5(0),
			  output1(0), output2(0), output5(0) {}

		~Buffers() {
			//cudaFree is documented as a no-op for a null pointer
			cudaFree(d_class1);
			cudaFree(d_class2);
			cudaFree(d_s1);
			cudaFree(d_s2);
			cudaFree(d_s5);
			if (h_class1) cudaFreeHost(h_class1);
			if (h_class2) cudaFreeHost(h_class2);
			if (h_s1) cudaFreeHost(h_s1);
			if (h_s2) cudaFreeHost(h_s2);
			if (h_s5) cudaFreeHost(h_s5);
			//Only non-null if ownership was never handed to the caller
			delete[] output1;
			delete[] output2;
			delete[] output5;
		}

		//Allocate device memory; the pointer is only returned on success
		static float *deviceAlloc(size_t bytes) {
			float *p = 0;
			if ( cudaMalloc( (void**)&p, bytes ) != cudaSuccess )
				throw GeneralException("Memory allocating failure on the GPU.", "TSP");
			return p;
		}

		//Allocate pinned host memory; the pointer is only returned on success
		static float *hostAlloc(size_t bytes) {
			float *p = 0;
			if ( cudaMallocHost( (void**)&p, bytes ) != cudaSuccess )
				throw GeneralException("Memory allocating failure on the host.", "TSP");
			return p;
		}

		//Hand the three output arrays over to the caller
		std::vector<float*> releaseOutputs() {
			std::vector<float*> pointers;
			pointers.reserve(3);
			pointers.push_back(output1);
			pointers.push_back(output2);
			pointers.push_back(output5);
			//Ownership transferred: keep the destructor from deleting them
			output1 = output2 = output5 = 0;
			return pointers;
		}
	};

#ifdef DEBUG
	//Pair of timing events that is destroyed on every exit path
	struct EventPair {
		cudaEvent_t start_event, stop_event;
		EventPair() {
			cudaEventCreate(&start_event);
			cudaEventCreate(&stop_event);
		}
		~EventPair() {
			cudaEventDestroy(start_event);
			cudaEventDestroy(stop_event);
		}
	};

	DisplayDeviceProperties(0);

	//Time the execution of this function
	EventPair timer;
	cudaEventRecord(timer.start_event, 0);
	cudaEventSynchronize(timer.start_event);
	float time_run;
#endif

	//m is the number of rows (genes)
	//n is the number of chips (samples)
	if (m1 != m2) {
		throw GeneralException("Number of genes for class 1 != class 2\n", "TSP");
	}

	//With no genes there is nothing to compute, and a grid with a zero
	//dimension is an invalid launch configuration: return empty results.
	if (m1 == 0) {
		Buffers empty;
		empty.output1 = new float[0];
		empty.output2 = new float[0];
		empty.output5 = new float[0];
		return empty.releaseOutputs();
	}

	//Create a padded m which is multiple of THREADS
	const unsigned int m = (m1 % TSP_THREADS == 0) ? m1 : (m1 / TSP_THREADS + 1) * TSP_THREADS;

#ifdef DEBUG
	printf("Class1 Ranks: [%u, %u] Class2 Ranks: [%u, %u]\n", m1, n1, m2, n2);
	printf("Thread Dimension: %d Padded length: %u\n", TSP_THREADS, m);
#endif

	//Widen to size_t BEFORE multiplying: in 32-bit arithmetic m*m wraps
	//around as soon as m reaches 65536.
	const size_t row_bytes    = (size_t)m1 * sizeof(float);
	const size_t class1_size  = (size_t)m * n1 * sizeof(float);
	const size_t class2_size  = (size_t)m * n2 * sizeof(float);
	const size_t result_size  = (size_t)m * m * sizeof(float);
	const size_t output_count = (size_t)m1 * m1;

	Buffers buf;

	//Allocate space on the GPU to store the input data
	buf.d_class1 = Buffers::deviceAlloc(class1_size);
	buf.d_class2 = Buffers::deviceAlloc(class2_size);

	//Allocate space on the GPU to store the output data
	buf.d_s1 = Buffers::deviceAlloc(result_size);
	buf.d_s2 = Buffers::deviceAlloc(result_size);
	buf.d_s5 = Buffers::deviceAlloc(result_size);

	//Reallocate space for the data on the host with zeroed out padding
	buf.h_class1 = Buffers::hostAlloc(class1_size);
	buf.h_class2 = Buffers::hostAlloc(class2_size);
	buf.h_s1 = Buffers::hostAlloc(result_size);
	buf.h_s2 = Buffers::hostAlloc(result_size);
	buf.h_s5 = Buffers::hostAlloc(result_size);

	//Allocate space for the non-buffered output arrays
	buf.output1 = new float[output_count];
	buf.output2 = new float[output_count];
	buf.output5 = new float[output_count];

	//Zero out the memory on the host so the padding rows are zero
	memset(buf.h_class1, 0, class1_size);
	memset(buf.h_class2, 0, class2_size);

	//Copy over data to new padded array location on host:
	//each sample is m1 values in the input and m values in the padded copy
	for (unsigned int col = 0; col < n1; col++) {
		memcpy(buf.h_class1 + (size_t)col * m, class1 + (size_t)col * m1, row_bytes);
	}
	for (unsigned int col = 0; col < n2; col++) {
		memcpy(buf.h_class2 + (size_t)col * m, class2 + (size_t)col * m1, row_bytes);
	}

	//Copy data to the GPU
	if (cudaMemcpy(buf.d_class1, buf.h_class1, class1_size, cudaMemcpyHostToDevice) != cudaSuccess)
		throw GeneralException("Error copying memory to the GPU.", "TSP");
	if (cudaMemcpy(buf.d_class2, buf.h_class2, class2_size, cudaMemcpyHostToDevice) != cudaSuccess)
		throw GeneralException("Error copying memory to the GPU.", "TSP");

	//Set the dimension of the blocks and grids
	dim3 dimBlock(TSP_THREADS, TSP_THREADS);
	dim3 dimGrid(m/TSP_THREADS, m/TSP_THREADS);

#ifdef DEBUG
	printf("Scheduling [%d %d] threads in [%d %d] blocks\n", TSP_THREADS, TSP_THREADS, (int)(m/TSP_THREADS), (int)(m/TSP_THREADS));
#endif

	tspKernel<<<dimGrid, dimBlock>>>(buf.d_class1, buf.d_class2, n1, n2, m, buf.d_s1, buf.d_s2, (int*)buf.d_s5);

	//A bad launch configuration is only reported through cudaGetLastError;
	//faults inside the kernel surface at the next synchronizing call.
	if (cudaGetLastError() != cudaSuccess)
		throw GeneralException("Error launching kernel on the GPU.", "TSP");
	if (cudaDeviceSynchronize() != cudaSuccess)
		throw GeneralException("Error executing kernel on the GPU.", "TSP");

	//Copy the memory back
	if (cudaMemcpy(buf.h_s1, buf.d_s1, result_size, cudaMemcpyDeviceToHost) != cudaSuccess)
		throw GeneralException("Error copying memory from the GPU.", "TSP");
	if (cudaMemcpy(buf.h_s2, buf.d_s2, result_size, cudaMemcpyDeviceToHost) != cudaSuccess)
		throw GeneralException("Error copying memory from the GPU.", "TSP");
	if (cudaMemcpy(buf.h_s5, buf.d_s5, result_size, cudaMemcpyDeviceToHost) != cudaSuccess)
		throw GeneralException("Error copying memory from the GPU.", "TSP");

	//Finally, copy the padded array data into the output matrix:
	//keep the first m1 values of each of the first m1 padded rows
	for (unsigned int row = 0; row < m1; row++) {
		const size_t src = (size_t)row * m;
		const size_t dst = (size_t)row * m1;
		memcpy(buf.output1 + dst, buf.h_s1 + src, row_bytes);
		memcpy(buf.output2 + dst, buf.h_s2 + src, row_bytes);
		memcpy(buf.output5 + dst, buf.h_s5 + src, row_bytes);
	}

#ifdef DEBUG
	cudaEventRecord(timer.stop_event, 0);
	cudaEventSynchronize(timer.stop_event); // block until the event is actually recorded
	cudaEventElapsedTime(&time_run, timer.start_event, timer.stop_event);
	printf("Finished running nvTSP in %.6f seconds\n", time_run / 1000.0);
#endif

	//Device and pinned host buffers are released by ~Buffers on return;
	//the three output arrays now belong to the caller.
	return buf.releaseOutputs();

}

#endif
