/*=================================================================
 *
 *  DisjointPair.h
 *  Author: Andrew Magis
 *  Get list of all disjoint pairs in a large square matrix above a certain
 *  cutoff or a fixed number N.  Sorts output by primary then secondary, if
 *  secondary is present.  Secondary can simply be an empty set, in which case it is ignored.
 *  Inputs: 2D primary scores, 2D secondary scores, cutoff, 0 for max Num or 1 for min cutoff
 *  Outputs: sorted disjoint pairs, index i, index j
 *
 *=================================================================*/

#ifndef _DISJOINTPAIR_H
#define	_DISJOINTPAIR_H

#include <math.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <vector>

//One scored matrix entry. Plain aggregate; the member order below is part of
//the layout and must not change.
struct ScoreElement {
	unsigned int row;	//row index of the entry
	unsigned int col;	//column index of the entry
	unsigned int z;		//third index; not used by the pair search in this file
	float primary;		//primary score (first sort key)
	float secondary;	//secondary score (tie-breaker for equal primary scores)
};

//Sort predicate for std::sort over a vector of ScoreElement.
//Orders by descending primary score; elements whose primary scores are
//neither greater nor less than each other are ordered by descending
//secondary score.
//Returns true when `left` must be placed before `right`.
//Declared inline because this definition lives in a header: without it,
//including the header from more than one translation unit produces
//duplicate-symbol errors at link time.
inline bool sort_pred(const ScoreElement& left, const ScoreElement& right) {
	if (left.primary > right.primary) return true;
	if (left.primary < right.primary) return false;
	//Primary scores tie: fall back to the secondary score
	return left.secondary > right.secondary;
}

std::vector<ScoreElement> DisjointPairKernel(float *primary, float *secondary, int m1, int n1, int stop) {
	
#ifdef DEBUG	
	DisplayDeviceProperties(0);
#endif
	
	//Time the execution of this function
	cudaEvent_t start_event, stop_event;
	cudaEventCreate(&start_event);
    cudaEventCreate(&stop_event);
    cudaEventRecord(start_event, 0);
    cudaEventSynchronize(start_event);
	float time_run;
		
	bool secondary_present = false;
	if (secondary != NULL) {
#ifdef DEBUG	
		printf("Secondary matrix is present\n");
#endif		
		secondary_present = true;
	}		
			
	//Create a padded m which is multiple of THREADS
	unsigned int m;
	if (m1 % TSP_THREADS == 0) {
		m = m1;
	} else {
		m = ((int)(m1 / TSP_THREADS) + 1) * TSP_THREADS;
	}
	
#ifdef DEBUG	
	printf("Matrix Size: [%d, %d] ", m1, n1);
	printf("Thread Dimension: %d Padded length: %d\n", TSP_THREADS, m);
#endif
	
	unsigned long int matrix_size = m*m * sizeof(float);
	
	//Allocate space on the GPU to store the input data
	float *d_matrix;
	if ( cudaMalloc( (void**)&d_matrix, matrix_size ) != cudaSuccess )
       	throw GeneralException("Memory allocating failure on the GPU.", "DisjointPair");		
	
	//Reallocate space for the data with zeroed out padding
	float *h_matrix;
	if (cudaMallocHost((void**)&h_matrix, matrix_size) != cudaSuccess) 
		throw GeneralException("Memory allocating failure on the host.", "DisjointPair");		
			
	//Zero out this memory
	memset(h_matrix, 0, matrix_size);
	
	//Copy over data to new padded array location
	float *temp = h_matrix;
	float *mtemp = (float*)primary;
	for (int i = 0; i < n1; i++) {
		memcpy(temp, mtemp, m1*sizeof(float));
		mtemp += m1;
		temp += m;
	}	
			
	//Copy data to the GPU
	if (cudaMemcpy(d_matrix, h_matrix, matrix_size, cudaMemcpyHostToDevice) != cudaSuccess)
		throw GeneralException("Error copying memory to the GPU.", "DisjointPair");

	//Allocate space on the GPU and host for some vectors to identify 
	//used rows and columns
	float *d_maxValues, *h_maxValues, *d_maxValue, *h_maxValue;
	unsigned int *d_maxIndices, *d_maxIndex, *h_maxIndices, *h_maxIndex;
	if ( cudaMalloc( (void**)&d_maxValues, m*sizeof(float))  != cudaSuccess )
       	throw GeneralException("Memory allocating failure on the GPU.", "DisjointPair");
	if ( cudaMalloc( (void**)&d_maxValue, sizeof(float))  != cudaSuccess )
       	throw GeneralException("Memory allocating failure on the GPU.", "DisjointPair");	
	if ( cudaMalloc( (void**)&d_maxIndices, m*sizeof(float))  != cudaSuccess )
       	throw GeneralException("Memory allocating failure on the GPU.", "DisjointPair");
	if ( cudaMalloc( (void**)&d_maxIndex, sizeof(float))  != cudaSuccess )
       	throw GeneralException("Memory allocating failure on the GPU.", "DisjointPair");	
	if (cudaMallocHost((void**)&h_maxValues, m*sizeof(float)) != cudaSuccess) 
		throw GeneralException("Memory allocating failure on the host.", "DisjointPair");	
	if (cudaMallocHost((void**)&h_maxValue, sizeof(float)) != cudaSuccess) 
		throw GeneralException("Memory allocating failure on the host.", "DisjointPair");	
	if (cudaMallocHost((void**)&h_maxIndices, m*sizeof(float)) != cudaSuccess) 
		throw GeneralException("Memory allocating failure on the host.", "DisjointPair");	
	if (cudaMallocHost((void**)&h_maxIndex, sizeof(float)) != cudaSuccess) 
		throw GeneralException("Memory allocating failure on the host.", "DisjointPair");	

	float *h_baddata, *d_baddata, *h_baddata_single, *d_baddata_single;
    if (cudaMallocHost((void**)&h_baddata, m1*sizeof(float)) != cudaSuccess)
        throw GeneralException("Memory allocating failure on the host.", "DisjointPair");
    if (cudaMallocHost((void**)&h_baddata_single, sizeof(float)) != cudaSuccess)
        throw GeneralException("Memory allocating failure on the host.", "DisjointPair");
	memset(h_baddata, 0, m1*sizeof(float));
	memset(h_baddata_single, 0, sizeof(float));
 	if ( cudaMalloc( (void**)&d_baddata, m1*sizeof(float))  != cudaSuccess )
    	throw GeneralException("Memory allocating failure on the GPU.", "DisjointPair");
    if ( cudaMalloc( (void**)&d_baddata_single, sizeof(float))  != cudaSuccess )
        throw GeneralException("Memory allocating failure on the GPU.", "DisjointPair");
    if (cudaMemcpy(d_baddata, h_baddata, m1*sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess)
        throw GeneralException("Error copying memory to the GPU.", "DisjointPair");
    if (cudaMemcpy(d_baddata_single, h_baddata_single, sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess)
        throw GeneralException("Error copying memory to the GPU.", "DisjointPair");
		
				
	dim3 dimBlockMax(REDUCTION_THREADS, 1, 1);
	dim3 dimGridMax(m, 1, 1);
	std::vector<ScoreElement> scores;	

	h_maxValue[0] = 1.f;	
	
	for (int z = 0; z < (int)stop; z++) {

		maxKernel<<<dimGridMax, dimBlockMax>>>(d_matrix, m, m1, d_maxValues, d_maxIndices, d_baddata);
		cudaThreadSynchronize();
		
		if (cudaMemcpy(h_maxValues, d_maxValues, m*sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess) 
			throw GeneralException("Error copying memory from the GPU.", "DisjointPair");
		if (cudaMemcpy(h_maxIndices, d_maxIndices, m*sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess) 
			throw GeneralException("Error copying memory from the GPU.", "DisjointPair");		
		
		maxKernel<<<1, dimBlockMax>>>(d_maxValues, m, m1, d_maxValue, d_maxIndex, d_baddata_single);
	
		if (cudaMemcpy(h_maxValue, d_maxValue, sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess) 
			throw GeneralException("Error copying memory from the GPU.", "DisjointPair");
		if (cudaMemcpy(h_maxIndex, d_maxIndex, sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess) 
			throw GeneralException("Error copying memory from the GPU.", "DisjointPair");	
		
		//If we can no longer find scores, exit the loop
		if (h_maxValues[h_maxIndex[0]] <= 0) {
			break;
		}
		
		//Convert index into row/column indices
		int index = h_maxIndices[h_maxIndex[0]];
		int col = (int)floor(index/m1);
		int row = index % m1;
	
		//Add these scores to the vector of scores
		ScoreElement score;
		score.row = row;
		score.col = col;
		score.primary = h_maxValues[h_maxIndex[0]];
		if (secondary_present) {
			score.secondary = secondary[index];
		} else {
			score.secondary = 0;
		}
		scores.push_back(score);

		//Clear this row and column
		clearKernel<<<1, dimBlockMax>>>(d_matrix, m, row, col, d_baddata);	
		clearKernel<<<1, dimBlockMax>>>(d_matrix, m, col, row, d_baddata);			
	
	}
	
	//Resort the scores,in case there are ties of the primary score
	std::sort(scores.begin(), scores.end(), sort_pred);
						
	cudaEventRecord(stop_event, 0);
	cudaEventSynchronize(stop_event); // block until the event is actually recorded
	cudaEventElapsedTime(&time_run, start_event, stop_event);
	
#ifdef DEBUG	
	printf("Finished getting max values in %.6f seconds\n", time_run / 1000.0);	
#endif		
		
	//Clear up memory on the device
	cudaFree(d_matrix);
	cudaFree(d_maxValues);
	cudaFree(d_maxValue);
	cudaFree(d_maxIndex);
	cudaFree(d_maxIndices);
	cudaFree(d_baddata);
	cudaFree(d_baddata_single);
	
	//Clear up memory on the host
	cudaFreeHost(h_matrix);
	cudaFreeHost(h_maxValues);
	cudaFreeHost(h_maxValue);	
	cudaFreeHost(h_maxIndices);
	cudaFreeHost(h_maxIndex);
	cudaFreeHost(h_baddata);
	cudaFreeHost(h_baddata_single);
	
	//Return the results
	return scores;
		
}

#endif
