From bb0933c780bbc0af69eb56ef24b2f9d92a0ae6e1 Mon Sep 17 00:00:00 2001
From: Tomas Sobotik <sobotto4@fjfi.cvut.cz>
Date: Tue, 21 Apr 2015 12:38:37 +0200
Subject: [PATCH] More optimizations

---
 .../tnlParallelEikonalSolver_impl.h           | 62 ++++++++++---------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h
index 7eaa71277b..66d2e84fe1 100644
--- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h
+++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h
@@ -1234,8 +1234,8 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 	int gid = (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + threadIdx.x;
 	double u = cudaSolver->work_u_cuda[gid];
 	double u_cmp;
-	int subgridValue_cmp;
-	int boundary_index;
+	int subgridValue_cmp=INT_MAX;
+	int boundary_index=0;
 
 
 	if(threadIdx.x+threadIdx.y == 0)
@@ -1252,35 +1252,36 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 
 
 
-	if(		(threadIdx.x == 0 				&& blockIdx.x != 0				&& !(cudaSolver->currentStep & 1)) 		||
-			(threadIdx.y == 0 				&& blockIdx.y != 0 				&& (cudaSolver->currentStep & 1)) 		||
-			(threadIdx.x == blockDim.x - 1 	&& blockIdx.x != gridDim.x - 1 	&& !(cudaSolver->currentStep & 1)) 		||
-			(threadIdx.y == blockDim.y - 1 	&& blockIdx.y != gridDim.y - 1 	&& (cudaSolver->currentStep & 1)) 		)
+	if(		(threadIdx.x == 0 				/*				&& !(cudaSolver->currentStep & 1)*/) 		||
+			(threadIdx.y == 0 				 	/*			&& (cudaSolver->currentStep & 1)*/) 		||
+			(threadIdx.x == blockDim.x - 1 	 /*	&& !(cudaSolver->currentStep & 1)*/) 		||
+			(threadIdx.y == blockDim.y - 1 	 /*	&& (cudaSolver->currentStep & 1)*/) 		)
 	{
-		if(threadIdx.x == 0 /*&& !(cudaSolver->currentStep & 1)*/)
+		if(threadIdx.x == 0 && (blockIdx.x != 0)/* && !(cudaSolver->currentStep & 1)*/)
 		{
 			u_cmp = cudaSolver->work_u_cuda[gid - 1];
 			subgridValue_cmp = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x - 1);
 			boundary_index = 2;
 		}
-		if(threadIdx.y == 0 /*&& (cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y - 1)*gridDim.x + blockIdx.x);
-			boundary_index = 3;
-		}
-		if(threadIdx.x == blockDim.x - 1/* && !(cudaSolver->currentStep & 1)*/)
+
+		if(threadIdx.x == blockDim.x - 1 && (blockIdx.x != gridDim.x - 1)/* && !(cudaSolver->currentStep & 1)*/)
 		{
 			u_cmp = cudaSolver->work_u_cuda[gid + 1];
 			subgridValue_cmp = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x + 1);
 			boundary_index = 1;
 		}
-		if(threadIdx.y == blockDim.y - 1 /*&& (cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y + 1)*gridDim.x + blockIdx.x);
-			boundary_index = 0;
-		}
+//		if(threadIdx.y == 0 && (blockIdx.y != 0) && (cudaSolver->currentStep & 1))
+//		{
+//			u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x];
+//			subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y - 1)*gridDim.x + blockIdx.x);
+//			boundary_index = 3;
+//		}
+//		if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1) && (cudaSolver->currentStep & 1))
+//		{
+//			u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x];
+//			subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y + 1)*gridDim.x + blockIdx.x);
+//			boundary_index = 0;
+//		}
 
 		__threadfence();
 		if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
@@ -1288,28 +1289,29 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 			cudaSolver->unusedCell_cuda[gid] = 0;
 			atomicMax(&newSubgridValue, INT_MAX);
 			atomicMax(&boundary[boundary_index], 1);
-			cudaSolver->work_u_cuda[gid] = u_cmp;  ////// unsure
+			cudaSolver->work_u_cuda[gid] = u_cmp;
 		}
 		__threadfence();
-		if(threadIdx.x == blockDim.x - 1/* && !(cudaSolver->currentStep & 1)*/)
+		if(threadIdx.y == 0 && (blockIdx.y != 0)/* && (cudaSolver->currentStep & 1)*/)
 		{
-			u_cmp = cudaSolver->work_u_cuda[gid + 1];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x + 1);
-			boundary_index = 1;
+			u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x];
+			subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y - 1)*gridDim.x + blockIdx.x);
+			boundary_index = 3;
 		}
-		if(threadIdx.x == 0 /*&& !(cudaSolver->currentStep & 1)*/)
+		if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1)/* && (cudaSolver->currentStep & 1)*/)
 		{
-			u_cmp = cudaSolver->work_u_cuda[gid - 1];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x - 1);
-			boundary_index = 2;
+			u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x];
+			subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y + 1)*gridDim.x + blockIdx.x);
+			boundary_index = 0;
 		}
+
 		__threadfence();
 		if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
 		{
 			cudaSolver->unusedCell_cuda[gid] = 0;
 			atomicMax(&newSubgridValue, INT_MAX);
 			atomicMax(&boundary[boundary_index], 1);
-			cudaSolver->work_u_cuda[gid] = u_cmp;  ////// unsure
+			cudaSolver->work_u_cuda[gid] = u_cmp;
 		}
 	}
 	__syncthreads();
-- 
GitLab