From 3ef0d2200d84980656a20aa52870f459fe24d5cd Mon Sep 17 00:00:00 2001
From: Tomas Sobotik <sobotto4@fjfi.cvut.cz>
Date: Sun, 20 Mar 2016 18:48:19 +0100
Subject: [PATCH] Merging parallel solver 4

---
 .../tnlParallelEikonalSolver3D_impl.h         | 36 ++++++++++---------
 src/core/tnlCuda.cpp                          | 27 +++++++++++++-
 src/core/tnlCuda.cu                           | 24 +++++++++++++
 src/core/tnlCuda.h                            |  9 +++++
 4 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
index 9ffeb8e066..e2171e72dd 100644
--- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
+++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
@@ -635,8 +635,8 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
 			for( int j = 0; j<this->n*this->gridLevels; j++)
 			{
 				int l = j/this->n;
-
-				this->u0[i+(j-l)*mesh.getDimensions().x()*mesh.getDimensions().y()-k] = this->work_u[i+(j-l)*levelSize];
+				if(j - idealStretch  < 0)
+					this->u0[i+(j-l)*mesh.getDimensions().x()*mesh.getDimensions().y()-k] = this->work_u[i+(j-l)*levelSize];
 			}
 		}
 
@@ -1178,25 +1178,25 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
 			{
 				Ent.setCoordinates(tnlStaticVector<3,int>(i,0,k));
 			   	Ent.refresh();
-				u[l] = u[Ent.getIndex()] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.y) ;//+ 2*Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.y+this->n);
+				u[l] = u[Ent.getIndex()] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 0, 1, 0 >()*(threadIdx.y) ;//+ 2*Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.y+this->n);
 			}
 			else if(boundaryCondition == 1)
 			{
 				Ent.setCoordinates(tnlStaticVector<3,int>(i,blockDim.y - 1,k));
 			   	Ent.refresh();
-				u[l] = u[Ent.getIndex()] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(this->n - 1 - threadIdx.y) ;//+ Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(blockDim.y - threadIdx.y  - 1 +this->n);
+				u[l] = u[Ent.getIndex()] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 0, 1, 0 >()*(this->n - 1 - threadIdx.y) ;//+ Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(blockDim.y - threadIdx.y  - 1 +this->n);
 			}
 			else if(boundaryCondition == 32)
 			{
 				Ent.setCoordinates(tnlStaticVector<3,int>(i,j,0));
 			   	Ent.refresh();
-				u[l] = u[Ent.getIndex()] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.z);
+				u[l] = u[Ent.getIndex()] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 0, 0, 1 >()*(threadIdx.z);
 			}
 			else if(boundaryCondition == 16)
 			{
 				Ent.setCoordinates(tnlStaticVector<3,int>(i,j,blockDim.z - 1));
 			   	Ent.refresh();
-				u[l] = u[Ent.getIndex()] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(this->n - 1 - threadIdx.z) ;
+				u[l] = u[Ent.getIndex()] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 0, 0, 1 >()*(this->n - 1 - threadIdx.z) ;
 			}
 		}
 	}
@@ -1225,7 +1225,7 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
 	  if(computeFU)
 		  fu = schemeHost.getValueDev( this->subMesh, l, tnlStaticVector<3,int>(i,j,k)/*this->subMesh.getCellCoordinates(l)*/, u, time, boundaryCondition, neighbourEntities);
 
-	  sharedTau[l]=cfl/fabs(fu);
+	  sharedTau[l]=cfl/abs(fu);
 
       if(l == 0)
       {
@@ -1235,10 +1235,11 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
     	  if( time + sharedTau[l] > finalTime )		sharedTau[l] = finalTime - time;
 
 
-
+      if((blockDim.x == 8) && (l < 256))		sharedTau[l] = Min(sharedTau[l],sharedTau[l+256]);
+      __syncthreads();
       if((blockDim.x == 8) && (l < 128))		sharedTau[l] = Min(sharedTau[l],sharedTau[l+128]);
       __syncthreads();
-      if((blockDim.x == 8) && (l < 64))		sharedTau[l] = Min(sharedTau[l],sharedTau[l+64]);
+      if(l < 64)								sharedTau[l] = Min(sharedTau[l],sharedTau[l+64]);
       __syncthreads();
       if(l < 32)    							sharedTau[l] = Min(sharedTau[l],sharedTau[l+32]);
       if(l < 16)								sharedTau[l] = Min(sharedTau[l],sharedTau[l+16]);
@@ -1308,12 +1309,14 @@ __global__
 void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::*/synchronizeCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now
 {
 
-	__shared__ int boundary[4]; // north,east,west,south
+	__shared__ int boundary[6]; // north,east,west,south,top,bottom
 	__shared__ int subgridValue;
 	__shared__ int newSubgridValue;
 
 
-	int gid = (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + threadIdx.x + (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
+	int gid =  blockDim.x*blockIdx.x + threadIdx.x +
+			  (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x +
+			  (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
 	double u = cudaSolver->work_u_cuda[gid];
 	double u_cmp;
 	int subgridValue_cmp=INT_MAX;
@@ -1331,7 +1334,7 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>
 		boundary[4] = 0;
 		boundary[5] = 0;
 		newSubgridValue = 0;
-		printf("aaa z = %d, y = %d, x = %d\n",blockIdx.z,blockIdx.y,blockIdx.x);
+//		printf("aaa z = %d, y = %d, x = %d\n",blockIdx.z,blockIdx.y,blockIdx.x);
 	}
 	__syncthreads();
 
@@ -1388,19 +1391,20 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>
 			atomicMax(&newSubgridValue, INT_MAX);
 			atomicMax(&boundary[boundary_index], 1);
 			cudaSolver->work_u_cuda[gid] = u_cmp;
+			u=u_cmp;
 		}
 		__threadfence();
 
 		if(threadIdx.z == 0 && (blockIdx.z != 0)/* && (cudaSolver->currentStep & 1)*/)
 		{
 			u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x*blockDim.y*gridDim.y];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y)*gridDim.x + blockIdx.x + (blockIdx.z-1)*gridDim.x*gridDim.y);
+			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + (blockIdx.z-1)*gridDim.x*gridDim.y);
 			boundary_index = 5;
 		}
 		if(threadIdx.z == blockDim.z - 1 && (blockIdx.z != gridDim.z - 1)/* && (cudaSolver->currentStep & 1)*/)
 		{
 			u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x*blockDim.y*gridDim.y];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y)*gridDim.x + blockIdx.x + (blockIdx.z+1)*gridDim.x*gridDim.y);
+			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + (blockIdx.z+1)*gridDim.x*gridDim.y);
 			boundary_index = 4;
 		}
 		if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
@@ -1444,7 +1448,7 @@ void synchronize2CUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Dev
 	int stepValue = cudaSolver->currentStep + 4;
 	if( cudaSolver->getSubgridValueCUDA3D(blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX )
 	{
-			printf("z = %d, y = %d, x = %d\n",blockIdx.z,blockIdx.y,blockIdx.x);
+
 			cudaSolver->setSubgridValueCUDA3D(blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x, stepValue);
 	}
 
@@ -1508,7 +1512,7 @@ void initRunCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device,
 	__shared__ int containsCurve;
 	if(l == 0)
 	{
-		printf("z = %d, y = %d, x = %d\n",blockIdx.z,blockIdx.y,blockIdx.x);
+//		printf("z = %d, y = %d, x = %d\n",blockIdx.z,blockIdx.y,blockIdx.x);
 		containsCurve = 0;
 	}
 
diff --git a/src/core/tnlCuda.cpp b/src/core/tnlCuda.cpp
index bd69fb2cab..20f6272a30 100644
--- a/src/core/tnlCuda.cpp
+++ b/src/core/tnlCuda.cpp
@@ -18,7 +18,9 @@
 #include <core/tnlCuda.h>
 #include <core/mfuncs.h>
 #include <tnlConfig.h>
- 
+#include <config/tnlConfigDescription.h>
+#include <config/tnlParameterContainer.h>
+
 tnlString tnlCuda :: getDeviceType()
 {
    return tnlString( "tnlCuda" );
@@ -46,3 +48,26 @@ int tnlCuda::getNumberOfGrids( const int blocks,
 
 }*/
 
+void tnlCuda::configSetup( tnlConfigDescription& config, const tnlString& prefix )
+{
+#ifdef HAVE_CUDA
+   config.addEntry<  int >( prefix + "cuda-device", "Choose CUDA device to run the computation on.", 0 );
+#else
+   config.addEntry<  int >( prefix + "cuda-device", "Choose CUDA device to run the computation on (not supported on this system).", 0 );
+#endif
+}
+      
+bool tnlCuda::setup( const tnlParameterContainer& parameters,
+                      const tnlString& prefix )
+{
+#ifdef HAVE_CUDA
+   int cudaDevice = parameters.getParameter< int >( "cuda-device" );
+   if( cudaSetDevice( cudaDevice ) != cudaSuccess )
+   {
+      std::cerr << "I cannot activate CUDA device number " << cudaDevice << "." << std::endl;
+      return false;
+   }
+#endif   
+   return true;
+}
+
diff --git a/src/core/tnlCuda.cu b/src/core/tnlCuda.cu
index 9178a3261e..113cba3522 100644
--- a/src/core/tnlCuda.cu
+++ b/src/core/tnlCuda.cu
@@ -16,6 +16,30 @@
  ***************************************************************************/
 
 #include <core/tnlCuda.h>
+#include <config/tnlConfigDescription.h>
+#include <config/tnlParameterContainer.h>
+
+
+/*void tnlCuda::configSetup( tnlConfigDescription& config, const tnlString& prefix )
+{
+#ifdef HAVE_CUDA
+   config.addEntry< int >( prefix + "cuda-device", "Choose CUDA device.", 0 );
+#else
+   config.addEntry< int >( prefix + "cuda-device", "Choose CUDA device (CUDA is not supported on this system).", 0 );   
+#endif   
+}
+      
+bool tnlCuda::setup( const tnlParameterContainer& parameters,
+                    const tnlString& prefix )
+{
+   int cudaDevice = parameters.getParameter< int >( prefix + "cuda-device" );
+#ifdef HAVE_CUDA
+    cudaSetDevice( cudaDevice );
+    checkCudaDevice;
+#endif
+   return true;
+}
+*/
 
 bool tnlCuda::checkDevice( const char* file_name, int line )
 {
diff --git a/src/core/tnlCuda.h b/src/core/tnlCuda.h
index 6889f3f109..3d5e6eb6eb 100644
--- a/src/core/tnlCuda.h
+++ b/src/core/tnlCuda.h
@@ -24,6 +24,9 @@
 #include <core/tnlString.h>
 #include <core/tnlAssert.h>
 
+class tnlConfigDescription;
+class tnlParameterContainer;
+
 #ifdef HAVE_CUDA
 #define __cuda_callable__ __device__ __host__
 #else
@@ -90,6 +93,12 @@ class tnlCuda
 #else
    static bool checkDevice( const char* file_name, int line ) { return false;};
 #endif
+   
+   static void configSetup( tnlConfigDescription& config, const tnlString& prefix = "" );
+      
+   static bool setup( const tnlParameterContainer& parameters,
+                      const tnlString& prefix = "" );
+
 
 };
 
-- 
GitLab