From 4dc3933564a1692a80d56f84fb97b8f653861e8b Mon Sep 17 00:00:00 2001 From: Tomas Sobotik <sobotto4@fjfi.cvut.cz> Date: Sun, 24 Apr 2016 12:22:14 +0200 Subject: [PATCH] Fixed Parallel 3D solver on GPU --- .../tnlParallelEikonalSolver3D_impl.h | 64 +++++--- .../parallelGodunovEikonal3D_impl.h | 147 ++++++++---------- 2 files changed, 110 insertions(+), 101 deletions(-) diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h index 6dcae4f3f5..f8cc71933e 100644 --- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h +++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h @@ -25,7 +25,7 @@ template< typename SchemeHost, typename SchemeDevice, typename Device> tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver() { cout << "a" << endl; - this->device = tnlHostDevice; /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice --- vypocet na CPU + this->device = tnlCudaDevice; /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice --- vypocet na CPU #ifdef HAVE_CUDA if(this->device == tnlCudaDevice) @@ -101,7 +101,7 @@ bool tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>:: test(); VectorType* tmp = new VectorType[subgridValues.getSize()]; - bool containsCurve = false; + #ifdef HAVE_CUDA @@ -148,8 +148,12 @@ bool tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>:: if(this->device == tnlHostDevice) { +#ifdef HAVE_OPENMP +#pragma omp parallel for num_threads(4) schedule(dynamic) +#endif for(int i = 0; i < this->subgridValues.getSize(); i++) { + bool containsCurve = false; // cout << "Working on subgrid " << i <<" --- check 1" << endl; if(! tmp[i].setSize(this->n*this->n*this->n)) @@ -246,9 +250,9 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>:: { bool end = false; - while ((this->boundaryConditions.max() > 0 ) || !end) + while (/*(this->boundaryConditions.max() > 0 ) ||*/ !end) { - if(this->boundaryConditions.max() == 0 ) + if(this->boundaryConditions.max() == 0 || this->subgridValues.max() < 0) end=true; else end=false; @@ -1012,11 +1016,17 @@ __device__ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgridCUDA3D( const int i ,tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a) { //int j = threadIdx.x + threadIdx.y * blockDim.x; - int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows - + (blockIdx.y) * this->n*this->n*this->gridCols - + (blockIdx.x) * this->n - + threadIdx.y * this->n*this->gridCols - + threadIdx.x; +// int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows +// + (blockIdx.y) * this->n*this->n*this->gridCols +// + (blockIdx.x) * this->n +// + threadIdx.y * this->n*this->gridCols +// + threadIdx.x; + + + int index = blockDim.x*blockIdx.x + threadIdx.x + + (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + + (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y; + //printf("i= %d,j= %d,th= %d\n",i,j,th); *a = caller->work_u_cuda[index]; //printf("Hi %f \n", *a); @@ -1029,11 +1039,15 @@ __device__ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::updateSubgridCUDA3D( const int i ,tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a) { // int j = threadIdx.x + threadIdx.y * blockDim.x; - int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows - + (blockIdx.y) * this->n*this->n*this->gridCols - + (blockIdx.x) * this->n - + threadIdx.y * this->n*this->gridCols - + threadIdx.x; +// int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows +// + (blockIdx.y) * this->n*this->n*this->gridCols +// + (blockIdx.x) * this->n +// + threadIdx.y * this->n*this->gridCols +// + threadIdx.x; + + int index = blockDim.x*blockIdx.x + threadIdx.x + + (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + + (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y; if( (fabs(caller->work_u_cuda[index]) > fabs(*a)) || (caller->unusedCell_cuda[index] == 1) ) { @@ -1055,11 +1069,15 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>:: // int j = threadIdx.x + threadIdx.y * blockDim.x; //printf("j = %d, u = %f\n", j,u); - int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows - + (blockIdx.y) * this->n*this->n*this->gridCols - + (blockIdx.x) * this->n - + threadIdx.y * this->n*this->gridCols - + threadIdx.x; +// int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows +// + (blockIdx.y) * this->n*this->n*this->gridCols +// + (blockIdx.x) * this->n +// + threadIdx.y * this->n*this->gridCols +// + threadIdx.x; + + int index = blockDim.x*blockIdx.x + threadIdx.x + + (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + + (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y; //printf("i= %d,j= %d,index= %d\n",i,j,index); if( (fabs(this->work_u_cuda[index]) > fabs(u)) || (this->unusedCell_cuda[index] == 1) ) @@ -1181,7 +1199,7 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>:: // } double finalTime = this->stopTime; __syncthreads(); -// if( boundaryCondition == 0 ) finalTime *= 2.0; + if( boundaryCondition == 0 ) finalTime *= 2.0; tnlGridEntity<MeshType, 3, tnlGridEntityNoStencilStorage > Entity(subMesh); tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage >,3> neighbourEntities(Entity); @@ -1210,7 +1228,7 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>:: if( time + sharedTau[l] > finalTime ) sharedTau[l] = finalTime - time; } - + __syncthreads(); if(l < 256) sharedTau[l] = Min(sharedTau[l],sharedTau[l+256]); __syncthreads(); if(l < 128) sharedTau[l] = Min(sharedTau[l],sharedTau[l+128]); @@ -1387,6 +1405,8 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int> subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + (blockIdx.z + 1)*gridDim.x*gridDim.y); boundary_index = 4; } + __threadfence(); + if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX)) { cudaSolver->unusedCell_cuda[gid] = 0; @@ -1402,7 +1422,7 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int> if(threadIdx.x+threadIdx.y+threadIdx.z == 0) { - if(subgridValue == INT_MAX && newSubgridValue !=0) + if(subgridValue == INT_MAX && newSubgridValue != 0) cudaSolver->setSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y, -INT_MAX); cudaSolver->setBoundaryConditionCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y, 1 * boundary[0] + diff --git a/src/operators/godunov-eikonal/parallelGodunovEikonal3D_impl.h b/src/operators/godunov-eikonal/parallelGodunovEikonal3D_impl.h index ee2db1e2a4..3010b711ba 100644 --- a/src/operators/godunov-eikonal/parallelGodunovEikonal3D_impl.h +++ b/src/operators/godunov-eikonal/parallelGodunovEikonal3D_impl.h @@ -266,115 +266,104 @@ Real parallelGodunovEikonalScheme< tnlGrid< 3, MeshReal, Device, MeshIndex >, Re ) const { RealType signui; - if(boundaryCondition == 0) - signui = sign(u[cellIndex],this->epsilon); - else - signui = Sign(u[cellIndex]); + signui = sign(u[cellIndex], this->epsilon); - RealType xb = u[cellIndex]; - RealType xf = -u[cellIndex]; - RealType yb = u[cellIndex]; - RealType yf = -u[cellIndex]; - RealType zb = u[cellIndex]; - RealType zf = -u[cellIndex]; - RealType a,b,c,d; -// if(threadIdx.x+threadIdx.y+threadIdx.z == 0) -// printf("x = %d, y = %d, z = %d\n",mesh.getDimensions().x() - 1,mesh.getDimensions().y() - 1,mesh.getDimensions().z() - 1); + RealType xb = u[cellIndex]; + RealType xf = -u[cellIndex]; + RealType yb = u[cellIndex]; + RealType yf = -u[cellIndex]; + RealType zb = u[cellIndex]; + RealType zf = -u[cellIndex]; + RealType a,b,c,d; - if(coordinates.x() == mesh.getDimensions().x() - 1) - xf += u[neighbourEntities.template getEntityIndex< -1, 0, 0 >()]; - else - xf += u[neighbourEntities.template getEntityIndex< 1, 0, 0 >()]; + if(coordinates.x() == mesh.getDimensions().x() - 1) + xf += u[neighbourEntities.template getEntityIndex< -1, 0, 0 >()]; + else + xf += u[neighbourEntities.template getEntityIndex< 1, 0, 0 >()]; - if(coordinates.x() == 0) - xb -= u[neighbourEntities.template getEntityIndex< 1, 0, 0 >()]; - else - xb -= u[neighbourEntities.template getEntityIndex< -1, 0, 0 >()]; - - if(coordinates.y() == mesh.getDimensions().y() - 1) - yf += u[neighbourEntities.template getEntityIndex< 0, -1, 0 >()]; - else - yf += u[neighbourEntities.template getEntityIndex< 0, 1, 0 >()]; - - if(coordinates.y() == 0) - yb -= u[neighbourEntities.template getEntityIndex< 0, 1, 0 >()]; - else - yb -= u[neighbourEntities.template getEntityIndex< 0, -1, 0 >()]; + if(coordinates.x() == 0) + xb -= u[neighbourEntities.template getEntityIndex< 1, 0, 0 >()]; + else + xb -= u[neighbourEntities.template getEntityIndex< -1, 0, 0 >()]; + if(coordinates.y() == mesh.getDimensions().y() - 1) + yf += u[neighbourEntities.template getEntityIndex< 0, -1, 0 >()]; + else + yf += u[neighbourEntities.template getEntityIndex< 0, 1, 0 >()]; - if(coordinates.z() == mesh.getDimensions().z() - 1) - zf += u[neighbourEntities.template getEntityIndex< 0, 0, -1 >()]; - else - zf += u[neighbourEntities.template getEntityIndex< 0, 0, 1 >()]; + if(coordinates.y() == 0) + yb -= u[neighbourEntities.template getEntityIndex< 0, 1, 0 >()]; + else + yb -= u[neighbourEntities.template getEntityIndex< 0, -1, 0 >()]; - if(coordinates.z() == 0) - zb -= u[neighbourEntities.template getEntityIndex< 0, 0, 1 >()]; - else - zb -= u[neighbourEntities.template getEntityIndex< 0, 0, -1 >()]; + if(coordinates.z() == mesh.getDimensions().z() - 1) + zf += u[neighbourEntities.template getEntityIndex< 0, 0, -1 >()]; + else + zf += u[neighbourEntities.template getEntityIndex< 0, 0, 1 >()]; - //xb *= ihx; - //xf *= ihx; - // yb *= ihy; - //yf *= ihy; + if(coordinates.z() == 0) + zb -= u[neighbourEntities.template getEntityIndex< 0, 0, 1 >()]; + else + zb -= u[neighbourEntities.template getEntityIndex< 0, 0, -1 >()]; - if(signui > 0.0) - { - xf = negativePart(xf); + if(signui > 0.0) + { + xf = negativePart(xf); - xb = positivePart(xb); + xb = positivePart(xb); - yf = negativePart(yf); + yf = negativePart(yf); - yb = positivePart(yb); + yb = positivePart(yb); - zf = negativePart(zf); + zf = negativePart(zf); - zb = positivePart(zb); + zb = positivePart(zb); - } - else if(signui < 0.0) - { + } + else if(signui < 0.0) + { - xb = negativePart(xb); + xb = negativePart(xb); - xf = positivePart(xf); + xf = positivePart(xf); - yb = negativePart(yb); + yb = negativePart(yb); - yf = positivePart(yf); + yf = positivePart(yf); - zb = negativePart(zb); + zb = negativePart(zb); - zf = positivePart(zf); - } + zf = positivePart(zf); + } - if(xb - xf > 0.0) - a = xb; - else - a = xf; + if(xb - xf > 0.0) + a = xb; + else + a = xf; - if(yb - yf > 0.0) - b = yb; - else - b = yf; + if(yb - yf > 0.0) + b = yb; + else + b = yf; - if(zb - zf > 0.0) - c = zb; - else - c = zf; + if(zb - zf > 0.0) + c = zb; + else + c = zf; - d = ( 1.0 - sqrt(a*a + b*b + c*c)*ihx ); + d = ( 1.0 - sqrt(a*a + b*b + c*c)*ihx ); -// d = 1.0 - sqrt(xf*xf + xb*xb + yf*yf + yb*yb + zf*zf + zb*zb)*ihx; /*upwind*/ + // d = 1.0 - sqrt(xf*xf + xb*xb + yf*yf + yb*yb + zf*zf + zb*zb)*ihx; /*upwind*/ -// if(Sign(d) > 0.0 ) -// return Sign(u[cellIndex])*d; -// else - return signui*d; + if(Sign(d) > 0.0 ) + return Sign(u[cellIndex])*d; + else + return signui*d; } -- GitLab