diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index 08ed947ed751935bab8a4ef31df8573619364b3e..0f45be71c87b69c6b76c283a5e8fa970c45b816b 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -138,7 +138,11 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+<<<<<<< HEAD
                                       TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 );
+=======
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
 
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index e23148db5f278f8cba4a9241023e5dfff6f0c7be..4520eab0a45dceeb5c8a530ffc5140dc8954292e 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -187,6 +187,7 @@ solve( const MeshPointer& mesh,
    {
       // TODO: CUDA code
 #ifdef HAVE_CUDA
+<<<<<<< HEAD
       TNL_CHECK_CUDA_DEVICE;
       const int cudaBlockSize( 16 );
       int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
@@ -271,6 +272,86 @@ solve( const MeshPointer& mesh,
       /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
             BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
       //pocIter ++;
+=======
+      const int cudaBlockSize( 16 );
+      int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+      int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
+      dim3 blockSize( cudaBlockSize, cudaBlockSize );
+      dim3 gridSize( numBlocksX, numBlocksY );
+
+      tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
+
+      TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter;
+      BlockIter.setSize( numBlocksX * numBlocksY );
+      BlockIter.setValue( 0 );
+      /*int* BlockIter = (int*)malloc( ( numBlocksX * numBlocksY ) * sizeof( int ) );
+      for( int i = 0; i < numBlocksX*numBlocksY +1; i++)
+          BlockIter[i] = 1;*/
+
+      int BlockIterD = 1;
+
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
+      BlockIterDevice.setSize( numBlocksX * numBlocksY );
+      BlockIterDevice.setValue( 1 );
+      /*int *BlockIterDevice;
+      cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );
+      cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/
+
+      int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
+
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
+      dBlock.setSize( nBlocks );
+      dBlock.setValue( 0 );
+      /*int *dBlock;
+      cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
+
+      while( BlockIterD )
+      {
+         /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+               BlockIter[ i ] = false;*/
+
+         CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
+                                                          interfaceMapPtr.template getData< Device >(),
+                                                          auxPtr.template modifyData< Device>(),
+                                                          BlockIterDevice );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+
+         BlockIter = BlockIterDevice;
+         //cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost);
+         GetNeighbours( BlockIter, numBlocksX, numBlocksY );
+
+         BlockIterDevice = BlockIter;
+         //cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+
+
+         CudaParallelReduc<<< nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+
+         CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+
+         cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+
+         /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
+               BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
+
+      }
+      /*cudaFree( BlockIterDevice );
+      cudaFree( dBlock );
+      delete BlockIter;*/
+      cudaDeviceSynchronize();
+
+      TNL_CHECK_CUDA_DEVICE;
+
+      aux = *auxPtr;
+      interfaceMap = *interfaceMapPtr;
+#endif
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
    }
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
@@ -285,6 +366,7 @@ solve( const MeshPointer& mesh,
       }
       aux.save("aux-final.tnl");
    }
+<<<<<<< HEAD
 
 #ifdef HAVE_CUDA
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index
@@ -318,10 +400,60 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index
    }
 }
+=======
+template < typename Index >
+void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY )
+{
+   TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom;
+   BlockIterPom.setSize( numBlockX * numBlockY );
+   BlockIterPom.setValue( 0 );
+   /*int* BlockIterPom;
+   BlockIterPom = new int[numBlockX * numBlockY];*/
+   /*for(int i = 0; i < numBlockX * numBlockY; i++)
+       BlockIterPom[ i ] = 0;*/
+   for(int i = 0; i < numBlockX * numBlockY; i++)
+   {
+
+      if( BlockIter[ i ] )
+      {
+         // i = k*numBlockY + m;
+         int m=0, k=0;
+         m = i%numBlockY;
+         k = i/numBlockY;
+         if( k > 0 && numBlockY > 1 )
+            BlockIterPom[i - numBlockX] = 1;
+         if( k < numBlockY-1 && numBlockY > 1 )
+            BlockIterPom[i + numBlockX] = 1;
+
+         if( m >= 0 && m < numBlockX - 1 && numBlockX > 1 )
+            BlockIterPom[ i+1 ] = 1;
+         if( m <= numBlockX -1 && m > 0 && numBlockX > 1 )
+            BlockIterPom[ i-1 ] = 1;
+      }
+   }
+   for(int i = 0; i < numBlockX * numBlockY; i++ ){
+///      if( !BlockIter[ i ] )
+         BlockIter[ i ] = BlockIterPom[ i ];
+///      else
+///         BlockIter[ i ] = 0;
+   }
+   /*for( int i = numBlockX-1; i > -1; i-- )
+   {
+      for( int j = 0; j< numBlockY; j++ )
+         std::cout << BlockIter[ i*numBlockY + j ];
+      std::cout << std::endl;
+   }
+   std::cout << std::endl;*/
+   //delete[] BlockIterPom;
+}
+
+#ifdef HAVE_CUDA
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
                                    TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks )
 {
+<<<<<<< HEAD
    int i = threadIdx.x;
   int blId = blockIdx.x;
 
    __shared__ volatile int sArray[ 512 ];
@@ -363,6 +495,57 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I
 
    if( i == 0 )
       dBlock[ blId ] = sArray[ 0 ];
+=======
+   int i = threadIdx.x;
+   int blId = blockIdx.x;
+   /*if ( i == 0 && blId == 0 ){
+      printf( "nBlocks = %d \n", nBlocks );
+      for( int j = nBlocks-1; j > -1 ; j--){
+         printf( "number = %d \n", BlockIterDevice[ j ] );
+      }
+   }*/
+   __shared__ volatile int sArray[ 512 ];
+   sArray[ i ] = 0;
+   if( blId * 512 + i < nBlocks )
+      sArray[ i ] = BlockIterDevice[ blId * 512 + i ];
+   __syncthreads();
+
+   if (blockDim.x == 1024) {
+      if (i < 512)
+         sArray[ i ] += sArray[ i + 512 ];
+   }
+   __syncthreads();
+   if (blockDim.x >= 512) {
+      if (i < 256) {
+         sArray[ i ] += sArray[ i + 256 ];
+      }
+   }
+   __syncthreads();
+   if (blockDim.x >= 256) {
+      if (i < 128) {
+         sArray[ i ] += sArray[ i + 128 ];
+      }
+   }
+   __syncthreads();
+   if (blockDim.x >= 128) {
+      if (i < 64) {
+         sArray[ i ] += sArray[ i + 64 ];
+      }
+   }
+   __syncthreads();
+   if (i < 32 )
+   {
+      if( blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ];
+      if( blockDim.x >= 32 ) sArray[ i ] += sArray[ i + 16 ];
+      if( blockDim.x >= 16 ) sArray[ i ] += sArray[ i + 8 ];
+      if( blockDim.x >= 8 ) sArray[ i ] += sArray[ i + 4 ];
+      if( blockDim.x >= 4 ) sArray[ i ] += sArray[ i + 2 ];
+      if( blockDim.x >= 2 ) sArray[ i ] += sArray[ i + 1 ];
+   }
+
+   if( i == 0 )
+      dBlock[ blId ] = sArray[ 0 ];
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
 }
 
 
@@ -371,7 +554,11 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+<<<<<<< HEAD
                                       TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne )
+=======
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
 {
    int thri = threadIdx.x; int thrj = threadIdx.y;
    int blIdx = blockIdx.x; int blIdy = blockIdx.y;
@@ -408,7 +595,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
    int i = thri + blockDim.x*blIdx;
    int j = blockDim.y*blIdy + thrj;
    int currentIndex = thrj * blockDim.x + thri;
-
+   if( BlockIterDevice[ blIdy * gridDim.x + blIdx] )
+   {
    //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
    __shared__ volatile bool changed[16*16];
    changed[ currentIndex ] = false;
@@ -444,14 +632,24 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
          sArray[thrj+1][0] = std::numeric_limits< Real >::max();
    }
 
+<<<<<<< HEAD
    if( thri == 2 )
    {
       if( dimY > (blIdy+1) * blockDim.y && thri+1 < xkolik && NE == 1 )
          sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
       else
         sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
+=======
+   if( numOfBlockx - 1 == blIdx )
+      xkolik = dimX - (blIdx)*blockDim.x+1;
+
+   if( numOfBlocky -1 == blIdy )
+      ykolik = dimY - (blIdy)*blockDim.y+1;
+   //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
    }
 
+<<<<<<< HEAD
    if( thri == 3 )
    {
      if( blIdy != 0 && thrj+1 < xkolik && NE == 1 )
@@ -477,6 +675,20 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
    {
       if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
+=======
+   if(thri == 0 && thrj == 0 )
+      BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
+
+   if( thri == 0 )
+   {
+      if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik )
+         sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
+      else
+         sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
+   }
+
+   if( thri == 1 )
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
    {
       changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
    }
@@ -512,6 +724,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
    {
       if( currentIndex < 64 )
       {
+<<<<<<< HEAD
          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
       }
    }
@@ -533,5 +746,74 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
       aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
 }
+=======
+      __syncthreads();
+
+      changed[ currentIndex] = false;
+
+      //calculation of update cell
+      if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+      {
+         if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
+         {
+            changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
+         }
+      }
+      __syncthreads();
+
+      //pyramid reduction
+      if( blockDim.x*blockDim.y == 1024 )
+      {
+         if( currentIndex < 512 )
+         {
+            changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
+         }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y >= 512 )
+      {
+         if( currentIndex < 256 )
+         {
+            changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+         }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y >= 256 )
+      {
+         if( currentIndex < 128 )
+         {
+            changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
+         }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y >= 128 )
+      {
+         if( currentIndex < 64 )
+         {
+            changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+         }
+      }
+      __syncthreads();
+      if( currentIndex < 32 ) //ONLY IFs: A SINGLE WARP IS SYNCHRONOUS
+      {
+         if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+         if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+         if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+         if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+         if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+         if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+      }
+      if( changed[ 0 ] && thri == 0 && thrj == 0 ){
+         BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
+      }
+      __syncthreads();
+   }
+
+   if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
+      aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
+   }
+   /*if( thri == 0 && thrj == 0 )
+      printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
 }
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 4daf9fc920eddd50fb25a4147865193c62149214..8d71bfe06d7c1eb63e63e0527ca58d8d4e95ad9f 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -280,12 +280,17 @@ solve( const MeshPointer& mesh,
                                                        interfaceMapPtr.template getData< Device >(),
                                                        auxPtr.template modifyData< Device>(),
                                                        BlockIterDevice );
+<<<<<<< HEAD
       cudaDeviceSynchronize();
       TNL_CHECK_CUDA_DEVICE;
 
       CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
       cudaDeviceSynchronize();
       TNL_CHECK_CUDA_DEVICE;
+=======
+      //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
+      //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
 
       CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
       cudaDeviceSynchronize();
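
Note on the reduction used above: both sides of the CudaParallelReduc conflict implement the same shared-memory tree reduction, summing the per-block BlockIterDevice flags into dBlock so the host loop can stop once no block reports a change (BlockIterD == 0). Below is a minimal, self-contained sketch of that pattern in plain CUDA, with the TNL containers replaced by raw pointers; the kernel name reduceFlags and the buffers flags/blockSums are illustrative only, and a fixed block size of 512 is assumed to match the <<< nBlocks, 512 >>> launches in the diff.

#include <cuda_runtime.h>

// Per-block sum of 0/1 activity flags; one thread per flag, 512 threads per block.
__global__ void reduceFlags( const int* flags, int* blockSums, int n )
{
   __shared__ volatile int sArray[ 512 ];
   const int i = threadIdx.x;
   const int g = blockIdx.x * 512 + i;

   // Load one flag per thread; threads past the end of the input contribute 0.
   sArray[ i ] = ( g < n ) ? flags[ g ] : 0;
   __syncthreads();

   // Tree reduction: halve the active range, synchronizing between levels.
   for( int s = 256; s > 32; s >>= 1 )
   {
      if( i < s )
         sArray[ i ] += sArray[ i + s ];
      __syncthreads();
   }

   // Final 64 -> 1 elements live in a single warp, so the diff relies on
   // lockstep execution instead of __syncthreads() (the "only ifs" idiom);
   // volatile keeps the compiler from caching sArray in registers. On Volta
   // and newer GPUs this implicit warp synchronization is no longer
   // guaranteed, so __syncwarp() or cub::BlockReduce is the safe replacement.
   if( i < 32 )
   {
      sArray[ i ] += sArray[ i + 32 ];
      sArray[ i ] += sArray[ i + 16 ];
      sArray[ i ] += sArray[ i + 8 ];
      sArray[ i ] += sArray[ i + 4 ];
      sArray[ i ] += sArray[ i + 2 ];
      sArray[ i ] += sArray[ i + 1 ];
   }

   if( i == 0 )
      blockSums[ blockIdx.x ] = sArray[ 0 ];
}

A second launch with a single block, mirroring the CudaParallelReduc<<< 1, nBlocks >>> call in the diff, then reduces blockSums to the one value the host copies back and polls.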