Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +2 −1 Original line number Diff line number Diff line Loading @@ -126,7 +126,8 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, bool *BlockIterDevice ); int *BlockIterDevice ); __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +71 −15 Original line number Diff line number Diff line Loading @@ -27,7 +27,7 @@ template< typename Real, typename Anisotropy > FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: FastSweepingMethod() : maxIterations( 2 ) : maxIterations( 1 ) { } Loading Loading @@ -246,29 +246,37 @@ solve( const MeshPointer& mesh, bool* BlockIter = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) ); //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) ); bool *BlockIterDevice; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ) ); int *BlockIterDevice; int BlockIterD = 1; while( BlockIter[ 0 ] ) cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) ); int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0); int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) ); while( BlockIterD ) { for( int i = 0; i < numBlocksX * numBlocksY; i++ ) BlockIter[ i ] = false; cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( bool ), cudaMemcpyHostToDevice); /*for( int i = 0; i < numBlocksX * numBlocksY; i++ ) BlockIter[ i ] = false;*/ CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice ); cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ), cudaMemcpyDeviceToHost); for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ]; CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } delete[] BlockIter; cudaFree( BlockIterDevice ); cudaFree( dBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; Loading @@ -283,11 +291,58 @@ solve( const MeshPointer& mesh, } #ifdef HAVE_CUDA __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks ) { int i = threadIdx.x; int blId = blockIdx.x; __shared__ volatile int sArray[ 512 ]; sArray[ i ] = false; if(blId * 1024 + i < nBlocks ) sArray[ i ] = BlockIterDevice[ blId * 1024 + i ]; if (blockDim.x * blockDim.y == 1024) { if (i < 512) sArray[ i ] += sArray[ i ]; } __syncthreads(); if (blockDim.x * blockDim.y >= 512) { if (i < 256) { sArray[ i ] += sArray[ i ]; } } if (blockDim.x * blockDim.y >= 256) { if (i < 128) { sArray[ i ] += sArray[ i + 128 ]; } } __syncthreads(); if (blockDim.x * blockDim.y >= 128) { if (i < 64) { sArray[ i ] += sArray[ i + 64 ]; } } __syncthreads(); if (i < 32 ) { if( blockDim.x * blockDim.y >= 64 ) sArray[ i ] += sArray[ i + 32 ]; if( blockDim.x * blockDim.y >= 32 ) sArray[ i ] += sArray[ i + 16 ]; if( blockDim.x * blockDim.y >= 16 ) sArray[ i ] += sArray[ i + 8 ]; if( blockDim.x * blockDim.y >= 8 ) sArray[ i ] += sArray[ i + 4 ]; if( blockDim.x * blockDim.y >= 4 ) sArray[ i ] += sArray[ i + 2 ]; if( blockDim.x * blockDim.y >= 2 ) sArray[ i ] += sArray[ i + 1 ]; } if( i == 0 ) dBlock[ blId ] = sArray[ 0 ]; } template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, bool *BlockIterDevice ) int *BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; int blIdx = blockIdx.x; int blIdy = blockIdx.y; Loading Loading @@ -331,6 +386,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*blockDim.y+1; BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; } __syncthreads(); Loading Loading @@ -360,7 +416,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( thri == 3 ) { if( blIdy != 0 && thri+1 < xkolik ) if( blIdy != 0 && thrj+1 < xkolik ) sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ]; else sArray[0][thrj+1] = TypeInfo< Real >::getMaxValue(); Loading Loading @@ -427,7 +483,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; } if( changed[ 0 ] && thri == 0 && thrj == 0 ) BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = true; BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1; __syncthreads(); } Loading Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +2 −1 Original line number Diff line number Diff line Loading @@ -126,7 +126,8 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, bool *BlockIterDevice ); int *BlockIterDevice ); __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +71 −15 Original line number Diff line number Diff line Loading @@ -27,7 +27,7 @@ template< typename Real, typename Anisotropy > FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: FastSweepingMethod() : maxIterations( 2 ) : maxIterations( 1 ) { } Loading Loading @@ -246,29 +246,37 @@ solve( const MeshPointer& mesh, bool* BlockIter = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) ); //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) ); bool *BlockIterDevice; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ) ); int *BlockIterDevice; int BlockIterD = 1; while( BlockIter[ 0 ] ) cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) ); int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0); int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) ); while( BlockIterD ) { for( int i = 0; i < numBlocksX * numBlocksY; i++ ) BlockIter[ i ] = false; cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( bool ), cudaMemcpyHostToDevice); /*for( int i = 0; i < numBlocksX * numBlocksY; i++ ) BlockIter[ i ] = false;*/ CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice ); cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ), cudaMemcpyDeviceToHost); for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ]; CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } delete[] BlockIter; cudaFree( BlockIterDevice ); cudaFree( dBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; Loading @@ -283,11 +291,58 @@ solve( const MeshPointer& mesh, } #ifdef HAVE_CUDA __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks ) { int i = threadIdx.x; int blId = blockIdx.x; __shared__ volatile int sArray[ 512 ]; sArray[ i ] = false; if(blId * 1024 + i < nBlocks ) sArray[ i ] = BlockIterDevice[ blId * 1024 + i ]; if (blockDim.x * blockDim.y == 1024) { if (i < 512) sArray[ i ] += sArray[ i ]; } __syncthreads(); if (blockDim.x * blockDim.y >= 512) { if (i < 256) { sArray[ i ] += sArray[ i ]; } } if (blockDim.x * blockDim.y >= 256) { if (i < 128) { sArray[ i ] += sArray[ i + 128 ]; } } __syncthreads(); if (blockDim.x * blockDim.y >= 128) { if (i < 64) { sArray[ i ] += sArray[ i + 64 ]; } } __syncthreads(); if (i < 32 ) { if( blockDim.x * blockDim.y >= 64 ) sArray[ i ] += sArray[ i + 32 ]; if( blockDim.x * blockDim.y >= 32 ) sArray[ i ] += sArray[ i + 16 ]; if( blockDim.x * blockDim.y >= 16 ) sArray[ i ] += sArray[ i + 8 ]; if( blockDim.x * blockDim.y >= 8 ) sArray[ i ] += sArray[ i + 4 ]; if( blockDim.x * blockDim.y >= 4 ) sArray[ i ] += sArray[ i + 2 ]; if( blockDim.x * blockDim.y >= 2 ) sArray[ i ] += sArray[ i + 1 ]; } if( i == 0 ) dBlock[ blId ] = sArray[ 0 ]; } template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, bool *BlockIterDevice ) int *BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; int blIdx = blockIdx.x; int blIdy = blockIdx.y; Loading Loading @@ -331,6 +386,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*blockDim.y+1; BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; } __syncthreads(); Loading Loading @@ -360,7 +416,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( thri == 3 ) { if( blIdy != 0 && thri+1 < xkolik ) if( blIdy != 0 && thrj+1 < xkolik ) sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ]; else sArray[0][thrj+1] = TypeInfo< Real >::getMaxValue(); Loading Loading @@ -427,7 +483,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; } if( changed[ 0 ] && thri == 0 && thrj == 0 ) BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = true; BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1; __syncthreads(); } Loading