Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +4 −0 Original line number Diff line number Diff line Loading @@ -138,7 +138,11 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, <<<<<<< HEAD TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 ); ======= TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ); >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +283 −1 Original line number Diff line number Diff line Loading @@ -187,6 +187,7 @@ solve( const MeshPointer& mesh, { // TODO: CUDA code #ifdef HAVE_CUDA <<<<<<< HEAD TNL_CHECK_CUDA_DEVICE; const int cudaBlockSize( 16 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); Loading Loading @@ -271,6 +272,86 @@ solve( const MeshPointer& mesh, /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ //pocIter ++; ======= const int cudaBlockSize( 16 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); dim3 blockSize( cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY ); tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter; BlockIter.setSize( numBlocksX * numBlocksY ); BlockIter.setValue( 0 ); /*int* BlockIter = (int*)malloc( ( numBlocksX * numBlocksY ) * sizeof( int ) ); for( int i = 0; i < numBlocksX*numBlocksY +1; i++) BlockIter[i] = 1;*/ int BlockIterD = 1; TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; BlockIterDevice.setSize( numBlocksX * numBlocksY ); BlockIterDevice.setValue( 1 ); /*int *BlockIterDevice; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) ); cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/ int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0); TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; dBlock.setSize( nBlocks ); dBlock.setValue( 0 ); /*int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ while( BlockIterD ) { /*for( int i = 0; i < numBlocksX * numBlocksY; i++ ) BlockIter[ i ] = false;*/ CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; BlockIter = BlockIterDevice; //cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost); GetNeighbours( BlockIter, numBlocksX, numBlocksY ); BlockIterDevice = BlockIter; //cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; CudaParallelReduc<<< nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } /*cudaFree( BlockIterDevice ); cudaFree( dBlock ); delete BlockIter;*/ cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; interfaceMap = *interfaceMapPtr; #endif >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf } cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; Loading @@ -285,6 +366,7 @@ solve( const MeshPointer& mesh, } aux.save("aux-final.tnl"); } <<<<<<< HEAD #ifdef HAVE_CUDA template < typename Index > Loading Loading @@ -318,10 +400,60 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index } } ======= template < typename Index > void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY ) { TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom; BlockIterPom.setSize( numBlockX * numBlockY ); BlockIterPom.setValue( 0 ); /*int* BlockIterPom; BlockIterPom = new int[numBlockX * numBlockY];*/ /*for(int i = 0; i < numBlockX * numBlockY; i++) BlockIterPom[ i ] = 0;*/ for(int i = 0; i < numBlockX * numBlockY; i++) { if( BlockIter[ i ] ) { // i = k*numBlockY + m; int m=0, k=0; m = i%numBlockY; k = i/numBlockY; if( k > 0 && numBlockY > 1 ) BlockIterPom[i - numBlockX] = 1; if( k < numBlockY-1 && numBlockY > 1 ) BlockIterPom[i + numBlockX] = 1; if( m >= 0 && m < numBlockX - 1 && numBlockX > 1 ) BlockIterPom[ i+1 ] = 1; if( m <= numBlockX -1 && m > 0 && numBlockX > 1 ) BlockIterPom[ i-1 ] = 1; } } for(int i = 0; i < numBlockX * numBlockY; i++ ){ /// if( !BlockIter[ i ] ) BlockIter[ i ] = BlockIterPom[ i ]; /// else /// BlockIter[ i ] = 0; } /*for( int i = numBlockX-1; i > -1; i-- ) { for( int j = 0; j< numBlockY; j++ ) std::cout << BlockIter[ i*numBlockY + j ]; std::cout << std::endl; } std::cout << std::endl;*/ //delete[] BlockIterPom; } #ifdef HAVE_CUDA >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks ) { <<<<<<< HEAD int i = threadIdx.x; int blId = blockIdx.x; __shared__ volatile int sArray[ 512 ]; Loading Loading @@ -363,6 +495,57 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I if( i == 0 ) dBlock[ blId ] = sArray[ 0 ]; ======= int i = threadIdx.x; int blId = blockIdx.x; /*if ( i == 0 && blId == 0 ){ printf( "nBlocks = %d \n", nBlocks ); for( int j = nBlocks-1; j > -1 ; j--){ printf( "cislo = %d \n", BlockIterDevice[ j ] ); } }*/ __shared__ volatile int sArray[ 512 ]; sArray[ i ] = 0; if( blId * 512 + i < nBlocks ) sArray[ i ] = BlockIterDevice[ blId * 512 + i ]; __syncthreads(); if (blockDim.x == 1024) { if (i < 512) sArray[ i ] += sArray[ i + 512 ]; } __syncthreads(); if (blockDim.x >= 512) { if (i < 256) { sArray[ i ] += sArray[ i + 256 ]; } } __syncthreads(); if (blockDim.x >= 256) { if (i < 128) { sArray[ i ] += sArray[ i + 128 ]; } } __syncthreads(); if (blockDim.x >= 128) { if (i < 64) { sArray[ i ] += sArray[ i + 64 ]; } } __syncthreads(); if (i < 32 ) { if( blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ]; if( blockDim.x >= 32 ) sArray[ i ] += sArray[ i + 16 ]; if( blockDim.x >= 16 ) sArray[ i ] += sArray[ i + 8 ]; if( blockDim.x >= 8 ) sArray[ i ] += sArray[ i + 4 ]; if( blockDim.x >= 4 ) sArray[ i ] += sArray[ i + 2 ]; if( blockDim.x >= 2 ) sArray[ i ] += sArray[ i + 1 ]; } if( i == 0 ) dBlock[ blId ] = sArray[ 0 ]; >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf } Loading @@ -371,7 +554,11 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, <<<<<<< HEAD TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne ) ======= TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ) >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf { int thri = threadIdx.x; int thrj = threadIdx.y; int blIdx = blockIdx.x; int blIdy = blockIdx.y; Loading Loading @@ -408,7 +595,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< int i = thri + blockDim.x*blIdx; int j = blockDim.y*blIdy + thrj; int currentIndex = thrj * blockDim.x + thri; if( BlockIterDevice[ blIdy * gridDim.x + blIdx] ) { //__shared__ volatile bool changed[ blockDim.x*blockDim.y ]; __shared__ volatile bool changed[16*16]; changed[ currentIndex ] = false; Loading Loading @@ -444,14 +632,24 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< sArray[thrj+1][0] = std::numeric_limits< Real >::max(); } <<<<<<< HEAD if( thri == 2 ) { if( dimY > (blIdy+1) * blockDim.y && thri+1 < xkolik && NE == 1 ) sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ]; else sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max(); ======= if( numOfBlockx - 1 == blIdx ) xkolik = dimX - (blIdx)*blockDim.x+1; if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*blockDim.y+1; //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf } <<<<<<< HEAD if( thri == 3 ) { if( blIdy != 0 && thrj+1 < xkolik && NE == 1 ) Loading @@ -477,6 +675,20 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) { if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] ) ======= if(thri == 0 && thrj == 0 ) BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; if( thri == 0 ) { if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik ) sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ]; else sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max(); } if( thri == 1 ) >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf { changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy); } Loading Loading @@ -512,6 +724,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< { if( currentIndex < 64 ) { <<<<<<< HEAD changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; } } Loading @@ -533,5 +746,74 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) ) aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; } ======= __syncthreads(); changed[ currentIndex] = false; //calculation of update cell if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) { if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] ) { changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy); } } __syncthreads(); //pyramid reduction if( blockDim.x*blockDim.y == 1024 ) { if( currentIndex < 512 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; } } __syncthreads(); if( blockDim.x*blockDim.y >= 512 ) { if( currentIndex < 256 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; } } __syncthreads(); if( blockDim.x*blockDim.y >= 256 ) { if( currentIndex < 128 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; } } __syncthreads(); if( blockDim.x*blockDim.y >= 128 ) { if( currentIndex < 64 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; } } __syncthreads(); if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU { if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; } if( changed[ 0 ] && thri == 0 && thrj == 0 ){ BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1; } __syncthreads(); } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) ) aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; } /*if( thri == 0 && thrj == 0 ) printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/ >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf } #endif src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +5 −0 Original line number Diff line number Diff line Loading @@ -280,12 +280,17 @@ solve( const MeshPointer& mesh, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice ); <<<<<<< HEAD cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; ======= //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaDeviceSynchronize(); Loading Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +4 −0 Original line number Diff line number Diff line Loading @@ -138,7 +138,11 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, <<<<<<< HEAD TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 ); ======= TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ); >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +283 −1 Original line number Diff line number Diff line Loading @@ -187,6 +187,7 @@ solve( const MeshPointer& mesh, { // TODO: CUDA code #ifdef HAVE_CUDA <<<<<<< HEAD TNL_CHECK_CUDA_DEVICE; const int cudaBlockSize( 16 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); Loading Loading @@ -271,6 +272,86 @@ solve( const MeshPointer& mesh, /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ //pocIter ++; ======= const int cudaBlockSize( 16 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); dim3 blockSize( cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY ); tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter; BlockIter.setSize( numBlocksX * numBlocksY ); BlockIter.setValue( 0 ); /*int* BlockIter = (int*)malloc( ( numBlocksX * numBlocksY ) * sizeof( int ) ); for( int i = 0; i < numBlocksX*numBlocksY +1; i++) BlockIter[i] = 1;*/ int BlockIterD = 1; TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; BlockIterDevice.setSize( numBlocksX * numBlocksY ); BlockIterDevice.setValue( 1 ); /*int *BlockIterDevice; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) ); cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/ int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0); TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; dBlock.setSize( nBlocks ); dBlock.setValue( 0 ); /*int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ while( BlockIterD ) { /*for( int i = 0; i < numBlocksX * numBlocksY; i++ ) BlockIter[ i ] = false;*/ CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; BlockIter = BlockIterDevice; //cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost); GetNeighbours( BlockIter, numBlocksX, numBlocksY ); BlockIterDevice = BlockIter; //cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; CudaParallelReduc<<< nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } /*cudaFree( BlockIterDevice ); cudaFree( dBlock ); delete BlockIter;*/ cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; interfaceMap = *interfaceMapPtr; #endif >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf } cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; Loading @@ -285,6 +366,7 @@ solve( const MeshPointer& mesh, } aux.save("aux-final.tnl"); } <<<<<<< HEAD #ifdef HAVE_CUDA template < typename Index > Loading Loading @@ -318,10 +400,60 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index } } ======= template < typename Index > void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY ) { TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom; BlockIterPom.setSize( numBlockX * numBlockY ); BlockIterPom.setValue( 0 ); /*int* BlockIterPom; BlockIterPom = new int[numBlockX * numBlockY];*/ /*for(int i = 0; i < numBlockX * numBlockY; i++) BlockIterPom[ i ] = 0;*/ for(int i = 0; i < numBlockX * numBlockY; i++) { if( BlockIter[ i ] ) { // i = k*numBlockY + m; int m=0, k=0; m = i%numBlockY; k = i/numBlockY; if( k > 0 && numBlockY > 1 ) BlockIterPom[i - numBlockX] = 1; if( k < numBlockY-1 && numBlockY > 1 ) BlockIterPom[i + numBlockX] = 1; if( m >= 0 && m < numBlockX - 1 && numBlockX > 1 ) BlockIterPom[ i+1 ] = 1; if( m <= numBlockX -1 && m > 0 && numBlockX > 1 ) BlockIterPom[ i-1 ] = 1; } } for(int i = 0; i < numBlockX * numBlockY; i++ ){ /// if( !BlockIter[ i ] ) BlockIter[ i ] = BlockIterPom[ i ]; /// else /// BlockIter[ i ] = 0; } /*for( int i = numBlockX-1; i > -1; i-- ) { for( int j = 0; j< numBlockY; j++ ) std::cout << BlockIter[ i*numBlockY + j ]; std::cout << std::endl; } std::cout << std::endl;*/ //delete[] BlockIterPom; } #ifdef HAVE_CUDA >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks ) { <<<<<<< HEAD int i = threadIdx.x; int blId = blockIdx.x; __shared__ volatile int sArray[ 512 ]; Loading Loading @@ -363,6 +495,57 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I if( i == 0 ) dBlock[ blId ] = sArray[ 0 ]; ======= int i = threadIdx.x; int blId = blockIdx.x; /*if ( i == 0 && blId == 0 ){ printf( "nBlocks = %d \n", nBlocks ); for( int j = nBlocks-1; j > -1 ; j--){ printf( "cislo = %d \n", BlockIterDevice[ j ] ); } }*/ __shared__ volatile int sArray[ 512 ]; sArray[ i ] = 0; if( blId * 512 + i < nBlocks ) sArray[ i ] = BlockIterDevice[ blId * 512 + i ]; __syncthreads(); if (blockDim.x == 1024) { if (i < 512) sArray[ i ] += sArray[ i + 512 ]; } __syncthreads(); if (blockDim.x >= 512) { if (i < 256) { sArray[ i ] += sArray[ i + 256 ]; } } __syncthreads(); if (blockDim.x >= 256) { if (i < 128) { sArray[ i ] += sArray[ i + 128 ]; } } __syncthreads(); if (blockDim.x >= 128) { if (i < 64) { sArray[ i ] += sArray[ i + 64 ]; } } __syncthreads(); if (i < 32 ) { if( blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ]; if( blockDim.x >= 32 ) sArray[ i ] += sArray[ i + 16 ]; if( blockDim.x >= 16 ) sArray[ i ] += sArray[ i + 8 ]; if( blockDim.x >= 8 ) sArray[ i ] += sArray[ i + 4 ]; if( blockDim.x >= 4 ) sArray[ i ] += sArray[ i + 2 ]; if( blockDim.x >= 2 ) sArray[ i ] += sArray[ i + 1 ]; } if( i == 0 ) dBlock[ blId ] = sArray[ 0 ]; >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf } Loading @@ -371,7 +554,11 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, <<<<<<< HEAD TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne ) ======= TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ) >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf { int thri = threadIdx.x; int thrj = threadIdx.y; int blIdx = blockIdx.x; int blIdy = blockIdx.y; Loading Loading @@ -408,7 +595,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< int i = thri + blockDim.x*blIdx; int j = blockDim.y*blIdy + thrj; int currentIndex = thrj * blockDim.x + thri; if( BlockIterDevice[ blIdy * gridDim.x + blIdx] ) { //__shared__ volatile bool changed[ blockDim.x*blockDim.y ]; __shared__ volatile bool changed[16*16]; changed[ currentIndex ] = false; Loading Loading @@ -444,14 +632,24 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< sArray[thrj+1][0] = std::numeric_limits< Real >::max(); } <<<<<<< HEAD if( thri == 2 ) { if( dimY > (blIdy+1) * blockDim.y && thri+1 < xkolik && NE == 1 ) sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ]; else sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max(); ======= if( numOfBlockx - 1 == blIdx ) xkolik = dimX - (blIdx)*blockDim.x+1; if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*blockDim.y+1; //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf } <<<<<<< HEAD if( thri == 3 ) { if( blIdy != 0 && thrj+1 < xkolik && NE == 1 ) Loading @@ -477,6 +675,20 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) { if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] ) ======= if(thri == 0 && thrj == 0 ) BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; if( thri == 0 ) { if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik ) sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ]; else sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max(); } if( thri == 1 ) >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf { changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy); } Loading Loading @@ -512,6 +724,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< { if( currentIndex < 64 ) { <<<<<<< HEAD changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; } } Loading @@ -533,5 +746,74 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) ) aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; } ======= __syncthreads(); changed[ currentIndex] = false; //calculation of update cell if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) { if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] ) { changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy); } } __syncthreads(); //pyramid reduction if( blockDim.x*blockDim.y == 1024 ) { if( currentIndex < 512 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; } } __syncthreads(); if( blockDim.x*blockDim.y >= 512 ) { if( currentIndex < 256 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; } } __syncthreads(); if( blockDim.x*blockDim.y >= 256 ) { if( currentIndex < 128 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; } } __syncthreads(); if( blockDim.x*blockDim.y >= 128 ) { if( currentIndex < 64 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; } } __syncthreads(); if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU { if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; } if( changed[ 0 ] && thri == 0 && thrj == 0 ){ BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1; } __syncthreads(); } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) ) aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; } /*if( thri == 0 && thrj == 0 ) printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/ >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf } #endif
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +5 −0 Original line number Diff line number Diff line Loading @@ -280,12 +280,17 @@ solve( const MeshPointer& mesh, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice ); <<<<<<< HEAD cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; ======= //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); >>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaDeviceSynchronize(); Loading