Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +7 −2 Original line number Diff line number Diff line Loading @@ -113,6 +113,8 @@ T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double template < typename T1 > __cuda_callable__ void sortMinims( T1 pom[] ); template < typename Index > void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY ); #ifdef HAVE_CUDA template < typename Real, typename Device, typename Index > Loading @@ -130,8 +132,11 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, int *BlockIterDevice, int oddEvenBlock); __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks ); TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ); template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks ); /*template < typename Real, typename Device, typename Index > __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );*/ Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +125 −74 Original line number Diff line number Diff line Loading @@ -235,13 +235,6 @@ solve( const MeshPointer& mesh, { // TODO: CUDA code #ifdef HAVE_CUDA Real *dAux; cudaMalloc(&dAux, ( mesh->getDimensions().x() * mesh->getDimensions().y() ) * sizeof( Real ) ); const int cudaBlockSize( 16 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); Loading @@ -250,18 +243,30 @@ solve( const MeshPointer& mesh, tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; //aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 ); //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) ); TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter; BlockIter.setSize( numBlocksX * numBlocksY ); BlockIter.setValue( 0 ); /*int* BlockIter = (int*)malloc( ( numBlocksX * numBlocksY ) * sizeof( int ) ); for( int i = 0; i < numBlocksX*numBlocksY +1; i++) BlockIter[i] = 1;*/ int *BlockIterDevice; int BlockIterD = 1; TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; BlockIterDevice.setSize( numBlocksX * numBlocksY ); BlockIterDevice.setValue( 1 ); /*int *BlockIterDevice; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) ); cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/ int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0); int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) ); int oddEvenBlock = 0; TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; dBlock.setSize( nBlocks ); dBlock.setValue( 0 ); /*int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ while( BlockIterD ) { /*for( int i = 0; i < numBlocksX * numBlocksY; i++ ) Loading @@ -270,89 +275,132 @@ solve( const MeshPointer& mesh, CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice, oddEvenBlock ); BlockIterDevice ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice, oddEvenBlock ); BlockIter = BlockIterDevice; //cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost); GetNeighbours( BlockIter, numBlocksX, numBlocksY ); BlockIterDevice = BlockIter; //cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; CudaParallelReduc<<< nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } //aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 ); cudaFree( dAux ); cudaFree( BlockIterDevice ); /*cudaFree( BlockIterDevice ); cudaFree( dBlock ); delete BlockIter;*/ cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; //aux = *auxPtr; //interfaceMap = *interfaceMapPtr; aux = *auxPtr; interfaceMap = *interfaceMapPtr; #endif } iteration++; } aux.save("aux-final.tnl"); } #ifdef HAVE_CUDA /*template < typename Real, typename Device, typename Index > __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a ) template < typename Index > void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY ) { TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom; BlockIterPom.setSize( numBlockX * numBlockY ); BlockIterPom.setValue( 0 ); /*int* BlockIterPom; BlockIterPom = new int[numBlockX * numBlockY];*/ /*for(int i = 0; i < numBlockX * numBlockY; i++) BlockIterPom[ i ] = 0;*/ for(int i = 0; i < numBlockX * numBlockY; i++) { int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 1 ) if( BlockIter[ i ] ) { dAux[ j*mesh.getDimensions().x() + i ] = aux[ j*mesh.getDimensions().x() + i ]; // i = k*numBlockY + m; int m=0, k=0; m = i%numBlockY; k = i/numBlockY; if( k > 0 && numBlockY > 1 ) BlockIterPom[i - numBlockX] = 1; if( k < numBlockY-1 && numBlockY > 1 ) BlockIterPom[i + numBlockX] = 1; if( m >= 0 && m < numBlockX - 1 && numBlockX > 1 ) BlockIterPom[ i+1 ] = 1; if( m <= numBlockX -1 && m > 0 && numBlockX > 1 ) BlockIterPom[ i-1 ] = 1; } } for(int i = 0; i < numBlockX * numBlockY; i++ ){ /// if( !BlockIter[ i ] ) BlockIter[ i ] = BlockIterPom[ i ]; /// else /// BlockIter[ i ] = 0; } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 0 ) /*for( int i = numBlockX-1; i > -1; i-- ) { aux[ j*mesh.getDimensions().x() + i ] = dAux[ j*mesh.getDimensions().x() + i ]; for( int j = 0; j< numBlockY; j++ ) std::cout << BlockIter[ i*numBlockY + j ]; std::cout << std::endl; } std::cout << std::endl;*/ //delete[] BlockIterPom; } }*/ __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks ) #ifdef HAVE_CUDA template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks ) { int i = threadIdx.x; int blId = blockIdx.x; /*if ( i == 0 && blId == 0 ){ printf( "nBlocks = %d \n", nBlocks ); for( int j = nBlocks-1; j > -1 ; j--){ printf( "cislo = %d \n", BlockIterDevice[ j ] ); } }*/ __shared__ volatile int sArray[ 512 ]; sArray[ i ] = false; if(blId * 1024 + i < nBlocks ) sArray[ i ] = BlockIterDevice[ blId * 1024 + i ]; sArray[ i ] = 0; if( blId * 512 + i < nBlocks ) sArray[ i ] = BlockIterDevice[ blId * 512 + i ]; __syncthreads(); if (blockDim.x * blockDim.y == 1024) { if (blockDim.x == 1024) { if (i < 512) sArray[ i ] += sArray[ i ]; sArray[ i ] += sArray[ i + 512 ]; } __syncthreads(); if (blockDim.x * blockDim.y >= 512) { if (blockDim.x >= 512) { if (i < 256) { sArray[ i ] += sArray[ i ]; sArray[ i ] += sArray[ i + 256 ]; } } if (blockDim.x * blockDim.y >= 256) { __syncthreads(); if (blockDim.x >= 256) { if (i < 128) { sArray[ i ] += sArray[ i + 128 ]; } } __syncthreads(); if (blockDim.x * blockDim.y >= 128) { if (blockDim.x >= 128) { if (i < 64) { sArray[ i ] += sArray[ i + 64 ]; } Loading @@ -360,12 +408,12 @@ __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlock __syncthreads(); if (i < 32 ) { if( blockDim.x * blockDim.y >= 64 ) sArray[ i ] += sArray[ i + 32 ]; if( blockDim.x * blockDim.y >= 32 ) sArray[ i ] += sArray[ i + 16 ]; if( blockDim.x * blockDim.y >= 16 ) sArray[ i ] += sArray[ i + 8 ]; if( blockDim.x * blockDim.y >= 8 ) sArray[ i ] += sArray[ i + 4 ]; if( blockDim.x * blockDim.y >= 4 ) sArray[ i ] += sArray[ i + 2 ]; if( blockDim.x * blockDim.y >= 2 ) sArray[ i ] += sArray[ i + 1 ]; if( blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ]; if( blockDim.x >= 32 ) sArray[ i ] += sArray[ i + 16 ]; if( blockDim.x >= 16 ) sArray[ i ] += sArray[ i + 8 ]; if( blockDim.x >= 8 ) sArray[ i ] += sArray[ i + 4 ]; if( blockDim.x >= 4 ) sArray[ i ] += sArray[ i + 2 ]; if( blockDim.x >= 2 ) sArray[ i ] += sArray[ i + 1 ]; } if( i == 0 ) Loading @@ -378,14 +426,15 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, int *BlockIterDevice, int oddEvenBlock ) TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; int blIdx = blockIdx.x; int blIdy = blockIdx.y; int i = thri + blockDim.x*blIdx; int j = blockDim.y*blIdy + thrj; int currentIndex = thrj * blockDim.x + thri; if( BlockIterDevice[ blIdy * gridDim.x + blIdx] ) { //__shared__ volatile bool changed[ blockDim.x*blockDim.y ]; __shared__ volatile bool changed[16*16]; changed[ currentIndex ] = false; Loading Loading @@ -424,12 +473,12 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*blockDim.y+1; BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; } __syncthreads(); if( (blIdy%2 + blIdx) % 2 == oddEvenBlock ) { if(thri == 0 && thrj == 0 ) BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; if( thri == 0 ) { Loading Loading @@ -528,14 +577,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; } if( changed[ 0 ] && thri == 0 && thrj == 0 ) if( changed[ 0 ] && thri == 0 && thrj == 0 ){ BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1; } __syncthreads(); } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) ) aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; } /*if( thri == 0 && thrj == 0 ) printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/ } #endif src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +2 −2 Original line number Diff line number Diff line Loading @@ -272,8 +272,8 @@ solve( const MeshPointer& mesh, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice ); CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); Loading Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +7 −2 Original line number Diff line number Diff line Loading @@ -113,6 +113,8 @@ T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double template < typename T1 > __cuda_callable__ void sortMinims( T1 pom[] ); template < typename Index > void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY ); #ifdef HAVE_CUDA template < typename Real, typename Device, typename Index > Loading @@ -130,8 +132,11 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, int *BlockIterDevice, int oddEvenBlock); __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks ); TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ); template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks ); /*template < typename Real, typename Device, typename Index > __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );*/ Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +125 −74 Original line number Diff line number Diff line Loading @@ -235,13 +235,6 @@ solve( const MeshPointer& mesh, { // TODO: CUDA code #ifdef HAVE_CUDA Real *dAux; cudaMalloc(&dAux, ( mesh->getDimensions().x() * mesh->getDimensions().y() ) * sizeof( Real ) ); const int cudaBlockSize( 16 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); Loading @@ -250,18 +243,30 @@ solve( const MeshPointer& mesh, tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; //aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 ); //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) ); TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter; BlockIter.setSize( numBlocksX * numBlocksY ); BlockIter.setValue( 0 ); /*int* BlockIter = (int*)malloc( ( numBlocksX * numBlocksY ) * sizeof( int ) ); for( int i = 0; i < numBlocksX*numBlocksY +1; i++) BlockIter[i] = 1;*/ int *BlockIterDevice; int BlockIterD = 1; TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; BlockIterDevice.setSize( numBlocksX * numBlocksY ); BlockIterDevice.setValue( 1 ); /*int *BlockIterDevice; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) ); cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/ int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0); int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) ); int oddEvenBlock = 0; TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; dBlock.setSize( nBlocks ); dBlock.setValue( 0 ); /*int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ while( BlockIterD ) { /*for( int i = 0; i < numBlocksX * numBlocksY; i++ ) Loading @@ -270,89 +275,132 @@ solve( const MeshPointer& mesh, CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice, oddEvenBlock ); BlockIterDevice ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice, oddEvenBlock ); BlockIter = BlockIterDevice; //cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost); GetNeighbours( BlockIter, numBlocksX, numBlocksY ); BlockIterDevice = BlockIter; //cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; CudaParallelReduc<<< nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } //aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 ); cudaFree( dAux ); cudaFree( BlockIterDevice ); /*cudaFree( BlockIterDevice ); cudaFree( dBlock ); delete BlockIter;*/ cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; //aux = *auxPtr; //interfaceMap = *interfaceMapPtr; aux = *auxPtr; interfaceMap = *interfaceMapPtr; #endif } iteration++; } aux.save("aux-final.tnl"); } #ifdef HAVE_CUDA /*template < typename Real, typename Device, typename Index > __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a ) template < typename Index > void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY ) { TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom; BlockIterPom.setSize( numBlockX * numBlockY ); BlockIterPom.setValue( 0 ); /*int* BlockIterPom; BlockIterPom = new int[numBlockX * numBlockY];*/ /*for(int i = 0; i < numBlockX * numBlockY; i++) BlockIterPom[ i ] = 0;*/ for(int i = 0; i < numBlockX * numBlockY; i++) { int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 1 ) if( BlockIter[ i ] ) { dAux[ j*mesh.getDimensions().x() + i ] = aux[ j*mesh.getDimensions().x() + i ]; // i = k*numBlockY + m; int m=0, k=0; m = i%numBlockY; k = i/numBlockY; if( k > 0 && numBlockY > 1 ) BlockIterPom[i - numBlockX] = 1; if( k < numBlockY-1 && numBlockY > 1 ) BlockIterPom[i + numBlockX] = 1; if( m >= 0 && m < numBlockX - 1 && numBlockX > 1 ) BlockIterPom[ i+1 ] = 1; if( m <= numBlockX -1 && m > 0 && numBlockX > 1 ) BlockIterPom[ i-1 ] = 1; } } for(int i = 0; i < numBlockX * numBlockY; i++ ){ /// if( !BlockIter[ i ] ) BlockIter[ i ] = BlockIterPom[ i ]; /// else /// BlockIter[ i ] = 0; } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 0 ) /*for( int i = numBlockX-1; i > -1; i-- ) { aux[ j*mesh.getDimensions().x() + i ] = dAux[ j*mesh.getDimensions().x() + i ]; for( int j = 0; j< numBlockY; j++ ) std::cout << BlockIter[ i*numBlockY + j ]; std::cout << std::endl; } std::cout << std::endl;*/ //delete[] BlockIterPom; } }*/ __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks ) #ifdef HAVE_CUDA template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks ) { int i = threadIdx.x; int blId = blockIdx.x; /*if ( i == 0 && blId == 0 ){ printf( "nBlocks = %d \n", nBlocks ); for( int j = nBlocks-1; j > -1 ; j--){ printf( "cislo = %d \n", BlockIterDevice[ j ] ); } }*/ __shared__ volatile int sArray[ 512 ]; sArray[ i ] = false; if(blId * 1024 + i < nBlocks ) sArray[ i ] = BlockIterDevice[ blId * 1024 + i ]; sArray[ i ] = 0; if( blId * 512 + i < nBlocks ) sArray[ i ] = BlockIterDevice[ blId * 512 + i ]; __syncthreads(); if (blockDim.x * blockDim.y == 1024) { if (blockDim.x == 1024) { if (i < 512) sArray[ i ] += sArray[ i ]; sArray[ i ] += sArray[ i + 512 ]; } __syncthreads(); if (blockDim.x * blockDim.y >= 512) { if (blockDim.x >= 512) { if (i < 256) { sArray[ i ] += sArray[ i ]; sArray[ i ] += sArray[ i + 256 ]; } } if (blockDim.x * blockDim.y >= 256) { __syncthreads(); if (blockDim.x >= 256) { if (i < 128) { sArray[ i ] += sArray[ i + 128 ]; } } __syncthreads(); if (blockDim.x * blockDim.y >= 128) { if (blockDim.x >= 128) { if (i < 64) { sArray[ i ] += sArray[ i + 64 ]; } Loading @@ -360,12 +408,12 @@ __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlock __syncthreads(); if (i < 32 ) { if( blockDim.x * blockDim.y >= 64 ) sArray[ i ] += sArray[ i + 32 ]; if( blockDim.x * blockDim.y >= 32 ) sArray[ i ] += sArray[ i + 16 ]; if( blockDim.x * blockDim.y >= 16 ) sArray[ i ] += sArray[ i + 8 ]; if( blockDim.x * blockDim.y >= 8 ) sArray[ i ] += sArray[ i + 4 ]; if( blockDim.x * blockDim.y >= 4 ) sArray[ i ] += sArray[ i + 2 ]; if( blockDim.x * blockDim.y >= 2 ) sArray[ i ] += sArray[ i + 1 ]; if( blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ]; if( blockDim.x >= 32 ) sArray[ i ] += sArray[ i + 16 ]; if( blockDim.x >= 16 ) sArray[ i ] += sArray[ i + 8 ]; if( blockDim.x >= 8 ) sArray[ i ] += sArray[ i + 4 ]; if( blockDim.x >= 4 ) sArray[ i ] += sArray[ i + 2 ]; if( blockDim.x >= 2 ) sArray[ i ] += sArray[ i + 1 ]; } if( i == 0 ) Loading @@ -378,14 +426,15 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, int *BlockIterDevice, int oddEvenBlock ) TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; int blIdx = blockIdx.x; int blIdy = blockIdx.y; int i = thri + blockDim.x*blIdx; int j = blockDim.y*blIdy + thrj; int currentIndex = thrj * blockDim.x + thri; if( BlockIterDevice[ blIdy * gridDim.x + blIdx] ) { //__shared__ volatile bool changed[ blockDim.x*blockDim.y ]; __shared__ volatile bool changed[16*16]; changed[ currentIndex ] = false; Loading Loading @@ -424,12 +473,12 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*blockDim.y+1; BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; } __syncthreads(); if( (blIdy%2 + blIdx) % 2 == oddEvenBlock ) { if(thri == 0 && thrj == 0 ) BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; if( thri == 0 ) { Loading Loading @@ -528,14 +577,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; } if( changed[ 0 ] && thri == 0 && thrj == 0 ) if( changed[ 0 ] && thri == 0 && thrj == 0 ){ BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1; } __syncthreads(); } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) ) aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; } /*if( thri == 0 && thrj == 0 ) printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/ } #endif
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +2 −2 Original line number Diff line number Diff line Loading @@ -272,8 +272,8 @@ solve( const MeshPointer& mesh, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice ); CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); Loading