Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +12 −5 Original line number Diff line number Diff line Loading @@ -61,6 +61,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > typedef Index IndexType; typedef Functions::MeshFunction< MeshType > MeshFunctionType; typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType; typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >; using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >; Loading @@ -76,6 +77,11 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > __cuda_callable__ bool updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy, const Real velocity = 1.0 ); void updateBlocks( InterfaceMapType interfaceMap, MeshFunctionType aux, ArrayContainer BlockIterHost, int numThreadsPerBlock ); void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY ); }; template< typename Real, Loading Loading @@ -132,14 +138,15 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ); TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 ); template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks ); /*template < typename Real, typename Device, typename Index > __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );*/ template < typename Index > __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, Loading @@ -155,7 +162,7 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, int *BlockIterDevice ); TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ); #endif #include "tnlDirectEikonalMethodsBase_impl.h" src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +193 −0 Original line number Diff line number Diff line Loading @@ -89,6 +89,199 @@ initInterface( const MeshFunctionPointer& _input, } } template< typename Real, typename Device, typename Index > void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: updateBlocks( InterfaceMapType interfaceMap, MeshFunctionType aux, ArrayContainer BlockIterHost, int numThreadsPerBlock ) { for( int i = 0; i < BlockIterHost.getSize(); i++ ) { if( BlockIterHost[ i ] ) { MeshType mesh = interfaceMap.template getMesh< Devices::Host >(); int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); int numOfBlockx = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0); int numOfBlocky = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0); int xkolik = numThreadsPerBlock + 1; int ykolik = numThreadsPerBlock + 1; int blIdx = i%numOfBlockx; int blIdy = i/numOfBlocky; if( numOfBlockx - 1 == blIdx ) xkolik = dimX - (blIdx)*numThreadsPerBlock+1; if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*numThreadsPerBlock+1; /*bool changed[numThreadsPerBlock*numThreadsPerBlock]; changed[ 0 ] = 1;*/ Real hx = mesh.getSpaceSteps().x(); Real hy = mesh.getSpaceSteps().y(); Real changed1[ 16*16 ]; /*Real changed2[ 16*16 ]; Real changed3[ 16*16 ]; Real changed4[ 16*16 ];*/ Real sArray[18][18]; for( int thri = 0; thri < numThreadsPerBlock + 2; thri++ ) for( int thrj = 0; thrj < numThreadsPerBlock + 2; thrj++ ) sArray[thrj][thri] = std::numeric_limits< Real >::max(); BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0; for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ ) { if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik ) sArray[thrj+1][xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ]; else sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max(); if( blIdx != 0 && thrj+1 < ykolik ) sArray[thrj+1][0] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ]; else sArray[thrj+1][0] = std::numeric_limits< Real >::max(); if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik ) sArray[ykolik][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ]; else sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max(); if( blIdy != 0 && thrj+1 < xkolik ) sArray[0][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ]; else sArray[0][thrj+1] = std::numeric_limits< Real >::max(); } for( int k = 0; k < numThreadsPerBlock; k++ ) for( int l = 0; l < numThreadsPerBlock; l++ ) sArray[k+1][l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]; for( int k = 0; k < numThreadsPerBlock; k++ ) for( int l = 0; l < numThreadsPerBlock; l++ ){ changed1[ k*numThreadsPerBlock + l ] = 0; /*changed2[ k*numThreadsPerBlock + l ] = 0; changed3[ k*numThreadsPerBlock + l ] = 0; changed4[ k*numThreadsPerBlock + l ] = 0;*/ if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) { if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) { changed1[ k*numThreadsPerBlock + l ] = this->updateCell( sArray, l+1, k+1, hx,hy); } } } for( int k = numThreadsPerBlock-1; k > -1; k-- ) for( int l = 0; l < numThreadsPerBlock; l++ ) { if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) { if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) { /*changed2[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy); } } } for( int k = 0; k < numThreadsPerBlock; k++ ) for( int l = numThreadsPerBlock-1; l >-1; l-- ) { if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) { if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) { /*changed3[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy); } } } for( int k = numThreadsPerBlock-1; k > -1; k-- ) for( int l = numThreadsPerBlock-1; l >-1; l-- ) { if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) { if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) { /*changed4[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy); } } } for( int k = numThreadsPerBlock-1; k > -1; k-- ) for( int l = numThreadsPerBlock-1; l >-1; l-- ){ changed1[ 0 ] = changed1[ 0 ] || changed1[ k*numThreadsPerBlock + l ]; /*changed2[ 0 ] = changed2[ 0 ] || changed2[ k*numThreadsPerBlock + l ]; changed3[ 0 ] = changed3[ 0 ] || changed3[ k*numThreadsPerBlock + l ]; changed4[ 0 ] = changed4[ 0 ] || changed4[ k*numThreadsPerBlock + l ];*/ } if( changed1[ 0 ] /*|| changed2[ 0 ] ||changed3[ 0 ] ||changed4[ 0 ]*/ ) BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1; for( int k = 0; k < numThreadsPerBlock; k++ ){ for( int l = 0; l < numThreadsPerBlock; l++ ) { if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY && (!interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]) ) aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray[ k + 1 ][ l + 1 ]; //std::cout<< sArray[k+1][l+1]; } //std::cout<<std::endl; } } } } template< typename Real, typename Device, typename Index > void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY ) { int* BlockIterPom; BlockIterPom = new int [numBlockX * numBlockY]; for(int i = 0; i < numBlockX * numBlockY; i++) { BlockIterPom[ i ] = 0; if( BlockIterHost[ i ] ) { // i = k*numBlockY + m; int m=0, k=0; m = i%numBlockX; k = i/numBlockX; if( k > 0 ) BlockIterPom[i - numBlockX] = 1; if( k < numBlockY - 1 ) BlockIterPom[i + numBlockX] = 1; if( m < numBlockX - 1 ) BlockIterPom[ i+1 ] = 1; if( m > 0 ) BlockIterPom[ i-1 ] = 1; } } for(int i = 0; i < numBlockX * numBlockY; i++ ) //if( !BlockIter[ i ] ) BlockIterHost[ i ] = BlockIterPom[ i ]; /*else BlockIter[ i ] = 0;*/ /*for( int i = numBlockX-1; i > -1; i-- ) { for( int j = 0; j< numBlockY; j++ ) std::cout << BlockIterHost[ i*numBlockY + j ]; std::cout << std::endl; } std::cout << std::endl;*/ delete[] BlockIterPom; } template< typename Real, typename Device, typename Index > Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h +2 −1 Original line number Diff line number Diff line Loading @@ -89,6 +89,7 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy > using typename BaseType::MeshFunctionType; using typename BaseType::InterfaceMapPointer; using typename BaseType::MeshFunctionPointer; using typename BaseType::ArrayContainer; FastSweepingMethod(); Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +398 −473 File changed.Preview size limit exceeded, changes collapsed. Show changes src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +24 −9 Original line number Diff line number Diff line Loading @@ -258,13 +258,21 @@ solve( const MeshPointer& mesh, tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; int *BlockIterDevice; int BlockIterD = 1; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) ); TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ ); BlockIterDevice.setValue( 1 ); /*int *BlockIterDevice; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/ int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0); int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) ); TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; dBlock.setSize( nBlocks ); dBlock.setValue( 0 ); /*int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ while( BlockIterD ) { Loading @@ -272,17 +280,24 @@ solve( const MeshPointer& mesh, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice ); //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } cudaFree( BlockIterDevice ); cudaFree( dBlock ); //cudaFree( BlockIterDevice ); //cudaFree( dBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; Loading @@ -302,7 +317,7 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, int *BlockIterDevice ) TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z; Loading Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +12 −5 Original line number Diff line number Diff line Loading @@ -61,6 +61,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > typedef Index IndexType; typedef Functions::MeshFunction< MeshType > MeshFunctionType; typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType; typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >; using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >; Loading @@ -76,6 +77,11 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > __cuda_callable__ bool updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy, const Real velocity = 1.0 ); void updateBlocks( InterfaceMapType interfaceMap, MeshFunctionType aux, ArrayContainer BlockIterHost, int numThreadsPerBlock ); void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY ); }; template< typename Real, Loading Loading @@ -132,14 +138,15 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ); TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 ); template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks ); /*template < typename Real, typename Device, typename Index > __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );*/ template < typename Index > __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, Loading @@ -155,7 +162,7 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, int *BlockIterDevice ); TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ); #endif #include "tnlDirectEikonalMethodsBase_impl.h"
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +193 −0 Original line number Diff line number Diff line Loading @@ -89,6 +89,199 @@ initInterface( const MeshFunctionPointer& _input, } } template< typename Real, typename Device, typename Index > void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: updateBlocks( InterfaceMapType interfaceMap, MeshFunctionType aux, ArrayContainer BlockIterHost, int numThreadsPerBlock ) { for( int i = 0; i < BlockIterHost.getSize(); i++ ) { if( BlockIterHost[ i ] ) { MeshType mesh = interfaceMap.template getMesh< Devices::Host >(); int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); int numOfBlockx = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0); int numOfBlocky = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0); int xkolik = numThreadsPerBlock + 1; int ykolik = numThreadsPerBlock + 1; int blIdx = i%numOfBlockx; int blIdy = i/numOfBlocky; if( numOfBlockx - 1 == blIdx ) xkolik = dimX - (blIdx)*numThreadsPerBlock+1; if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*numThreadsPerBlock+1; /*bool changed[numThreadsPerBlock*numThreadsPerBlock]; changed[ 0 ] = 1;*/ Real hx = mesh.getSpaceSteps().x(); Real hy = mesh.getSpaceSteps().y(); Real changed1[ 16*16 ]; /*Real changed2[ 16*16 ]; Real changed3[ 16*16 ]; Real changed4[ 16*16 ];*/ Real sArray[18][18]; for( int thri = 0; thri < numThreadsPerBlock + 2; thri++ ) for( int thrj = 0; thrj < numThreadsPerBlock + 2; thrj++ ) sArray[thrj][thri] = std::numeric_limits< Real >::max(); BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0; for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ ) { if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik ) sArray[thrj+1][xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ]; else sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max(); if( blIdx != 0 && thrj+1 < ykolik ) sArray[thrj+1][0] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ]; else sArray[thrj+1][0] = std::numeric_limits< Real >::max(); if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik ) sArray[ykolik][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ]; else sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max(); if( blIdy != 0 && thrj+1 < xkolik ) sArray[0][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ]; else sArray[0][thrj+1] = std::numeric_limits< Real >::max(); } for( int k = 0; k < numThreadsPerBlock; k++ ) for( int l = 0; l < numThreadsPerBlock; l++ ) sArray[k+1][l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]; for( int k = 0; k < numThreadsPerBlock; k++ ) for( int l = 0; l < numThreadsPerBlock; l++ ){ changed1[ k*numThreadsPerBlock + l ] = 0; /*changed2[ k*numThreadsPerBlock + l ] = 0; changed3[ k*numThreadsPerBlock + l ] = 0; changed4[ k*numThreadsPerBlock + l ] = 0;*/ if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) { if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) { changed1[ k*numThreadsPerBlock + l ] = this->updateCell( sArray, l+1, k+1, hx,hy); } } } for( int k = numThreadsPerBlock-1; k > -1; k-- ) for( int l = 0; l < numThreadsPerBlock; l++ ) { if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) { if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) { /*changed2[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy); } } } for( int k = 0; k < numThreadsPerBlock; k++ ) for( int l = numThreadsPerBlock-1; l >-1; l-- ) { if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) { if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) { /*changed3[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy); } } } for( int k = numThreadsPerBlock-1; k > -1; k-- ) for( int l = numThreadsPerBlock-1; l >-1; l-- ) { if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) { if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) { /*changed4[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy); } } } for( int k = numThreadsPerBlock-1; k > -1; k-- ) for( int l = numThreadsPerBlock-1; l >-1; l-- ){ changed1[ 0 ] = changed1[ 0 ] || changed1[ k*numThreadsPerBlock + l ]; /*changed2[ 0 ] = changed2[ 0 ] || changed2[ k*numThreadsPerBlock + l ]; changed3[ 0 ] = changed3[ 0 ] || changed3[ k*numThreadsPerBlock + l ]; changed4[ 0 ] = changed4[ 0 ] || changed4[ k*numThreadsPerBlock + l ];*/ } if( changed1[ 0 ] /*|| changed2[ 0 ] ||changed3[ 0 ] ||changed4[ 0 ]*/ ) BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1; for( int k = 0; k < numThreadsPerBlock; k++ ){ for( int l = 0; l < numThreadsPerBlock; l++ ) { if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY && (!interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]) ) aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray[ k + 1 ][ l + 1 ]; //std::cout<< sArray[k+1][l+1]; } //std::cout<<std::endl; } } } } template< typename Real, typename Device, typename Index > void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY ) { int* BlockIterPom; BlockIterPom = new int [numBlockX * numBlockY]; for(int i = 0; i < numBlockX * numBlockY; i++) { BlockIterPom[ i ] = 0; if( BlockIterHost[ i ] ) { // i = k*numBlockY + m; int m=0, k=0; m = i%numBlockX; k = i/numBlockX; if( k > 0 ) BlockIterPom[i - numBlockX] = 1; if( k < numBlockY - 1 ) BlockIterPom[i + numBlockX] = 1; if( m < numBlockX - 1 ) BlockIterPom[ i+1 ] = 1; if( m > 0 ) BlockIterPom[ i-1 ] = 1; } } for(int i = 0; i < numBlockX * numBlockY; i++ ) //if( !BlockIter[ i ] ) BlockIterHost[ i ] = BlockIterPom[ i ]; /*else BlockIter[ i ] = 0;*/ /*for( int i = numBlockX-1; i > -1; i-- ) { for( int j = 0; j< numBlockY; j++ ) std::cout << BlockIterHost[ i*numBlockY + j ]; std::cout << std::endl; } std::cout << std::endl;*/ delete[] BlockIterPom; } template< typename Real, typename Device, typename Index > Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h +2 −1 Original line number Diff line number Diff line Loading @@ -89,6 +89,7 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy > using typename BaseType::MeshFunctionType; using typename BaseType::InterfaceMapPointer; using typename BaseType::MeshFunctionPointer; using typename BaseType::ArrayContainer; FastSweepingMethod(); Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +398 −473 File changed.Preview size limit exceeded, changes collapsed. Show changes
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +24 −9 Original line number Diff line number Diff line Loading @@ -258,13 +258,21 @@ solve( const MeshPointer& mesh, tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; int *BlockIterDevice; int BlockIterD = 1; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) ); TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ ); BlockIterDevice.setValue( 1 ); /*int *BlockIterDevice; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/ int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0); int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) ); TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; dBlock.setSize( nBlocks ); dBlock.setValue( 0 ); /*int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ while( BlockIterD ) { Loading @@ -272,17 +280,24 @@ solve( const MeshPointer& mesh, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice ); //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } cudaFree( BlockIterDevice ); cudaFree( dBlock ); //cudaFree( BlockIterDevice ); //cudaFree( dBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; Loading @@ -302,7 +317,7 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, int *BlockIterDevice ) TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z; Loading