diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
index 50ea7bde8ee87b24317363feb78e30d45128fa3b..c470a77efdbbb7fe8fc91b584ae8af2440a2eb59 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
@@ -353,8 +353,8 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
 
 template < typename Index >
-__global__ void GetNeighbours( const TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
-      TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY )
+__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
+      TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY )
 {
   int i = blockIdx.x * 1024 + threadIdx.x;
 
@@ -389,7 +389,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-      TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
+      TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
      const Containers::StaticVector< 2, Index > vecLowerOverlaps, const Containers::StaticVector< 2, Index > vecUpperOverlaps,
      int oddEvenBlock )
 {
@@ -598,7 +598,7 @@ tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 updateBlocks( InterfaceMapType interfaceMap,
       MeshFunctionType aux,
       MeshFunctionType helpFunc,
-      ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
+      ArrayContainerView BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
 {
 #pragma omp parallel for schedule( dynamic )
   for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
@@ -769,7 +769,7 @@ template< typename Real,
           typename Index >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )
+getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY )
 {
   int* BlockIterPom;
   BlockIterPom = new int [numBlockX * numBlockY];
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
index 5b2a4b685b75a4bb616631b139bca7f1215b4c0b..32548abcfe66affd71a79d3ed5f2f21d67df644e 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
@@ -480,8 +480,8 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
 
 template < typename Index >
-__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,
+__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+      TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
      int numBlockX, int numBlockY, int numBlockZ )
 {
   int i = blockIdx.x * 1024 + threadIdx.x;
 
@@ -520,7 +520,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
       Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+      TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
      Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps )
 {
   int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
@@ -1056,7 +1056,7 @@ template< typename Real,
           typename Index >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
+getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
 {
   int* BlockIterPom;
   BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ];
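The recurring change in both implementation files above is that the __global__ kernels now take TNL::Containers::ArrayView by value instead of TNL::Containers::Array. A minimal sketch of that pattern follows; the toy kernel, its names and the include paths are illustrative assumptions, not part of the patch.

// Sketch only: an ArrayView is a non-owning handle (pointer + size), so passing it
// by value to a kernel copies just the handle, not the data, and avoids invoking
// Array's copying/ownership machinery inside device code.
#include <TNL/Containers/Array.h>
#include <TNL/Containers/ArrayView.h>
#include <TNL/Devices/Cuda.h>

using TNL::Containers::Array;
using TNL::Containers::ArrayView;
using TNL::Devices::Cuda;

template< typename Index >
__global__ void markAll( ArrayView< int, Cuda, Index > flags )   // by value, cheap to copy
{
   const Index i = blockIdx.x * blockDim.x + threadIdx.x;
   if( i < flags.getSize() )
      flags[ i ] = 1;                                             // writes into the original array's memory
}

// Host side: the owning Array stays in host scope, only its view is shipped to the kernel.
void example()
{
   Array< int, Cuda, int > blockFlags( 256 );
   blockFlags.setValue( 0 );
   markAll<<< 1, 256 >>>( blockFlags.getView() );
   cudaDeviceSynchronize();
}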
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index e0ece04bf01d4a6e3356f716aa13b471f9933f73..7cba99f6586b84e31fbe1a49da4223c9e621dd5b 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -62,6 +62,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
       typedef Functions::MeshFunction< MeshType > MeshFunctionType;
       typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
       typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+      using ArrayContainerView = typename ArrayContainer::ViewType;
       typedef Containers::StaticVector< 2, Index > StaticVector;
 
       using MeshPointer = Pointers::SharedPointer< MeshType >;
@@ -87,15 +88,18 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
             const RealType velocity = 1.0 );
 
       // FOR OPENMP WILL BE REMOVED
-      void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY );
+      void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY );
 
       template< int sizeSArray >
-      void updateBlocks( const InterfaceMapType& interfaceMap,
-            MeshFunctionType& aux,
-            MeshFunctionType& helpFunc,
-            ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
+      void updateBlocks( InterfaceMapType interfaceMap,
+            MeshFunctionType aux,
+            MeshFunctionType helpFunc,
+            ArrayContainerView BlockIterHost, int numThreadsPerBlock );
+
+
   protected:
 
-      void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY );
+      __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[],
+            const RealType originalValue, const RealType v );
 };
 
 template< typename Real,
@@ -111,6 +115,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
       typedef Functions::MeshFunction< MeshType > MeshFunctionType;
       typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType;
       typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+      using ArrayContainerView = typename ArrayContainer::ViewType;
       typedef Containers::StaticVector< 3, Index > StaticVector;
 
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
       using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
@@ -134,15 +139,15 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
             const RealType velocity = 1.0 );
 
       // OPENMP WILL BE REMOVED
-      void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
+      void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
 
       template< int sizeSArray >
-      void updateBlocks( const InterfaceMapType& interfaceMap,
-            const MeshFunctionType& aux,
+      void updateBlocks( const InterfaceMapType interfaceMap,
+            const MeshFunctionType aux,
             MeshFunctionType& helpFunc,
-            ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
+            ArrayContainer BlockIterHost, int numThreadsPerBlock );
 
-      void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
+   protected:
 
       __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[],
             const RealType originalValue, const RealType v );
@@ -180,17 +185,14 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-      TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
+      TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
      const Containers::StaticVector< 2, Index > vecLowerOverlaps, const Containers::StaticVector< 2, Index > vecUpperOverlaps,
      int oddEvenBlock =0);
 
 template < typename Index >
-__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-      TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks );
+__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
+      TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY );
 
-template < typename Index >
-__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-      TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
 
 // 3D
 
@@ -205,10 +207,11 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
       Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-      TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice );
+      TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+      Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps );
 
 template < typename Index >
-__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
       TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY, int numBlockZ );
 
 #endif
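The GetNeighbours kernels declared at the end of this header implement the block-activation step of the solver: one thread per grid block, and a block is scheduled for the next sweep when one of its face neighbours was updated in the current sweep. The 2D sketch below is a simplified reimplementation of that idea, assuming the same flattened block indexing (i = blockIdx.x * 1024 + threadIdx.x) as the patch; it is not the patch's exact kernel body.

// Sketch of the 2D neighbour-activation logic; the real kernel in the patch follows the same idea.
template< typename Index >
__global__ void getNeighboursSketch( const TNL::Containers::ArrayView< int, TNL::Devices::Cuda, Index > active,
                                     TNL::Containers::ArrayView< int, TNL::Devices::Cuda, Index > activeNext,
                                     int numBlockX, int numBlockY )
{
   const int i = blockIdx.x * 1024 + threadIdx.x;        // one thread per grid block, 1024 threads per CUDA block
   if( i < numBlockX * numBlockY )
   {
      const int bx = i % numBlockX;                      // block coordinates in the block grid
      const int by = i / numBlockX;
      int shouldCalculate = 0;
      // a block calculates in the next sweep if a face neighbour calculated in this one
      if( bx > 0             && active[ i - 1 ] )         shouldCalculate = 1;
      if( bx < numBlockX - 1 && active[ i + 1 ] )         shouldCalculate = 1;
      if( by > 0             && active[ i - numBlockX ] ) shouldCalculate = 1;
      if( by < numBlockY - 1 && active[ i + numBlockX ] ) shouldCalculate = 1;
      activeNext[ i ] = shouldCalculate;
   }
}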
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 31d3f8b324a7fc2a02d94e54b5909190cacd965d..e5638c11dd71d88d72d8d0590c8e51c0df6baaab 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -80,40 +80,9 @@ solve( const MeshPointer& mesh,
   IndexType iteration( 0 );
   InterfaceMapType interfaceMap = *interfaceMapPtr;
   MeshFunctionType aux = *auxPtr;
-  aux.template synchronize< Communicator >();
-
-
-#ifdef HAVE_MPI
-  int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup );
-  //printf( "Hello world from rank: %d ", i );
-  //Communicators::MpiCommunicator::Request r = Communicators::MpiCommunicator::ISend( auxPtr, 0, 0, Communicators::MpiCommunicator::AllGroup );
-  if( i == 1 ) {
-    /*for( int k = 0; k < 16*16; k++ )
-      aux[ k ] = 10;*/
-    printf( "1: mesh x: %d\n", mesh->getDimensions().x() );
-    printf( "1: mesh y: %d\n", mesh->getDimensions().y() );
-    //aux.save("aux_proc1.tnl");
-  }
-  if( i == 0 ) {
-    printf( "0: mesh x: %d\n", mesh->getDimensions().x() );
-    printf( "0: mesh y: %d\n", mesh->getDimensions().y() );
-    //aux.save("aux_proc0.tnl");
-    /*for( int k = 0; k < mesh->getDimensions().x()*mesh->getDimensions().y(); k++ )
-      aux[ k ] = 10;
-    for( int k = 0; k < mesh->getDimensions().x(); k++ ){
-      for( int l = 0; l < mesh->getDimensions().y(); l++ )
-        printf("%f.2\t",aux[ k * 16 + l ] );
-      printf("\n");
-    }*/
-  }
-
-  /*bool a = Communicators::MpiCommunicator::IsInitialized();
-  if( a )
-    printf("Je Init\n");
-  else
-    printf("Neni Init\n");*/
-#endif
+  aux.template synchronize< Communicator >(); // synchronize the initialized overlaps
+  std::cout << "Calculating the values ..." << std::endl;
 
   while( iteration < this->maxIterations )
   {
     // calculatedBefore indicates weather we calculated in the last passage of the while cycle
@@ -290,41 +259,8 @@ solve( const MeshPointer& mesh,
     // Need for calling functions from kernel
     BaseType ptr;
 
-    int BlockIterD = 1;
-    /*auxPtr = helpFunc;
-
-    CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-          interfaceMapPtr.template getData< Device >(),
-          auxPtr.template getData< Device>(),
-          helpFunc.template modifyData< Device>(),
-          BlockIterDevice,
-          oddEvenBlock.getView() );
-    cudaDeviceSynchronize();
-    TNL_CHECK_CUDA_DEVICE;
-    auxPtr = helpFunc;
-
-    oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-
-    CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-          interfaceMapPtr.template getData< Device >(),
-          auxPtr.template getData< Device>(),
-          helpFunc.template modifyData< Device>(),
-          BlockIterDevice,
-          oddEvenBlock.getView() );
-    cudaDeviceSynchronize();
-    TNL_CHECK_CUDA_DEVICE;
-    auxPtr = helpFunc;
-
-    oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-
-    CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) );
-    cudaDeviceSynchronize();
-    TNL_CHECK_CUDA_DEVICE;
-    CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
-    cudaDeviceSynchronize();
-    TNL_CHECK_CUDA_DEVICE;
-
-    BlockIterD = dBlock.getElement( 0 );*/
+    // True if we should calculate again.
+    int calculateCudaBlocksAgain = 1;
 
     // Array that identifies which blocks should be calculated.
     // All blocks should calculate in first passage ( setValue(1) )
@@ -343,16 +279,9 @@ solve( const MeshPointer& mesh,
     MeshFunctionPointer helpFunc( mesh );
     helpFunc.template modifyData() = auxPtr.template getData();
-    //int pocBloku = 0;
-    Devices::Cuda::synchronizeDevice();
-    CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-          interfaceMapPtr.template getData< Device >(),
-          auxPtr.template modifyData< Device>(),
-          helpFunc.template modifyData< Device>(),
-          BlockIterDevice.getView() );
-    cudaDeviceSynchronize();
-    TNL_CHECK_CUDA_DEVICE;
-
+    // number of iterations of the while( calculateCudaBlocksAgain ) cycle
+    int numIter = 0;
+    //int oddEvenBlock = 0;
     while( calculateCudaBlocksAgain )
     {
@@ -390,44 +319,16 @@ solve( const MeshPointer& mesh,
       Devices::Cuda::synchronizeDevice();
       CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
             interfaceMapPtr.template getData< Device >(),
             auxPtr.template getData< Device>(),
             helpFunc.template modifyData< Device>(),
-            blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps );
+            blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps );
       cudaDeviceSynchronize();
       TNL_CHECK_CUDA_DEVICE;
-
-      cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
-
-
-      aux = *auxPtr;
-      interfaceMap = *interfaceMapPtr;
-#endif
-  }
-
 
/**----------------------MPI-TO-DO---------------------------------------------**/
 
-#ifdef HAVE_MPI
-  //int i = MPI::GetRank( MPI::AllGroup );
-  //TNL::Meshes::DistributedMeshes::DistributedMesh< MeshType > Mesh;
-  int neighCount = 0; // should this thread calculate again?
-  int calculpom[4] = {0,0,0,0};
-
-  if( i == 0 ){
-    BlockIterPom1 = BlockIterDevice;
-    for( int i =0; i< numBlocksX; i++ ){
-      for( int j = 0; j < numBlocksY; j++ )
-      {
-        std::cout << BlockIterPom1[j*numBlocksX + i];
-      }
-      std::cout << std::endl;
-    }
-    std::cout << std::endl;
-  }
-#endif
+
+      // Switching helpFunc and auxPtr.
+      auxPtr.swap( helpFunc );
 
       // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now.
       Devices::Cuda::synchronizeDevice();
-      GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator, blockCalculationIndicatorHelp, numBlocksX, numBlocksY );
+      GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY );
       cudaDeviceSynchronize();
       TNL_CHECK_CUDA_DEVICE;
       blockCalculationIndicator = blockCalculationIndicatorHelp;
@@ -445,46 +346,24 @@ solve( const MeshPointer& mesh,
/**-----------------------------------------------------------------------------------------------------------*/
       numIter ++;
     }
-    if( numIter%2 == 1 ){
-      auxPtr = helpFunc;
-    }
-    /*cudaFree( BlockIterDevice );
-    cudaFree( dBlock );
-    delete BlockIter;*/
-
-    if( neigh[1] != -1 )
-    {
-      req[neighCount] = MPI::ISend( &calculated, 1, neigh[1], 0, MPI::AllGroup );
-      neighCount++;
-
-
-      req[neighCount] = MPI::IRecv( &calculpom[1], 1, neigh[1], 0, MPI::AllGroup );
-      neighCount++;
-    }
-
-    if( neigh[2] != -1 )
-    {
-      req[neighCount] = MPI::ISend( &calculated, 1, neigh[2], 0, MPI::AllGroup );
-      neighCount++;
-
-      req[neighCount] = MPI::IRecv( &calculpom[2], 1, neigh[2], 0, MPI::AllGroup );
-      neighCount++;
-    }
-
-    if( neigh[5] != -1 )
+    if( numIter%2 == 1 ) // Need to check parity for MPI overlaps to synchronize ( otherwise it doesn't work )
     {
-      req[neighCount] = MPI::ISend( &calculated, 1, neigh[5], 0, MPI::AllGroup );
-      neighCount++;
-
-      req[neighCount] = MPI::IRecv( &calculpom[3], 1, neigh[5], 0, MPI::AllGroup );
-      neighCount++;
+      helpFunc.swap( auxPtr );
+      Devices::Cuda::synchronizeDevice();
+      cudaDeviceSynchronize();
+      TNL_CHECK_CUDA_DEVICE;
     }
-
-    MPI::WaitAll(req,neighCount);
-#if ForDebug
-    printf( "%d: Sending Calculated = %d.\n", i, calculated );
-#endif
-    MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR, MPI::AllGroup );
+    aux = *auxPtr;
+    interfaceMap = *interfaceMapPtr;
+#endif
+  }
+
+
/**----------------------MPI-TO-DO---------------------------------------------**/
+#ifdef HAVE_MPI
+  if( CommunicatorType::isDistributed() ){
+    getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh );
+    aux.template synchronize< Communicator >();
   }
 #endif
@@ -518,9 +397,16 @@ setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
 #endif
 }
 
-template < typename Index >
-__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-      TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY )
+
+
+
+template< typename Real, typename Device, typename Index,
+          typename Communicator, typename Anisotropy >
+bool
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
+goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
+      MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
+      const AnisotropyPointer& anisotropy )
 {
   bool calculated = false;
   const MeshType& mesh = aux.getMesh();
@@ -548,97 +434,15 @@ __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, I
   return calculated;
 }
 
-template < typename Index >
-__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-      TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks )
-{
-  int i = threadIdx.x;
-  int blId = blockIdx.x;
-  int blockSize = blockDim.x;
-  /*if ( i == 0 && blId == 0 ){
-    printf( "nBlocks = %d\n", nBlocks );
-    for( int j = nBlocks-1; j > -1 ; j--){
-      printf( "%d: cislo = %d \n", j, BlockIterDevice[ j ] );
-    }
-  }*/
-
-  __shared__ int sArray[ 1024 ];
-  sArray[ i ] = 0;
-  if( blId * 1024 + i < nBlocks )
-    sArray[ i ] = BlockIterDevice[ blId * 1024 + i ];
-  __syncthreads();
-  /*if ( i == 0 && blId == 0 ){
-    printf( "nBlocks = %d\n", nBlocks );
-    for( int j = 4; j > -1 ; j--){
-      printf( "%d: cislo = %d \n", j, sArray[ j ] );
-    }
-  }*/
-  /*extern __shared__ volatile int sArray[];
-  unsigned int i = threadIdx.x;
-  unsigned int gid = blockIdx.x * blockSize * 2 + threadIdx.x;
-  unsigned int gridSize = blockSize * 2 * gridDim.x;
-  sArray[ i ] = 0;
-  while( gid < nBlocks )
-  {
-    sArray[ i ] += BlockIterDevice[ gid ] + BlockIterDevice[ gid + blockSize ];
-    gid += gridSize;
-  }
-  __syncthreads();*/
-
-  if ( blockSize == 1024) {
-    if (i < 512)
-      sArray[ i ] += sArray[ i + 512 ];
-  }
-  __syncthreads();
-  if (blockSize >= 512) {
-    if (i < 256) {
-      sArray[ i ] += sArray[ i + 256 ];
-    }
-  }
-  __syncthreads();
-  if (blockSize >= 256) {
-    if (i < 128) {
-      sArray[ i ] += sArray[ i + 128 ];
-    }
-  }
-  __syncthreads();
-  if (blockSize >= 128) {
-    if (i < 64) {
-      sArray[ i ] += sArray[ i + 64 ];
-    }
-  }
-  __syncthreads();
-  if (i < 32 )
-  {
-    if( blockSize >= 64 ){ sArray[ i ] += sArray[ i + 32 ];}
-    __syncthreads();
-    if( blockSize >= 32 ){ sArray[ i ] += sArray[ i + 16 ];}
-    __syncthreads();
-    if( blockSize >= 16 ){ sArray[ i ] += sArray[ i + 8 ];}
-    if( blockSize >= 8 ){ sArray[ i ] += sArray[ i + 4 ];}
-    __syncthreads();
-    if( blockSize >= 4 ){ sArray[ i ] += sArray[ i + 2 ];}
-    __syncthreads();
-    if( blockSize >= 2 ){ sArray[ i ] += sArray[ i + 1 ];}
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if( i == 0 )
-    dBlock[ blId ] = sArray[ 0 ];
-}
-template < int sizeSArray, typename Real, typename Device, typename Index >
-__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
-      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-      CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) );
-  TNL_CHECK_CUDA_DEVICE;
-
-  CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
-  TNL_CHECK_CUDA_DEVICE;
+#ifdef HAVE_MPI
+template< typename Real, typename Device, typename Index,
+          typename Communicator, typename Anisotropy >
+void
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
+getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
 {
   Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
 
@@ -687,4 +491,3 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
           calculateFromNeighbours[2] || calculateFromNeighbours[3];
 }
 #endif
-      TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock )
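In the rewritten CUDA loop above, the old copies auxPtr = helpFunc are replaced by pointer swaps: each pass reads aux and writes helpFunc, the two are swapped, and after the loop one extra swap is done when the number of passes is odd, so the newest values end up back in aux. The following is a small host-side sketch of that double-buffering pattern, with std::vector buffers and a callback standing in for the TNL mesh-function pointers and the kernel launch; it illustrates the idea under those assumptions rather than reproducing the solver's code.

#include <utility>
#include <vector>

// Sketch only: 'update' reads 'in' and writes the new iterate into 'out',
// returning true while another pass is needed. The real solver swaps TNL
// shared pointers ( auxPtr.swap( helpFunc ) ) instead of raw buffers.
void sweepUntilDone( std::vector< double >& aux, std::vector< double >& help,
                     bool (*update)( const std::vector< double >&, std::vector< double >& ) )
{
   help = aux;                               // both buffers start identical, like helpFunc = aux in the patch
   std::vector< double >* in = &aux;
   std::vector< double >* out = &help;
   int numIter = 0;
   bool calculateAgain = true;
   while( calculateAgain )
   {
      calculateAgain = update( *in, *out );  // newest values land in *out
      std::swap( in, out );                  // they become the input of the next pass
      numIter++;
   }
   // After an odd number of passes the newest values sit in 'help',
   // so one more swap puts them back into 'aux' for the caller.
   if( numIter % 2 == 1 )
      std::swap( aux, help );
}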
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 4895c7693772a9f33458a119506668a493996ec0..325b626f7bf5262637f8e1b43ec9e156bbeca26b 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -262,24 +262,86 @@ solve( const MeshPointer& mesh,
      // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2)
      const int cudaBlockSize( 8 );
 
-     CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
-           interfaceMapPtr.template getData< Device >(),
-           auxPtr.template getData< Device>(),
-           helpFunc.template modifyData< Device>(),
-           BlockIterDevice.getView() );
-     cudaDeviceSynchronize();
-     TNL_CHECK_CUDA_DEVICE;
+     // Getting the number of blocks of the grid in each direction (without overlaps, because we don't calculate on overlaps)
+     int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
+     int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
+     int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize );
+     if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
+       std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
 
-     GetNeighbours3D<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ );
-     cudaDeviceSynchronize();
-     TNL_CHECK_CUDA_DEVICE;
-     BlockIterDevice = BlockIterPom;
+     // Setting up the launch configuration for the global function CudaUpdateCellCaller.
+     dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+     dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
 
-     CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY * numBlocksZ ) );
-     cudaDeviceSynchronize();
-     TNL_CHECK_CUDA_DEVICE;
+     BaseType ptr; // tnlDirectEikonalMethodsBase object for calling member functions inside CudaUpdateCellCaller
+
+
+     int BlockIterD = 1; // variable that tells us whether we should run the main CUDA body again
+
+     // Array containing information about each block of the grid, answering the question: have we calculated in this block?
+     TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice( numBlocksX * numBlocksY * numBlocksZ );
+     BlockIterDevice.setValue( 1 ); // calculate all in the first passage
+
+     // Helper array for GetNeighbours
+     TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom( numBlocksX * numBlocksY * numBlocksZ );
+     BlockIterPom.setValue( 0 ); // the initial value doesn't matter
+
+
+
+     // number of CUDA blocks (1024 threads each) needed for the GetNeighbours kernel
+     int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0);
+
+
+     //MeshFunctionPointer helpFunc1( mesh );
+     MeshFunctionPointer helpFunc( mesh );
+     helpFunc.template modifyData() = auxPtr.template getData();
+     Devices::Cuda::synchronizeDevice();
+
+     int numIter = 0; // number of passes of the following while cycle
 
-     CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
+     while( BlockIterD ) // main body of the CUDA code
+     {
+
+       Devices::Cuda::synchronizeDevice();
+       // main function that calculates all values in each block
+       // calculated values are in helpFunc
+       CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
+             interfaceMapPtr.template getData< Device >(),
+             auxPtr.template getData< Device>(),
+             helpFunc.template modifyData< Device>(),
+             BlockIterDevice.getView(), vecLowerOverlaps, vecUpperOverlaps );
+       cudaDeviceSynchronize();
+       TNL_CHECK_CUDA_DEVICE;
+       // Swapping the helpFunc and auxPtr pointers so the real results, stored in helpFunc's memory, are now accessible under the variable auxPtr
+       auxPtr.swap( helpFunc );
+
+       Devices::Cuda::synchronizeDevice();
+       // Neighbours of blocks that calculated in this pass should calculate in the next one!
+       // BlockIterDevice contains the blocks that calculated in this pass and BlockIterPom those that should calculate in the next one (their neighbours)
+       GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ );
+       cudaDeviceSynchronize();
+       TNL_CHECK_CUDA_DEVICE;
+       BlockIterDevice = BlockIterPom;
+       Devices::Cuda::synchronizeDevice();
+
+       // .containsValue(1) is actually a parallel reduction implemented in TNL
+       BlockIterD = BlockIterDevice.containsValue(1);
+       cudaDeviceSynchronize();
+       TNL_CHECK_CUDA_DEVICE;
+
+       numIter++;
+       if( BlockIterD ){
+         // if we calculated in this pass, we should send the info via MPI so neighbours should calculate after synchronization
+         calculatedBefore = 1;
+       }
+     }
+     if( numIter%2 == 1 ){
+
+       // We need auxPtr to point to the memory of the original auxPtr (not to helpFunc)
+       // the last pass of the previous while cycle didn't calculate any values anyway, so swapping the names doesn't affect the result
+       auxPtr.swap( helpFunc );
+       Devices::Cuda::synchronizeDevice();
+     }
      cudaDeviceSynchronize();
      TNL_CHECK_CUDA_DEVICE;
      aux = *auxPtr;
@@ -375,10 +437,15 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
   return calculated;
 }
 
-template < typename Index >
-__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-      TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
-      int numBlockX, int numBlockY, int numBlockZ )
+
+
+
+#ifdef HAVE_MPI
+template< typename Real, typename Device, typename Index,
+          typename Communicator, typename Anisotropy >
+void
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
+getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
 {
   Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
 
@@ -397,22 +464,6 @@ __global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda,
     requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup );
   }
 
-}
-
-template < int sizeSArray, typename Real, typename Device, typename Index >
-__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
-      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-      TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice )
-{
-  int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
-  int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
-  int i = threadIdx.x + blockDim.x*blockIdx.x + vLower[0]; // WITH OVERLAPS!!! i,j,k aren't coordinates of all values
-  int j = blockDim.y*blockIdx.y + threadIdx.y + vLower[1];
-  int k = blockDim.z*blockIdx.z + threadIdx.z + vLower[2];
-  int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
-  const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
   if( neighbours[1] != -1 ) // EAST
   {
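Both the 2D and the 3D solver also drop the hand-written CudaParallelReduc kernel; the stop condition is now containsValue( 1 ) on the block-indicator array, which TNL evaluates as a parallel reduction on the device. A hedged sketch of the resulting control flow is shown below; the function names are illustrative, the kernel launches are elided, and only the Array/containsValue calls are taken from the patch.

#include <TNL/Containers/Array.h>
#include <TNL/Devices/Cuda.h>

// Sketch only: one flag per CUDA block of the grid; the loop stops once no
// block reports that it still needs to be recomputed.
bool anyBlockActive( const TNL::Containers::Array< int, TNL::Devices::Cuda, int >& blockIndicator )
{
   // containsValue() is evaluated as a device-side reduction by TNL,
   // which is what replaces the removed CudaParallelReduc kernel.
   return blockIndicator.containsValue( 1 );
}

void mainLoopSketch( TNL::Containers::Array< int, TNL::Devices::Cuda, int >& blockIndicator )
{
   int calculateAgain = 1;
   while( calculateAgain )
   {
      // ... launch CudaUpdateCellCaller and GetNeighbours here ...
      calculateAgain = anyBlockActive( blockIndicator ) ? 1 : 0;
   }
}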