diff --git a/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
index e3f47292306bbb879bf2677223f2f4b63cc7513c..3f0c9194867d011724e2387c882420e04e798ee6 100644
--- a/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
+++ b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
@@ -490,7 +490,7 @@ getExplicitUpdate( const RealType& time,
 
       //std::cerr << "Setting boundary conditions..." << std::endl;
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
          for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
            boundaryConditionsTemplatedCompact< MeshType, CellType, BoundaryCondition, MeshFunctionType >
@@ -594,7 +594,7 @@ getExplicitUpdate( const RealType& time,
            gridYSize / 16 + ( gridYSize % 16 != 0 ) );
      */
 
-      TNL::Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
      int cudaErr;
      Meshes::Traverser< MeshType, Cell > meshTraverser;
      meshTraverser.template processInteriorEntities< UserData,
diff --git a/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h b/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h
index 2a77f8bb5287a75cee1b38356c96dd49dbff5250..c9fe0e43be20b175321dc1b50c563fb48e843b5d 100644
--- a/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h
+++ b/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h
@@ -246,7 +246,7 @@ processEntities(
       IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 );
       Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
       dim3 gridIdx, cudaGridSize;
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
       {
          Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
@@ -273,7 +273,7 @@ processEntities(
       auto& pool = Cuda::StreamPool::getInstance();
       const cudaStream_t& s = pool.getStream( stream );
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       dim3 gridIdx, cudaGridSize;
       for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
          for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index 6661c5f6a8e720df9a265ca5cb3c5b97717294d5..ffb2f121a2b68719e6e07c29ce404384aa715b23 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -130,7 +130,7 @@ benchmarkIterativeSolvers( Benchmark& benchmark,
    *cudaMatrixPointer = *matrixPointer;
 
    // synchronize shared pointers
-   Devices::Cuda::synchronizeDevice();
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 #endif
 
    using namespace Solvers::Linear;
diff --git a/src/TNL/Devices/Cuda.h b/src/TNL/Devices/Cuda.h
index 853cd2e038c669485781a1462cdb4a54a9670e35..fc924ccc9f054cd5b55c0e21ab836dbfa2b52687 100644
--- a/src/TNL/Devices/Cuda.h
+++ b/src/TNL/Devices/Cuda.h
@@ -10,13 +10,7 @@
 
 #pragma once
 
-#include <iostream>
-
 #include <TNL/String.h>
-#include <TNL/Assert.h>
-#include <TNL/Pointers/SmartPointersRegister.h>
-#include <TNL/Timer.h>
-#include <TNL/Cuda/CudaCallable.h>
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Config/ParameterContainer.h>
 
@@ -33,16 +27,6 @@ public:
 
    static inline constexpr int getGPUTransferBufferSize();
 
-   static inline void insertSmartPointer( Pointers::SmartPointer* pointer );
-
-   static inline void removeSmartPointer( Pointers::SmartPointer* pointer );
-
-   // Negative deviceId means that CudaDeviceInfo::getActiveDevice will be
-   // called to get the device ID.
-   static inline bool synchronizeDevice( int deviceId = -1 );
-
-   static inline Timer& getSmartPointersSynchronizationTimer();
-
    ////
    // When we transfer data between the GPU and the CPU we use 5 MB buffer. This
    // size should ensure good performance -- see.
@@ -50,11 +34,6 @@ public:
    // We use the same buffer size even for retyping data during IO operations.
    //
    static constexpr std::size_t TransferBufferSize = 5 * 2<<20;
-
-
-   protected:
-
-   static inline Pointers::SmartPointersRegister& getSmartPointersRegister();
 };
 
 #ifdef HAVE_CUDA
diff --git a/src/TNL/Devices/Cuda_impl.h b/src/TNL/Devices/Cuda_impl.h
index 6d3daa356cb64f2373451a260a1403aabff83be6..7a4d59fcc0a87d224867f9eab930cc6bbf2c06d1 100644
--- a/src/TNL/Devices/Cuda_impl.h
+++ b/src/TNL/Devices/Cuda_impl.h
@@ -10,12 +10,15 @@
 
 #pragma once
 
+#include <iostream>
+
 #include <TNL/Math.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Cuda/DeviceInfo.h>
 #include <TNL/Exceptions/CudaBadAlloc.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 #include <TNL/Exceptions/CudaRuntimeError.h>
+#include <TNL/Pointers/SmartPointersRegister.h>
 
 namespace TNL {
 namespace Devices {
@@ -42,8 +45,8 @@ Cuda::setup( const Config::ParameterContainer& parameters,
       std::cerr << "I cannot activate CUDA device number " << cudaDevice << "." << std::endl;
       return false;
    }
-   getSmartPointersSynchronizationTimer().reset();
-   getSmartPointersSynchronizationTimer().stop();
+   Pointers::getSmartPointersSynchronizationTimer< Devices::Cuda >().reset();
+   Pointers::getSmartPointersSynchronizationTimer< Devices::Cuda >().stop();
 #endif
    return true;
 }
@@ -53,42 +56,6 @@ inline constexpr int Cuda::getGPUTransferBufferSize()
    return 1 << 20;
 }
 
-inline void Cuda::insertSmartPointer( Pointers::SmartPointer* pointer )
-{
-   getSmartPointersRegister().insert( pointer, TNL::Cuda::DeviceInfo::getActiveDevice() );
-}
-
-inline void Cuda::removeSmartPointer( Pointers::SmartPointer* pointer )
-{
-   getSmartPointersRegister().remove( pointer, TNL::Cuda::DeviceInfo::getActiveDevice() );
-}
-
-inline bool Cuda::synchronizeDevice( int deviceId )
-{
-#ifdef HAVE_CUDA
-   if( deviceId < 0 )
-      deviceId = TNL::Cuda::DeviceInfo::getActiveDevice();
-   getSmartPointersSynchronizationTimer().start();
-   bool b = getSmartPointersRegister().synchronizeDevice( deviceId );
-   getSmartPointersSynchronizationTimer().stop();
-   return b;
-#else
-   return true;
-#endif
-}
-
-inline Timer& Cuda::getSmartPointersSynchronizationTimer()
-{
-   static Timer timer;
-   return timer;
-}
-
-inline Pointers::SmartPointersRegister& Cuda::getSmartPointersRegister()
-{
-   static Pointers::SmartPointersRegister reg;
-   return reg;
-}
-
 // double-precision atomicAdd function for Maxwell and older GPUs
 // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
 #ifdef HAVE_CUDA
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h
index 4dac64a23fc50d85186e2a1f118f818807d0fc87..49cda643cc73a6717f47d877315b980e5589d60c 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h
@@ -25,7 +25,7 @@ initInterface( const MeshFunctionPointer& _input,
       int numBlocksX = Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
       dim3 blockSize( cudaBlockSize );
       dim3 gridSize( numBlocksX );
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
                                                  _output.template modifyData< Device >(),
                                                  _interfaceMap.template modifyData< Device >() );
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
index 947a4be062a9b8763ec9a6042c41d3048aa8a361..b18252cb078b803d6e911ca130b9b161c241ff4d 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
@@ -29,7 +29,7 @@ initInterface( const MeshFunctionPointer& _input,
       int numBlocksY = Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
       dim3 blockSize( cudaBlockSize, cudaBlockSize );
       dim3 gridSize( numBlocksX, numBlocksY );
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
                                                  _output.template modifyData< Device >(),
                                                  _interfaceMap.template modifyData< Device >(),
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
index eb0665c7e4a66a8424af4a312d68f0f3a039a934..fd7dc9381ec1325f108078271df343953be58ebd 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
@@ -30,7 +30,7 @@ initInterface( const MeshFunctionPointer& _input,
         std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!"
                   << std::endl;
       dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
       dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
                                                    _output.template modifyData< Device >(),
                                                    _interfaceMap.template modifyData< Device >(), vLower, vUpper );
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index c5a0f74cc53f89808f5308aa628c6c55802eb200..1b1666a02b627778bd8364a7e46f5e22718ff76f 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -316,7 +316,7 @@ solve( const MeshPointer& mesh,
 
         /** HERE IS FIM FOR MPI AND WITHOUT MPI **/
-        Devices::Cuda::synchronizeDevice();
+        Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
                 interfaceMapPtr.template getData< Device >(),
                 auxPtr.template getData< Device>(),
                 helpFunc.template modifyData< Device>(),
                 blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps );
@@ -327,7 +327,7 @@ solve( const MeshPointer& mesh,
         auxPtr.swap( helpFunc );
 
         // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now.
-        Devices::Cuda::synchronizeDevice();
+        Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
         GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
@@ -349,7 +349,7 @@ solve( const MeshPointer& mesh,
         if( numIter%2 == 1 ) // Need to check parity for MPI overlaps to synchronize ( otherwise doesnt work )
         {
           helpFunc.swap( auxPtr );
-          Devices::Cuda::synchronizeDevice();
+          Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
         }
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 3fce5564efdf65ebaf45c62eaed8cae259d1c4b3..82185a937d832b0785b597188aeb0989ab751d47 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -295,14 +295,14 @@ solve( const MeshPointer& mesh,
       //MeshFunctionPointer helpFunc1( mesh );
       MeshFunctionPointer helpFunc( mesh );
       helpFunc.template modifyData() = auxPtr.template getData();
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 
       int numIter = 0; // number of passages of following while cycle
 
       while( BlockIterD ) //main body of cuda code
       {
-        Devices::Cuda::synchronizeDevice();
+        Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
        // main function that calculates all values in each blocks
        // calculated values are in helpFunc
         CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
@@ -315,14 +315,14 @@ solve( const MeshPointer& mesh,
 
        // Switching pointers to helpFunc and auxPtr so real results are in memory of helpFunc but here under variable auxPtr
         auxPtr.swap( helpFunc );
-        Devices::Cuda::synchronizeDevice();
+        Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 
        // Neighbours of blocks that calculatedBefore in this passage should calculate in the next!
        // BlockIterDevice contains blocks that calculatedBefore in this passage and BlockIterPom those that should calculate in next (are neighbours)
         GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         BlockIterDevice = BlockIterPom;
-        Devices::Cuda::synchronizeDevice();
+        Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 
        // .containsValue(1) is actually parallel reduction implemented in TNL
         BlockIterD = BlockIterDevice.containsValue(1);
@@ -340,7 +340,7 @@ solve( const MeshPointer& mesh,
       // We need auxPtr to point on memory of original auxPtr (not to helpFunc)
       // last passage of previous while cycle didnt calculate any number anyway so switching names doesnt effect values
         auxPtr.swap( helpFunc );
-        Devices::Cuda::synchronizeDevice();
+        Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
     }
     cudaDeviceSynchronize();
     TNL_CHECK_CUDA_DEVICE;
diff --git a/src/TNL/Matrices/SparseOperations_impl.h b/src/TNL/Matrices/SparseOperations_impl.h
index b6d118bf22e39c9ee345269c2a2cc28760b3a198..ce7caaf3297dc869b6b0ae969d567c939e5b9ebb 100644
--- a/src/TNL/Matrices/SparseOperations_impl.h
+++ b/src/TNL/Matrices/SparseOperations_impl.h
@@ -140,7 +140,7 @@ copySparseMatrix_impl( Matrix1& A, const Matrix2& B )
       const Pointers::DevicePointer< const Matrix2 > Bpointer( B );
 
       // set row lengths
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
             rowLengths.getData(),
             &Bpointer.template getData< TNL::Devices::Cuda >(),
@@ -150,7 +150,7 @@ copySparseMatrix_impl( Matrix1& A, const Matrix2& B )
       Apointer->setCompressedRowLengths( rowLengths );
 
       // copy rows
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
             &Apointer.template modifyData< TNL::Devices::Cuda >(),
             &Bpointer.template getData< TNL::Devices::Cuda >(),
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
index 796ffe491d572c108f7b8072769e42e9d5c9571d..c1aab9660d50ee8fe6917ae08c6bb869e333c056 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
@@ -185,7 +185,7 @@ processEntities(
       auto& pool = Cuda::StreamPool::getInstance();
       const cudaStream_t& s = pool.getStream( stream );
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       if( processOnlyBoundaryEntities )
       {
          dim3 cudaBlockSize( 2 );
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
index 15c5a0eda13a10bd512fe69930bed1e9c90f068d..721ec96d2331c103cb0179e5bd77b224b700c28f 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
@@ -418,8 +418,8 @@ processEntities(
       Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
 
       auto& pool = Cuda::StreamPool::getInstance();
-      Devices::Cuda::synchronizeDevice();
-
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+
       const cudaStream_t& s1 = pool.getStream( stream );
       const cudaStream_t& s2 = pool.getStream( stream + 1 );
       dim3 gridIdx, cudaGridSize;
@@ -486,7 +486,7 @@ processEntities(
       //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount
       //          << "cudaBlockCount = " << cudaBlocksCount.x << std::endl;
       dim3 gridIdx, cudaGridSize;
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
       {
         Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
@@ -518,7 +518,7 @@ processEntities(
       auto& pool = Cuda::StreamPool::getInstance();
       const cudaStream_t& s = pool.getStream( stream );
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       dim3 gridIdx, cudaGridSize;
       for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
          for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
index 489e58d779b57e7b5afea4b2d99516fb23b04f31..a9aad8c9533dfecdc6e5410be51705d24438725c 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
@@ -346,8 +346,8 @@ processEntities(
       Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 );
 
       auto& pool = Cuda::StreamPool::getInstance();
-      Devices::Cuda::synchronizeDevice();
-
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+
       const cudaStream_t& s1 = pool.getStream( stream );
       const cudaStream_t& s2 = pool.getStream( stream + 1 );
       const cudaStream_t& s3 = pool.getStream( stream + 2 );
@@ -458,7 +458,7 @@ processEntities(
       auto& pool = Cuda::StreamPool::getInstance();
       const cudaStream_t& s = pool.getStream( stream );
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       dim3 gridIdx, gridSize;
       for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ )
          for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
diff --git a/src/TNL/Meshes/MeshDetails/Traverser_impl.h b/src/TNL/Meshes/MeshDetails/Traverser_impl.h
index 27cbba714160986d156c4c5e828274df53fb1bf3..33832d4f10944d51c5bb01646d2a239c86365f7f 100644
--- a/src/TNL/Meshes/MeshDetails/Traverser_impl.h
+++ b/src/TNL/Meshes/MeshDetails/Traverser_impl.h
@@ -164,7 +164,7 @@ processBoundaryEntities( const MeshPointer& meshPointer,
       const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
       gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       MeshTraverserBoundaryEntitiesKernel< EntitiesDimension, EntitiesProcessor >
          <<< gridSize, blockSize >>>
         ( &meshPointer.template getData< Devices::Cuda >(),
@@ -195,7 +195,7 @@ processInteriorEntities( const MeshPointer& meshPointer,
       const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
       gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       MeshTraverserInteriorEntitiesKernel< EntitiesDimension, EntitiesProcessor >
         <<< gridSize, blockSize >>>
         ( &meshPointer.template getData< Devices::Cuda >(),
@@ -226,7 +226,7 @@ processAllEntities( const MeshPointer& meshPointer,
       const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
       gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       MeshTraverserAllEntitiesKernel< EntitiesDimension, EntitiesProcessor >
         <<< gridSize, blockSize >>>
         ( &meshPointer.template getData< Devices::Cuda >(),
diff --git a/src/TNL/Pointers/DevicePointer.h b/src/TNL/Pointers/DevicePointer.h
index d3bf9b07f9b2c59030433e1bca633a2601bb73d0..5276c3ed465938e7e7fcdfde2885dc8986cac3b5 100644
--- a/src/TNL/Pointers/DevicePointer.h
+++ b/src/TNL/Pointers/DevicePointer.h
@@ -16,6 +16,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Pointers/SmartPointer.h>
+#include <TNL/Pointers/SmartPointersRegister.h>
 #include <TNL/TypeInfo.h>
 #include <TNL/Cuda/MemoryHelpers.h>
 
@@ -406,7 +407,7 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
      ~DevicePointer()
      {
         this->free();
-        Devices::Cuda::removeSmartPointer( this );
+        getSmartPointersRegister< DeviceType >().remove( this );
      }
 
   protected:
@@ -426,7 +427,7 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
         this->cuda_pointer = Cuda::passToDevice( *this->pointer );
         // set last-sync state
         this->set_last_sync_state();
-        Devices::Cuda::insertSmartPointer( this );
+        getSmartPointersRegister< DeviceType >().insert( this );
         return true;
      }
diff --git a/src/TNL/Pointers/SharedPointerCuda.h b/src/TNL/Pointers/SharedPointerCuda.h
index 27a4aabef4ed7e1383ae5f141e1c7e56a6f5d350..54dd4ee3c71c7eb461de7d8c906fd42dc96af1c6 100644
--- a/src/TNL/Pointers/SharedPointerCuda.h
+++ b/src/TNL/Pointers/SharedPointerCuda.h
@@ -16,6 +16,7 @@
 
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Pointers/SmartPointer.h>
+#include <TNL/Pointers/SmartPointersRegister.h>
 #include <TNL/Cuda/MemoryHelpers.h>
 
 #include <cstring>   // std::memcpy, std::memcmp
@@ -546,7 +547,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
      ~SharedPointer()
      {
         this->free();
-        Devices::Cuda::removeSmartPointer( this );
+        getSmartPointersRegister< DeviceType >().remove( this );
      }
 
   protected:
@@ -577,7 +578,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 #ifdef TNL_DEBUG_SHARED_POINTERS
         std::cerr << "Created shared pointer to " << getType< ObjectType >() << " (cuda_pointer = " << this->cuda_pointer << ")" << std::endl;
 #endif
-        Devices::Cuda::insertSmartPointer( this );
+        getSmartPointersRegister< DeviceType >().insert( this );
         return true;
      }
diff --git a/src/TNL/Pointers/SmartPointersRegister.h b/src/TNL/Pointers/SmartPointersRegister.h
index ad716b9c036ee011dca583b2557247ce0d110453..5094c1c0ee26887262f02656be7efd9f6b923345 100644
--- a/src/TNL/Pointers/SmartPointersRegister.h
+++ b/src/TNL/Pointers/SmartPointersRegister.h
@@ -2,7 +2,7 @@
                           SmartPointersRegister.h  -  description
                              -------------------
     begin                : Apr 29, 2016
-    copyright            : (C) 2016 by Tomas Oberhuber
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -12,24 +12,43 @@
 
 #include <unordered_set>
 #include <unordered_map>
+
 #include <TNL/Pointers/SmartPointer.h>
-#include <TNL/Assert.h>
+#include <TNL/Timer.h>
+#include <TNL/Cuda/DeviceInfo.h>
 
 namespace TNL {
 namespace Pointers {
 
+// Since TNL currently supports only execution on host (which does not need
+// to register and synchronize smart pointers) and CUDA GPUs, the smart
+// pointers register is implemented only for CUDA. If more execution types
+// which need to register smart pointers are implemented in the future, this
+// should become a class template specialization.
 class SmartPointersRegister
 {
 
    public:
 
-      void insert( SmartPointer* pointer, int deviceId )
+      /**
+       * Negative deviceId means that \ref Cuda::DeviceInfo::getActiveDevice will be
+       * called to get the device ID.
+       */
+      void insert( SmartPointer* pointer, int deviceId = -1 )
       {
+         if( deviceId < 0 )
+            deviceId = Cuda::DeviceInfo::getActiveDevice();
          pointersOnDevices[ deviceId ].insert( pointer );
       }
 
-      void remove( SmartPointer* pointer, int deviceId )
+      /**
+       * Negative deviceId means that \ref Cuda::DeviceInfo::getActiveDevice will be
+       * called to get the device ID.
+       */
+      void remove( SmartPointer* pointer, int deviceId = -1 )
       {
+         if( deviceId < 0 )
+            deviceId = Cuda::DeviceInfo::getActiveDevice();
         try {
            pointersOnDevices.at( deviceId ).erase( pointer );
        }
@@ -41,8 +60,14 @@ class SmartPointersRegister
        }
     }
 
-      bool synchronizeDevice( int deviceId )
+      /**
+       * Negative deviceId means that \ref Cuda::DeviceInfo::getActiveDevice will be
+       * called to get the device ID.
+       */
+      bool synchronizeDevice( int deviceId = -1 )
      {
+         if( deviceId < 0 )
+            deviceId = Cuda::DeviceInfo::getActiveDevice();
         try {
            const auto & set = pointersOnDevices.at( deviceId );
            for( auto&& it : set )
@@ -61,5 +86,34 @@ class SmartPointersRegister
       std::unordered_map< int, SetType > pointersOnDevices;
 };
 
+
+// TODO: Device -> Allocator (in all smart pointers)
+template< typename Device >
+SmartPointersRegister& getSmartPointersRegister()
+{
+   static SmartPointersRegister reg;
+   return reg;
+}
+
+template< typename Device >
+Timer& getSmartPointersSynchronizationTimer()
+{
+   static Timer timer;
+   return timer;
+}
+
+/**
+ * Negative deviceId means that the ID of the currently active device will be
+ * determined automatically.
+ */
+template< typename Device >
+bool synchronizeSmartPointersOnDevice( int deviceId = -1 )
+{
+   getSmartPointersSynchronizationTimer< Device >().start();
+   bool b = getSmartPointersRegister< Device >().synchronizeDevice( deviceId );
+   getSmartPointersSynchronizationTimer< Device >().stop();
+   return b;
+}
+
 } // namespace Pointers
 } // namespace TNL
diff --git a/src/TNL/Pointers/UniquePointer.h b/src/TNL/Pointers/UniquePointer.h
index 9a7a2b677769fde29d49419141a76cd9cd6f4373..071de4d51132fa6b71e0e6b86ab16acd3c8269c8 100644
--- a/src/TNL/Pointers/UniquePointer.h
+++ b/src/TNL/Pointers/UniquePointer.h
@@ -16,6 +16,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Pointers/SmartPointer.h>
+#include <TNL/Pointers/SmartPointersRegister.h>
 #include <TNL/Cuda/MemoryHelpers.h>
 
 #include <cstring>   // std::memcpy, std::memcmp
@@ -250,7 +251,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
      ~UniquePointer()
      {
         this->free();
-        Devices::Cuda::removeSmartPointer( this );
+        getSmartPointersRegister< DeviceType >().remove( this );
      }
 
   protected:
@@ -276,7 +277,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
         this->cuda_pointer = Cuda::passToDevice( this->pd->data );
         // set last-sync state
         this->set_last_sync_state();
-        Devices::Cuda::insertSmartPointer( this );
+        getSmartPointersRegister< DeviceType >().insert( this );
         return true;
      }
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
index 626469920ff9e08d7f935e13017086b7cd583081..be9e37f23d8d1fa7e1250d2ffc91ec0e8870e9fa 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
@@ -282,7 +282,7 @@ allocate_LU()
    U->setDimensions( N, N );
 
    // extract raw pointer
-   Devices::Cuda::synchronizeDevice();
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
    const CSR* kernel_A = &A.template getData< DeviceType >();
 
    // copy row lengths
@@ -329,7 +329,7 @@ copy_triangular_factors()
    const int N = A->getRows();
 
    // extract raw pointers
-   Devices::Cuda::synchronizeDevice();
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
    CSR* kernel_L = &L.template modifyData< DeviceType >();
    CSR* kernel_U = &U.template modifyData< DeviceType >();
    const CSR* kernel_A = &A.template getData< DeviceType >();
diff --git a/src/TNL/Solvers/SolverStarter_impl.h b/src/TNL/Solvers/SolverStarter_impl.h
index e52d03a4f7377c5fc80ade50d7981a3a07a48f08..8b323d5d745e08216bc3477adc03d3495bce8dce 100644
--- a/src/TNL/Solvers/SolverStarter_impl.h
+++ b/src/TNL/Solvers/SolverStarter_impl.h
@@ -406,7 +406,7 @@ bool SolverStarter< ConfigTag > :: writeEpilog( std::ostream& str, const Solver&
    if( std::is_same< typename Solver::DeviceType, TNL::Devices::Cuda >::value )
    {
       logger.writeParameter< const char* >( "GPU synchronization time:", "" );
-      TNL::Devices::Cuda::getSmartPointersSynchronizationTimer().writeLog( logger, 1 );
+      Pointers::getSmartPointersSynchronizationTimer< Devices::Cuda >().writeLog( logger, 1 );
   }
   logger.writeParameter< const char* >( "I/O time:", "" );
   this->ioTimer.writeLog( logger, 1 );
diff --git a/src/UnitTests/Pointers/SharedPointerCudaTest.cu b/src/UnitTests/Pointers/SharedPointerCudaTest.cu
index c0d76b2cc050d074831a4a6065d71b99ea24a7e9..83b6b4793bf6d5e17f6587b71c60261e8b80cea0 100644
--- a/src/UnitTests/Pointers/SharedPointerCudaTest.cu
+++ b/src/UnitTests/Pointers/SharedPointerCudaTest.cu
@@ -55,7 +55,7 @@ TEST( SharedPointerCudaTest, getDataTest )
    ASSERT_EQ( ptr1->y(), 2 );
 #else
-   Devices::Cuda::synchronizeDevice();
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 
    TestType aux;
 
@@ -89,7 +89,7 @@ TEST( SharedPointerCudaTest, getDataArrayTest )
    ptr->setElement( 0, 1 );
    ptr->setElement( 1, 2 );
 
-   Devices::Cuda::synchronizeDevice();
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 
    int *testArray_device, *testArray_host;
    cudaMalloc( ( void** ) &testArray_device, 2 * sizeof( int ) );
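
Usage note (an illustration, not part of the patch): the sketch below shows how a typical call site migrates from the removed Devices::Cuda::synchronizeDevice() to the new free functions in the TNL::Pointers namespace, following the same pattern as the kernel launches above. It relies only on API visible in this diff (SharedPointer, synchronizeSmartPointersOnDevice, getData, operator-> as used in SharedPointerCudaTest.cu, the synchronization timer); the Params struct, scaleKernel, and launch function are hypothetical names invented for the example, and the <TNL/Pointers/SharedPointer.h> header is assumed to be the public entry point for SharedPointerCuda.h.

#include <TNL/Devices/Cuda.h>
#include <TNL/Pointers/SharedPointer.h>
#include <TNL/Pointers/SmartPointersRegister.h>

using namespace TNL;

// Hypothetical parameter object with a host image and a device image.
struct Params
{
   double scale = 1.0;
};

#ifdef HAVE_CUDA
__global__ void scaleKernel( const Params* params, double* data, int size )
{
   const int i = blockIdx.x * blockDim.x + threadIdx.x;
   if( i < size )
      data[ i ] *= params->scale;
}
#endif

void launch( double* data, int size )
{
#ifdef HAVE_CUDA
   // Construction registers the pointer in
   // Pointers::getSmartPointersRegister< Devices::Cuda >() (see SharedPointerCuda.h above).
   Pointers::SharedPointer< Params, Devices::Cuda > params;

   // operator-> modifies the host image and marks the pointer for synchronization.
   params->scale = 0.5;

   // Before this patch: Devices::Cuda::synchronizeDevice();
   // Now a free function copies every modified host image to the GPU and
   // charges the time to getSmartPointersSynchronizationTimer< Devices::Cuda >().
   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();

   // getData< Devices::Cuda >() yields the device image, as in the traverser kernels above.
   scaleKernel<<< ( size + 255 ) / 256, 256 >>>(
      &params.getData< Devices::Cuda >(), data, size );
   cudaDeviceSynchronize();
#endif
}

The design point of the refactoring is visible here: synchronization no longer goes through static members of Devices::Cuda, so the Pointers layer owns its own per-device registry and timer, and Devices/Cuda.h sheds its dependency on the smart-pointer headers.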