Commit 1743358a authored by Jakub Klinkovský


Moved synchronization of smart pointers from Devices::Cuda into TNL::Pointers namespace as free functions

synchronizeDevice() was renamed to synchronizeSmartPointersOnDevice()
for clarity - there are many similarly named functions in CUDA (e.g.
cudaDeviceSynchronize()).
parent 2d5176fb
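
The user-facing change in a nutshell: every call site that used to invoke the static member function on Devices::Cuda now calls a free function template in the TNL::Pointers namespace. A minimal sketch of the migration (the surrounding function is hypothetical; the two synchronization calls are the ones this commit swaps):

```cpp
#include <TNL/Devices/Cuda.h>
#include <TNL/Pointers/SmartPointersRegister.h>   // new home of the free functions

using namespace TNL;

void launchKernels()
{
   // Before this commit:
   //    Devices::Cuda::synchronizeDevice();
   // After this commit:
   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
   // ... kernel launches that access smart-pointer data on the device follow ...
}
```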
......@@ -490,7 +490,7 @@ getExplicitUpdate( const RealType& time,
//std::cerr << "Setting boundary conditions..." << std::endl;
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
boundaryConditionsTemplatedCompact< MeshType, CellType, BoundaryCondition, MeshFunctionType >
......@@ -594,7 +594,7 @@ getExplicitUpdate( const RealType& time,
gridYSize / 16 + ( gridYSize % 16 != 0 ) );
*/
TNL::Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
int cudaErr;
Meshes::Traverser< MeshType, Cell > meshTraverser;
meshTraverser.template processInteriorEntities< UserData,
......
......@@ -246,7 +246,7 @@ processEntities(
IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 );
Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
dim3 gridIdx, cudaGridSize;
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
{
Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
......@@ -273,7 +273,7 @@ processEntities(
auto& pool = Cuda::StreamPool::getInstance();
const cudaStream_t& s = pool.getStream( stream );
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
dim3 gridIdx, cudaGridSize;
for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
......
......@@ -130,7 +130,7 @@ benchmarkIterativeSolvers( Benchmark& benchmark,
*cudaMatrixPointer = *matrixPointer;
// synchronize shared pointers
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
#endif
using namespace Solvers::Linear;
......
......@@ -10,13 +10,7 @@
#pragma once
#include <iostream>
#include <TNL/String.h>
#include <TNL/Assert.h>
#include <TNL/Pointers/SmartPointersRegister.h>
#include <TNL/Timer.h>
#include <TNL/Cuda/CudaCallable.h>
#include <TNL/Config/ConfigDescription.h>
#include <TNL/Config/ParameterContainer.h>
......@@ -33,16 +27,6 @@ public:
static inline constexpr int getGPUTransferBufferSize();
static inline void insertSmartPointer( Pointers::SmartPointer* pointer );
static inline void removeSmartPointer( Pointers::SmartPointer* pointer );
// Negative deviceId means that CudaDeviceInfo::getActiveDevice will be
// called to get the device ID.
static inline bool synchronizeDevice( int deviceId = -1 );
static inline Timer& getSmartPointersSynchronizationTimer();
////
// When we transfer data between the GPU and the CPU we use 5 MB buffer. This
// size should ensure good performance -- see.
......@@ -50,11 +34,6 @@ public:
// We use the same buffer size even for retyping data during IO operations.
//
static constexpr std::size_t TransferBufferSize = 5 * 2<<20;
protected:
static inline Pointers::SmartPointersRegister& getSmartPointersRegister();
};
#ifdef HAVE_CUDA
......
......@@ -10,12 +10,15 @@
#pragma once
#include <iostream>
#include <TNL/Math.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Cuda/DeviceInfo.h>
#include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Exceptions/CudaRuntimeError.h>
#include <TNL/Pointers/SmartPointersRegister.h>
namespace TNL {
namespace Devices {
......@@ -42,8 +45,8 @@ Cuda::setup( const Config::ParameterContainer& parameters,
std::cerr << "I cannot activate CUDA device number " << cudaDevice << "." << std::endl;
return false;
}
getSmartPointersSynchronizationTimer().reset();
getSmartPointersSynchronizationTimer().stop();
Pointers::getSmartPointersSynchronizationTimer< Devices::Cuda >().reset();
Pointers::getSmartPointersSynchronizationTimer< Devices::Cuda >().stop();
#endif
return true;
}
......@@ -53,42 +56,6 @@ inline constexpr int Cuda::getGPUTransferBufferSize()
return 1 << 20;
}
inline void Cuda::insertSmartPointer( Pointers::SmartPointer* pointer )
{
getSmartPointersRegister().insert( pointer, TNL::Cuda::DeviceInfo::getActiveDevice() );
}
inline void Cuda::removeSmartPointer( Pointers::SmartPointer* pointer )
{
getSmartPointersRegister().remove( pointer, TNL::Cuda::DeviceInfo::getActiveDevice() );
}
inline bool Cuda::synchronizeDevice( int deviceId )
{
#ifdef HAVE_CUDA
if( deviceId < 0 )
deviceId = TNL::Cuda::DeviceInfo::getActiveDevice();
getSmartPointersSynchronizationTimer().start();
bool b = getSmartPointersRegister().synchronizeDevice( deviceId );
getSmartPointersSynchronizationTimer().stop();
return b;
#else
return true;
#endif
}
inline Timer& Cuda::getSmartPointersSynchronizationTimer()
{
static Timer timer;
return timer;
}
inline Pointers::SmartPointersRegister& Cuda::getSmartPointersRegister()
{
static Pointers::SmartPointersRegister reg;
return reg;
}
// double-precision atomicAdd function for Maxwell and older GPUs
// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
#ifdef HAVE_CUDA
......
......@@ -25,7 +25,7 @@ initInterface( const MeshFunctionPointer& _input,
int numBlocksX = Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
dim3 blockSize( cudaBlockSize );
dim3 gridSize( numBlocksX );
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
_output.template modifyData< Device >(),
_interfaceMap.template modifyData< Device >() );
......
......@@ -29,7 +29,7 @@ initInterface( const MeshFunctionPointer& _input,
int numBlocksY = Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
dim3 blockSize( cudaBlockSize, cudaBlockSize );
dim3 gridSize( numBlocksX, numBlocksY );
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
_output.template modifyData< Device >(),
_interfaceMap.template modifyData< Device >(),
......
......@@ -30,7 +30,7 @@ initInterface( const MeshFunctionPointer& _input,
std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
_output.template modifyData< Device >(),
_interfaceMap.template modifyData< Device >(), vLower, vUpper );
......
......@@ -316,7 +316,7 @@ solve( const MeshPointer& mesh,
/** HERE IS FIM FOR MPI AND WITHOUT MPI **/
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(),
auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(),
blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps );
......@@ -327,7 +327,7 @@ solve( const MeshPointer& mesh,
auxPtr.swap( helpFunc );
// Get the blocks that should calculate in the next pass. These blocks are the neighbours of those calculated now.
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY );
cudaDeviceSynchronize();
TNL_CHECK_CUDA_DEVICE;
......@@ -349,7 +349,7 @@ solve( const MeshPointer& mesh,
if( numIter%2 == 1 ) // Need to check parity for MPI overlaps to synchronize (otherwise it doesn't work)
{
helpFunc.swap( auxPtr );
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
cudaDeviceSynchronize();
TNL_CHECK_CUDA_DEVICE;
}
......
......@@ -295,14 +295,14 @@ solve( const MeshPointer& mesh,
//MeshFunctionPointer helpFunc1( mesh );
MeshFunctionPointer helpFunc( mesh );
helpFunc.template modifyData() = auxPtr.template getData();
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
int numIter = 0; // number of passes of the following while cycle
while( BlockIterD ) // main body of the CUDA code
{
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
// main function that calculates all values in each block
// calculated values are stored in helpFunc
CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
......@@ -315,14 +315,14 @@ solve( const MeshPointer& mesh,
// Swap helpFunc and auxPtr so the real results, stored in helpFunc's memory, are accessible through the variable auxPtr
auxPtr.swap( helpFunc );
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
// Neighbours of the blocks calculated in this pass should calculate in the next one!
// BlockIterDevice contains the blocks calculated in this pass and BlockIterPom those that should calculate in the next (their neighbours)
GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ );
cudaDeviceSynchronize();
TNL_CHECK_CUDA_DEVICE;
BlockIterDevice = BlockIterPom;
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
// .containsValue(1) is actually a parallel reduction implemented in TNL
BlockIterD = BlockIterDevice.containsValue(1);
......@@ -340,7 +340,7 @@ solve( const MeshPointer& mesh,
// We need auxPtr to point to the memory of the original auxPtr (not to helpFunc).
// The last pass of the previous while cycle didn't calculate anything anyway, so swapping the names doesn't affect the values.
auxPtr.swap( helpFunc );
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
}
cudaDeviceSynchronize();
TNL_CHECK_CUDA_DEVICE;
......
......@@ -140,7 +140,7 @@ copySparseMatrix_impl( Matrix1& A, const Matrix2& B )
const Pointers::DevicePointer< const Matrix2 > Bpointer( B );
// set row lengths
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
rowLengths.getData(),
&Bpointer.template getData< TNL::Devices::Cuda >(),
......@@ -150,7 +150,7 @@ copySparseMatrix_impl( Matrix1& A, const Matrix2& B )
Apointer->setCompressedRowLengths( rowLengths );
// copy rows
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
&Apointer.template modifyData< TNL::Devices::Cuda >(),
&Bpointer.template getData< TNL::Devices::Cuda >(),
......
......@@ -185,7 +185,7 @@ processEntities(
auto& pool = Cuda::StreamPool::getInstance();
const cudaStream_t& s = pool.getStream( stream );
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
if( processOnlyBoundaryEntities )
{
dim3 cudaBlockSize( 2 );
......
......@@ -418,8 +418,8 @@ processEntities(
Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
auto& pool = Cuda::StreamPool::getInstance();
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
const cudaStream_t& s1 = pool.getStream( stream );
const cudaStream_t& s2 = pool.getStream( stream + 1 );
dim3 gridIdx, cudaGridSize;
......@@ -486,7 +486,7 @@ processEntities(
//std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount
// << "cudaBlockCount = " << cudaBlocksCount.x << std::endl;
dim3 gridIdx, cudaGridSize;
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
{
Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
......@@ -518,7 +518,7 @@ processEntities(
auto& pool = Cuda::StreamPool::getInstance();
const cudaStream_t& s = pool.getStream( stream );
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
dim3 gridIdx, cudaGridSize;
for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
......
......@@ -346,8 +346,8 @@ processEntities(
Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 );
auto& pool = Cuda::StreamPool::getInstance();
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
const cudaStream_t& s1 = pool.getStream( stream );
const cudaStream_t& s2 = pool.getStream( stream + 1 );
const cudaStream_t& s3 = pool.getStream( stream + 2 );
......@@ -458,7 +458,7 @@ processEntities(
auto& pool = Cuda::StreamPool::getInstance();
const cudaStream_t& s = pool.getStream( stream );
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
dim3 gridIdx, gridSize;
for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ )
for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
......
......@@ -164,7 +164,7 @@ processBoundaryEntities( const MeshPointer& meshPointer,
const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
MeshTraverserBoundaryEntitiesKernel< EntitiesDimension, EntitiesProcessor >
<<< gridSize, blockSize >>>
( &meshPointer.template getData< Devices::Cuda >(),
......@@ -195,7 +195,7 @@ processInteriorEntities( const MeshPointer& meshPointer,
const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
MeshTraverserInteriorEntitiesKernel< EntitiesDimension, EntitiesProcessor >
<<< gridSize, blockSize >>>
( &meshPointer.template getData< Devices::Cuda >(),
......@@ -226,7 +226,7 @@ processAllEntities( const MeshPointer& meshPointer,
const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
MeshTraverserAllEntitiesKernel< EntitiesDimension, EntitiesProcessor >
<<< gridSize, blockSize >>>
( &meshPointer.template getData< Devices::Cuda >(),
......
......@@ -16,6 +16,7 @@
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Pointers/SmartPointer.h>
#include <TNL/Pointers/SmartPointersRegister.h>
#include <TNL/TypeInfo.h>
#include <TNL/Cuda/MemoryHelpers.h>
......@@ -406,7 +407,7 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
~DevicePointer()
{
this->free();
Devices::Cuda::removeSmartPointer( this );
getSmartPointersRegister< DeviceType >().remove( this );
}
protected:
......@@ -426,7 +427,7 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
this->cuda_pointer = Cuda::passToDevice( *this->pointer );
// set last-sync state
this->set_last_sync_state();
Devices::Cuda::insertSmartPointer( this );
getSmartPointersRegister< DeviceType >().insert( this );
return true;
}
......
......@@ -16,6 +16,7 @@
#include <TNL/Devices/Cuda.h>
#include <TNL/Pointers/SmartPointer.h>
#include <TNL/Pointers/SmartPointersRegister.h>
#include <TNL/Cuda/MemoryHelpers.h>
#include <cstring> // std::memcpy, std::memcmp
......@@ -546,7 +547,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
~SharedPointer()
{
this->free();
Devices::Cuda::removeSmartPointer( this );
getSmartPointersRegister< DeviceType >().remove( this );
}
protected:
......@@ -577,7 +578,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
#ifdef TNL_DEBUG_SHARED_POINTERS
std::cerr << "Created shared pointer to " << getType< ObjectType >() << " (cuda_pointer = " << this->cuda_pointer << ")" << std::endl;
#endif
Devices::Cuda::insertSmartPointer( this );
getSmartPointersRegister< DeviceType >().insert( this );
return true;
}
......
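
The pattern in the two hunks above (and in UniquePointer below) is plain RAII registration: the allocation path inserts `this` into the per-device register and the destructor removes it, so synchronizeDevice() can walk the set of live pointers. A self-contained sketch of the idiom, with illustrative names only (Tracked and synchronizeAll are not TNL types):

```cpp
#include <unordered_set>

struct Tracked;
std::unordered_set< Tracked* > registry;   // stand-in for SmartPointersRegister

struct Tracked
{
   Tracked()  { registry.insert( this ); }   // mirrors getSmartPointersRegister< DeviceType >().insert( this )
   ~Tracked() { registry.erase( this ); }    // mirrors getSmartPointersRegister< DeviceType >().remove( this )
   bool synchronize() { return true; }       // a real smart pointer copies its modified host image to the device
};

bool synchronizeAll()   // mirrors SmartPointersRegister::synchronizeDevice
{
   for( Tracked* t : registry )
      if( ! t->synchronize() )
         return false;
   return true;
}

int main()
{
   Tracked a, b;                      // both instances register themselves
   return synchronizeAll() ? 0 : 1;   // visits every live instance
}                                     // destructors deregister a and b
```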
......@@ -2,7 +2,7 @@
SmartPointersRegister.h - description
-------------------
begin : Apr 29, 2016
copyright : (C) 2016 by Tomas Oberhuber
copyright : (C) 2016 by Tomas Oberhuber et al.
email : tomas.oberhuber@fjfi.cvut.cz
***************************************************************************/
......@@ -12,24 +12,43 @@
#include <unordered_set>
#include <unordered_map>
#include <TNL/Pointers/SmartPointer.h>
#include <TNL/Assert.h>
#include <TNL/Timer.h>
#include <TNL/Cuda/DeviceInfo.h>
namespace TNL {
namespace Pointers {
// Since TNL currently supports only execution on the host (which does not need
// to register and synchronize smart pointers) and on CUDA GPUs, the smart
// pointers register is implemented only for CUDA. If more execution types
// which need to register smart pointers are implemented in the future, this
// should become a class template specialization.
class SmartPointersRegister
{
public:
void insert( SmartPointer* pointer, int deviceId )
/**
* Negative deviceId means that \ref Cuda::DeviceInfo::getActiveDevice will be
* called to get the device ID.
*/
void insert( SmartPointer* pointer, int deviceId = -1 )
{
if( deviceId < 0 )
deviceId = Cuda::DeviceInfo::getActiveDevice();
pointersOnDevices[ deviceId ].insert( pointer );
}
void remove( SmartPointer* pointer, int deviceId )
/**
* Negative deviceId means that \ref Cuda::DeviceInfo::getActiveDevice will be
* called to get the device ID.
*/
void remove( SmartPointer* pointer, int deviceId = -1 )
{
if( deviceId < 0 )
deviceId = Cuda::DeviceInfo::getActiveDevice();
try {
pointersOnDevices.at( deviceId ).erase( pointer );
}
......@@ -41,8 +60,14 @@ class SmartPointersRegister
}
}
bool synchronizeDevice( int deviceId )
/**
* Negative deviceId means that \ref Cuda::DeviceInfo::getActiveDevice will be
* called to get the device ID.
*/
bool synchronizeDevice( int deviceId = -1 )
{
if( deviceId < 0 )
deviceId = Cuda::DeviceInfo::getActiveDevice();
try {
const auto & set = pointersOnDevices.at( deviceId );
for( auto&& it : set )
......@@ -61,5 +86,34 @@ class SmartPointersRegister
std::unordered_map< int, SetType > pointersOnDevices;
};
// TODO: Device -> Allocator (in all smart pointers)
template< typename Device >
SmartPointersRegister& getSmartPointersRegister()
{
static SmartPointersRegister reg;
return reg;
}
template< typename Device >
Timer& getSmartPointersSynchronizationTimer()
{
static Timer timer;
return timer;
}
/**
* Negative deviceId means that the ID of the currently active device will be
* determined automatically.
*/
template< typename Device >
bool synchronizeSmartPointersOnDevice( int deviceId = -1 )
{
getSmartPointersSynchronizationTimer< Device >().start();
bool b = getSmartPointersRegister< Device >().synchronizeDevice( deviceId );
getSmartPointersSynchronizationTimer< Device >().stop();
return b;
}
} // namespace Pointers
} // namespace TNL
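
getSmartPointersRegister and getSmartPointersSynchronizationTimer above use the construct-on-first-use idiom: the function-local static is created on first call, so each Device type gets exactly one register and one timer without static-initialization-order issues, replacing the statics previously hidden inside Devices::Cuda. A standalone illustration of the idiom (the tag types below are placeholders, not TNL's device types):

```cpp
#include <cassert>

struct HostTag {};
struct CudaTag {};

struct Registry { int insertions = 0; };

template< typename Device >
Registry& getRegistry()
{
   static Registry reg;   // one instance per Device template argument
   return reg;
}

int main()
{
   getRegistry< CudaTag >().insertions++;
   assert( getRegistry< CudaTag >().insertions == 1 );   // the same instance on every call
   assert( getRegistry< HostTag >().insertions == 0 );   // each device tag gets its own instance
}
```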
......@@ -16,6 +16,7 @@
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Pointers/SmartPointer.h>
#include <TNL/Pointers/SmartPointersRegister.h>
#include <TNL/Cuda/MemoryHelpers.h>
#include <cstring> // std::memcpy, std::memcmp
......@@ -250,7 +251,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
~UniquePointer()
{
this->free();
Devices::Cuda::removeSmartPointer( this );
getSmartPointersRegister< DeviceType >().remove( this );
}
protected:
......@@ -276,7 +277,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
this->cuda_pointer = Cuda::passToDevice( this->pd->data );
// set last-sync state
this->set_last_sync_state();
Devices::Cuda::insertSmartPointer( this );
getSmartPointersRegister< DeviceType >().insert( this );
return true;
}
......
......@@ -282,7 +282,7 @@ allocate_LU()
U->setDimensions( N, N );
// extract raw pointer
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
const CSR* kernel_A = &A.template getData< DeviceType >();
// copy row lengths
......@@ -329,7 +329,7 @@ copy_triangular_factors()
const int N = A->getRows();
// extract raw pointers
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
CSR* kernel_L = &L.template modifyData< DeviceType >();
CSR* kernel_U = &U.template modifyData< DeviceType >();
const CSR* kernel_A = &A.template getData< DeviceType >();
......
......@@ -406,7 +406,7 @@ bool SolverStarter< ConfigTag > :: writeEpilog( std::ostream& str, const Solver&
if( std::is_same< typename Solver::DeviceType, TNL::Devices::Cuda >::value )
{
logger.writeParameter< const char* >( "GPU synchronization time:", "" );
TNL::Devices::Cuda::getSmartPointersSynchronizationTimer().writeLog( logger, 1 );
Pointers::getSmartPointersSynchronizationTimer< Devices::Cuda >().writeLog( logger, 1 );
}
logger.writeParameter< const char* >( "I/O time:", "" );
this->ioTimer.writeLog( logger, 1 );
......
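
Besides writeLog, the per-device timer can also be read directly. A hedged sketch, assuming TNL's Timer exposes getRealTime() for wall-clock readings; the reporting function itself is hypothetical:

```cpp
#include <TNL/Devices/Cuda.h>
#include <TNL/Pointers/SmartPointersRegister.h>
#include <iostream>

using namespace TNL;

void reportSynchronizationTime()
{
   // accumulated over all synchronizeSmartPointersOnDevice< Devices::Cuda >() calls
   const double seconds = Pointers::getSmartPointersSynchronizationTimer< Devices::Cuda >().getRealTime();
   std::cout << "GPU smart-pointer synchronization: " << seconds << " s" << std::endl;
}
```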
......@@ -55,7 +55,7 @@ TEST( SharedPointerCudaTest, getDataTest )
ASSERT_EQ( ptr1->y(), 2 );
#else
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
TestType aux;
......@@ -89,7 +89,7 @@ TEST( SharedPointerCudaTest, getDataArrayTest )
ptr->setElement( 0, 1 );
ptr->setElement( 1, 2 );
Devices::Cuda::synchronizeDevice();
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
int *testArray_device, *testArray_host;
cudaMalloc( ( void** ) &testArray_device, 2 * sizeof( int ) );
......