// Combined diff-viewer capture of five file hunks: src/core/mfuncs.h, src/core/tnlCuda.h,
// src/core/tnlHost.h, src/implementation/core/tnlCuda_impl.h and
// src/implementation/matrices/tnlDenseMatrix_impl.h.
// "Loading", "@@ ... @@" and "Original line number Diff line number Diff line" are viewer
// markers, not code; definitions next to a "Loading" marker are truncated in this view.
// Code visible here: roundUpDivision() and roundToMultiple(); a DeviceType enum constant in
// tnlCuda and tnlHost; declarations and definitions of two tnlCuda::passFromDevice overloads
// (cudaMemcpy with cudaMemcpyDeviceToHost followed by checkCudaDevice); and a fragment of
// tnlDenseMatrix::vectorProduct containing both an inline CUDA kernel-launch block and a call
// to tnlMatrixVectorProductCuda( *this, inVector, outVector ).
// NOTE(review): the viewer dropped the +/- markers, so which lines are old and which are new
// can only be guessed from the per-file "+N -M" counts - confirm against the real diff.
Loading src/core/mfuncs.h +10 −0 Original line number Diff line number Diff line Loading @@ -79,6 +79,16 @@ inline int roundUpDivision( const int num, const int div ) { return num / div + ( num % div != 0 ); } #ifdef HAVE_CUDA __device__ __host__ #endif inline int roundToMultiple( int number, int multiple ) { return multiple*( number/ multiple + ( number % multiple != 0 ) ); } /*template< typename T > void swap( T& a, T& b) { Loading src/core/tnlCuda.h +9 −0 Original line number Diff line number Diff line Loading @@ -28,6 +28,8 @@ class tnlCuda { public: enum { DeviceType = tnlCudaDevice }; static tnlString getDeviceType(); #ifdef HAVE_CUDA Loading Loading @@ -63,6 +65,13 @@ static inline int getWarpSize(); template< typename ObjectType > static ObjectType* passToDevice( const ObjectType& object ); template< typename ObjectType > static ObjectType passFromDevice( const ObjectType& object ); template< typename ObjectType > static void passFromDevice( const ObjectType& deviceObject, ObjectType& hostObject ); template< typename ObjectType > static void freeFromDevice( ObjectType* object ); Loading src/core/tnlHost.h +2 −0 Original line number Diff line number Diff line Loading @@ -26,6 +26,8 @@ class tnlHost { public: enum { DeviceType = tnlHostDevice }; static tnlString getDeviceType(); #ifdef HAVE_CUDA Loading src/implementation/core/tnlCuda_impl.h +23 −1 Original line number Diff line number Diff line Loading @@ -64,7 +64,6 @@ inline int tnlCuda::getNumberOfSharedMemoryBanks() return 32; } template< typename ObjectType > ObjectType* tnlCuda::passToDevice( const ObjectType& object ) { Loading @@ -87,6 +86,29 @@ ObjectType* tnlCuda::passToDevice( const ObjectType& object ) return deviceObject; } template< typename ObjectType > ObjectType tnlCuda::passFromDevice( const ObjectType& object ) { ObjectType aux; cudaMemcpy( ( void* ) &aux, ( void* ) &object, sizeof( ObjectType ), cudaMemcpyDeviceToHost ); checkCudaDevice; return aux; } template< typename ObjectType > void 
tnlCuda::passFromDevice( const ObjectType& deviceObject, ObjectType& hostObject ) { cudaMemcpy( ( void* ) &hostObject, ( void* ) &deviceObject, sizeof( ObjectType ), cudaMemcpyDeviceToHost ); checkCudaDevice; } template< typename ObjectType > void tnlCuda::freeFromDevice( ObjectType* deviceObject ) { Loading src/implementation/matrices/tnlDenseMatrix_impl.h +3 −23 Original line number Diff line number Diff line Loading @@ -352,7 +352,7 @@ typename Vector::RealType tnlDenseMatrix< Real, Device, Index >::rowVectorProduc return sum; } #ifdef HAVE_CUDA /*#ifdef HAVE_CUDA template< typename Real, typename Index, typename Vector > Loading @@ -365,7 +365,7 @@ __global__ void tnlDenseMatrixVectorProductCudaKernel( tnlDenseMatrix< Real, tnl if( rowIdx < matrix->getRows() ) ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); } #endif #endif*/ template< typename Real, typename Device, Loading @@ -389,27 +389,7 @@ void tnlDenseMatrix< Real, Device, Index >::vectorProduct( const Vector& inVecto for( IndexType row = 0; row < this->getRows(); row++ ) outVector[ row ] = rowVectorProduct( row, inVector ); if( Device::getDevice() == tnlCudaDevice ) { #ifdef HAVE_CUDA ThisType* kernel_this = tnlCuda::passToDevice( *this ); Vector* kernel_inVector = tnlCuda::passToDevice( inVector ); Vector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlDenseMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); } tnlCuda::freeFromDevice( kernel_this ); tnlCuda::freeFromDevice( kernel_inVector ); 
tnlCuda::freeFromDevice( kernel_outVector ); checkCudaDevice; #endif } tnlMatrixVectorProductCuda( *this, inVector, outVector ); } template< typename Real, Loading Loading
// Diff hunk: src/core/mfuncs.h +10 −0 (@@ -79,6 +79,16 @@).

// Integer division rounded towards positive infinity, i.e. ceil( num / div ).
//
// num - dividend
// div - divisor; must be non-zero (division by zero is not guarded here)
//
// Returns the same value as num / div + ( num % div != 0 ) whenever the exact quotient is
// non-negative. When the exact quotient is negative, the truncated quotient is returned
// unchanged, because it already is the ceiling.
inline int roundUpDivision( const int num, const int div )
{
   const int quotient = num / div;
   const int remainder = num % div;
   // C++ integer division truncates towards zero. Something was cut off *below* the exact
   // quotient only when the remainder is non-zero and carries the sign of the divisor;
   // only then does the result have to be bumped by one.
   const bool truncatedDownwards = remainder != 0 && ( ( remainder < 0 ) == ( div < 0 ) );
   return quotient + ( truncatedDownwards ? 1 : 0 );
}

// Rounds number up to a multiple of multiple.
//
// number   - value to round
// multiple - step; must be non-zero. For a positive step the result is the smallest
//            multiple that is not less than number.
//
// Non-negative inputs give the same result as
// multiple * ( number / multiple + ( number % multiple != 0 ) ); negative numbers are no
// longer pushed one step too far (e.g. -5 with step 4 yields -4, not 0).
#ifdef HAVE_CUDA
__device__ __host__
#endif
inline int roundToMultiple( int number, int multiple )
{
   const int remainder = number % multiple;
   // Same sign rule as in roundUpDivision; kept local so this helper does not depend on
   // the qualifiers of any other function when compiled for the device.
   const bool truncatedDownwards = remainder != 0 && ( ( remainder < 0 ) == ( multiple < 0 ) );
   return multiple * ( number / multiple + ( truncatedDownwards ? 1 : 0 ) );
}

// The hunk ends with the opening of a commented-out template swap( T& a, T& b ) whose
// remainder is not visible in this view.
// Diff hunk for src/core/tnlCuda.h (+9 −0). "Loading" and "@@ ... @@" are diff-viewer
// markers; the body of class tnlCuda is only partially visible here.
// Members visible in this view: an anonymous enum defining DeviceType = tnlCudaDevice,
// static getDeviceType() returning tnlString, static inline getWarpSize(), and the static
// member templates passToDevice (returns ObjectType*), passFromDevice in two forms (one
// returning an ObjectType by value, one writing into a caller-supplied hostObject reference)
// and freeFromDevice (takes ObjectType*).
src/core/tnlCuda.h +9 −0 Original line number Diff line number Diff line Loading @@ -28,6 +28,8 @@ class tnlCuda { public: enum { DeviceType = tnlCudaDevice }; static tnlString getDeviceType(); #ifdef HAVE_CUDA Loading Loading @@ -63,6 +65,13 @@ static inline int getWarpSize(); template< typename ObjectType > static ObjectType* passToDevice( const ObjectType& object ); template< typename ObjectType > static ObjectType passFromDevice( const ObjectType& object ); template< typename ObjectType > static void passFromDevice( const ObjectType& deviceObject, ObjectType& hostObject ); template< typename ObjectType > static void freeFromDevice( ObjectType* object ); Loading
// Diff hunk for src/core/tnlHost.h (+2 −0). "Loading" and "@@ ... @@" are diff-viewer
// markers; the body of class tnlHost is truncated after the #ifdef HAVE_CUDA line.
// Members visible in this view: an anonymous enum defining DeviceType = tnlHostDevice and
// static getDeviceType() returning tnlString.
src/core/tnlHost.h +2 −0 Original line number Diff line number Diff line Loading @@ -26,6 +26,8 @@ class tnlHost { public: enum { DeviceType = tnlHostDevice }; static tnlString getDeviceType(); #ifdef HAVE_CUDA Loading
// Diff hunk for src/implementation/core/tnlCuda_impl.h (+23 −1). "Loading" and "@@ ... @@"
// are diff-viewer markers; the bodies of passToDevice and freeFromDevice are truncated here.
// tnlCuda::getNumberOfSharedMemoryBanks(): the visible tail returns the constant 32.
// tnlCuda::passFromDevice( object ): default-constructs a local ObjectType, copies
// sizeof( ObjectType ) bytes from the address of object into it using cudaMemcpy with
// cudaMemcpyDeviceToHost, invokes checkCudaDevice and returns the local by value.
// tnlCuda::passFromDevice( deviceObject, hostObject ): performs the same copy directly into
// hostObject, then invokes checkCudaDevice.
// NOTE(review): both overloads copy the raw object representation; presumably ObjectType is
// expected to be safe to copy byte-wise and (for the first overload) default-constructible -
// confirm with the callers.
// NOTE(review): the C-style ( void* ) cast on the source argument also discards the const of
// the reference parameter.
// NOTE(review): no HAVE_CUDA guard is visible around the cudaMemcpy calls in this hunk -
// confirm whether builds without CUDA include this header.
src/implementation/core/tnlCuda_impl.h +23 −1 Original line number Diff line number Diff line Loading @@ -64,7 +64,6 @@ inline int tnlCuda::getNumberOfSharedMemoryBanks() return 32; } template< typename ObjectType > ObjectType* tnlCuda::passToDevice( const ObjectType& object ) { Loading @@ -87,6 +86,29 @@ ObjectType* tnlCuda::passToDevice( const ObjectType& object ) return deviceObject; } template< typename ObjectType > ObjectType tnlCuda::passFromDevice( const ObjectType& object ) { ObjectType aux; cudaMemcpy( ( void* ) &aux, ( void* ) &object, sizeof( ObjectType ), cudaMemcpyDeviceToHost ); checkCudaDevice; return aux; } template< typename ObjectType > void tnlCuda::passFromDevice( const ObjectType& deviceObject, ObjectType& hostObject ) { cudaMemcpy( ( void* ) &hostObject, ( void* ) &deviceObject, sizeof( ObjectType ), cudaMemcpyDeviceToHost ); checkCudaDevice; } template< typename ObjectType > void tnlCuda::freeFromDevice( ObjectType* deviceObject ) { Loading
// Diff hunk for src/implementation/matrices/tnlDenseMatrix_impl.h (+3 −23). "Loading" and
// "@@ ... @@" are diff-viewer markers. The viewer shows old and new lines next to each other
// without +/- markers, which is why "#ifdef HAVE_CUDA" appears both plain and behind a
// comment opener, and "#endif" both plain and followed by a comment closer.
// NOTE(review): judging by the +3 −23 counts, the change presumably comments out
// tnlDenseMatrixVectorProductCudaKernel and replaces the inline CUDA launch block of
// tnlDenseMatrix::vectorProduct with the single call
// tnlMatrixVectorProductCuda( *this, inVector, outVector ) - confirm against the real diff.
// Kernel fragment: for rowIdx < matrix->getRows() it stores
// matrix->rowVectorProduct( rowIdx, *inVector ) into ( *outVector )[ rowIdx ].
// Host loop fragment: assigns rowVectorProduct( row, inVector ) to outVector[ row ] for every
// row in [ 0, getRows() ).
// Launch block fragment: obtains device copies of *this, inVector and outVector through
// tnlCuda::passToDevice, uses a block size of 256, computes
// cudaBlocks = roundUpDivision( getRows(), 256 ) and
// cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ), launches the kernel
// once per gridIdx (setting cudaGridSize.x to cudaBlocks % getMaxGridSize() for the last
// one), then releases the three device copies with tnlCuda::freeFromDevice and invokes
// checkCudaDevice.
// NOTE(review): when cudaBlocks is an exact multiple of getMaxGridSize(), that modulo is 0,
// so the last launch would be given a grid size of 0 - confirm.
src/implementation/matrices/tnlDenseMatrix_impl.h +3 −23 Original line number Diff line number Diff line Loading @@ -352,7 +352,7 @@ typename Vector::RealType tnlDenseMatrix< Real, Device, Index >::rowVectorProduc return sum; } #ifdef HAVE_CUDA /*#ifdef HAVE_CUDA template< typename Real, typename Index, typename Vector > Loading @@ -365,7 +365,7 @@ __global__ void tnlDenseMatrixVectorProductCudaKernel( tnlDenseMatrix< Real, tnl if( rowIdx < matrix->getRows() ) ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); } #endif #endif*/ template< typename Real, typename Device, Loading @@ -389,27 +389,7 @@ void tnlDenseMatrix< Real, Device, Index >::vectorProduct( const Vector& inVecto for( IndexType row = 0; row < this->getRows(); row++ ) outVector[ row ] = rowVectorProduct( row, inVector ); if( Device::getDevice() == tnlCudaDevice ) { #ifdef HAVE_CUDA ThisType* kernel_this = tnlCuda::passToDevice( *this ); Vector* kernel_inVector = tnlCuda::passToDevice( inVector ); Vector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlDenseMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); } tnlCuda::freeFromDevice( kernel_this ); tnlCuda::freeFromDevice( kernel_inVector ); tnlCuda::freeFromDevice( kernel_outVector ); checkCudaDevice; #endif } tnlMatrixVectorProductCuda( *this, inVector, outVector ); } template< typename Real, Loading