Loading src/core/tnlCuda.h +5 −1 Original line number Diff line number Diff line Loading @@ -52,6 +52,10 @@ class tnlCuda #endif static inline int getWarpSize(); template< typename Index > #ifdef HAVE_CUDA __device__ static Index getGlobalThreadIdx( const Index gridIdx = 0 ); #endif #ifdef HAVE_CUDA __host__ __device__ Loading src/implementation/core/cuda/cuda-reduction_impl.h +2 −1 Original line number Diff line number Diff line Loading @@ -55,6 +55,7 @@ __device__ void reduceAligned( const Operation& operation, } } /*** * For each thread in block with thread ID smaller then s this function reduces * data elements with indices tid and tid + s. This is a modified version of Loading Loading @@ -143,6 +144,7 @@ __global__ void tnlCUDAReductionKernel( const Operation operation, sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, sdata, deviceInput, deviceInput2 ); __syncthreads(); unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. x; /*** * Perform the parallel reduction. Loading @@ -153,7 +155,6 @@ __global__ void tnlCUDAReductionKernel( const Operation operation, * We also separate the case when the blockDim. x is power of 2 and the algorithm * can be written in more efficient way without some conditions. */ unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. x; if( n == 128 || n == 64 || n == 32 || n == 16 || n == 8 || n == 4 || n == 2 || n == 256 || n == 512 ) Loading src/implementation/core/tnlCuda_impl.h +9 −0 Original line number Diff line number Diff line Loading @@ -55,6 +55,15 @@ inline int tnlCuda::getWarpSize() return 32; } #ifdef HAVE_CUDA template< typename Index > __device__ Index tnlCuda::getGlobalThreadIdx( const Index gridIdx ) { return ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; } #endif #ifdef HAVE_CUDA __host__ __device__ #endif Loading src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h +94 −4 Original line number Diff line number Diff line Loading @@ -18,11 +18,14 @@ #ifndef TNLCHUNKEDELLPACKMATRIX_IMPL_H_ #define TNLCHUNKEDELLPACKMATRIX_IMPL_H_ #include <matrices/tnlChunkedEllpackMatrix.h> #include <core/vectors/tnlVector.h> #include <core/mfuncs.h> #ifdef HAVE_CUDA #include <cuda.h> #endif template< typename Real, typename Device, typename Index > Loading Loading @@ -179,7 +182,6 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setSlice( const RowLengthsV cerr << " maxChunkInSlice = " << maxChunkInSlice << endl ); #endif /**** * Set-up the slice info. */ Loading Loading @@ -221,8 +223,41 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setRowLengths( const RowLen return false; this->rowPointers.setElement( 0, 0 ); if( DeviceType::DeviceType == tnlHostDevice ) { for( IndexType sliceIndex = 0; sliceIndex < numberOfSlices; sliceIndex++ ) this->setSlice( rowLengths, sliceIndex, elementsToAllocation ); } if( DeviceType::DeviceType == tnlCudaDevice ) { #ifdef HAVE_CUDA typedef tnlChunkedEllpackMatrix< Real, Device, Index > Matrix; Matrix* kernel_matrix = tnlCuda::passToDevice( *this ); RowLengthsVector* kernel_rowLengths = tnlCuda::passToDevice( rowLengths ); Index* kernel_elementsToAllocation = tnlCuda::passToDevice( elementsToAllocation ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); const Index cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlChunkedEllpackMatrix_setSlices_CudaKernel< Real, Index, 256 > <<< cudaGridSize, cudaBlockSize, cudaBlockSize.x * sizeof( Index ) >>> ( kernel_matrix, kernel_rowLengths, numberOfSlices, kernel_elementsToAllocation, gridIdx ); } elementsToAllocation = tnlCuda::passFromDevice( *kernel_elementsToAllocation ); tnlCuda::freeFromDevice( kernel_matrix ); tnlCuda::freeFromDevice( kernel_rowLengths ); tnlCuda::freeFromDevice( kernel_elementsToAllocation ); checkCudaDevice; #endif } this->rowPointers.computePrefixSum(); Loading Loading @@ -874,4 +909,59 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlCuda > }; #ifdef HAVE_CUDA template< typename Real, typename Index, int blockSize > __global__ void tnlChunkedEllpackMatrix_setSlices_CudaKernel( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix, const typename tnlChunkedEllpackMatrix< Real, tnlCuda, Index >::RowLengthsVector* rowLengths, const Index numberOfSlices, Index* elementsToAllocation, const Index gridIdx ) { Index* threadElementsToAllocation = getSharedMemory< Index >(); const Index sliceIdx = tnlCuda::getGlobalThreadIdx< Index >( gridIdx ); if( sliceIdx < numberOfSlices ) matrix->setSlice( *rowLengths, sliceIdx, threadElementsToAllocation[ threadIdx.x ] ); else threadElementsToAllocation[ threadIdx.x ] = 0; /**** * Reduce elements to allocation from each thread to whole block */ if( blockSize >= 512 ) { if( threadIdx.x < 256 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 256 ]; __syncthreads(); } if( blockSize >= 256 ) { if( threadIdx.x < 128 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 128 ]; __syncthreads(); } if( blockSize >= 128 ) { if( threadIdx.x < 64 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 64 ]; __syncthreads(); } /*** * This runs in one warp so it is synchronised implicitly. */ if ( threadIdx.x < 32) { if( blockSize >= 64 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 32 ]; if( blockSize >= 32 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 16 ]; if( blockSize >= 16 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 8 ]; if( blockSize >= 8 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 4 ]; if( blockSize >= 4 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 2 ]; if( blockSize >= 2 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 1 ]; } atomicAdd( elementsToAllocation, threadElementsToAllocation[ 0 ] ); } #endif #endif /* TNLCHUNKEDELLPACKMATRIX_IMPL_H_ */ src/matrices/tnlChunkedEllpackMatrix.h +22 −0 Original line number Diff line number Diff line Loading @@ -25,6 +25,22 @@ template< typename Device > class tnlChunkedEllpackMatrixDeviceDependentCode; template< typename Real, typename Device = tnlHost, typename Index = int > class tnlChunkedEllpackMatrix; #ifdef HAVE_CUDA template< typename Real, typename Index, int blockSize > __global__ void tnlChunkedEllpackMatrix_setSlices_CudaKernel( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix, const typename tnlChunkedEllpackMatrix< Real, tnlCuda, Index >::RowLengthsVector* rowLengths, const Index numberOfSlices, Index* elementsToAllocation, const Index gridIdx ); #endif template< typename Real, typename Device, typename Index > class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > { public: Loading Loading @@ -209,7 +225,13 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > typedef tnlChunkedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode; friend class tnlChunkedEllpackMatrixDeviceDependentCode< DeviceType >; #ifdef HAVE_CUDA friend void tnlChunkedEllpackMatrix_setSlices_CudaKernel< Real, Index, 256 >( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix, const RowLengthsVector* rowLengths, const Index numberOfSlices, Index* elementsToAllocation, const Index gridIdx ); #endif Loading Loading
src/core/tnlCuda.h +5 −1 Original line number Diff line number Diff line Loading @@ -52,6 +52,10 @@ class tnlCuda #endif static inline int getWarpSize(); template< typename Index > #ifdef HAVE_CUDA __device__ static Index getGlobalThreadIdx( const Index gridIdx = 0 ); #endif #ifdef HAVE_CUDA __host__ __device__ Loading
src/implementation/core/cuda/cuda-reduction_impl.h +2 −1 Original line number Diff line number Diff line Loading @@ -55,6 +55,7 @@ __device__ void reduceAligned( const Operation& operation, } } /*** * For each thread in block with thread ID smaller then s this function reduces * data elements with indices tid and tid + s. This is a modified version of Loading Loading @@ -143,6 +144,7 @@ __global__ void tnlCUDAReductionKernel( const Operation operation, sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, sdata, deviceInput, deviceInput2 ); __syncthreads(); unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. x; /*** * Perform the parallel reduction. Loading @@ -153,7 +155,6 @@ __global__ void tnlCUDAReductionKernel( const Operation operation, * We also separate the case when the blockDim. x is power of 2 and the algorithm * can be written in more efficient way without some conditions. */ unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. x; if( n == 128 || n == 64 || n == 32 || n == 16 || n == 8 || n == 4 || n == 2 || n == 256 || n == 512 ) Loading
src/implementation/core/tnlCuda_impl.h +9 −0 Original line number Diff line number Diff line Loading @@ -55,6 +55,15 @@ inline int tnlCuda::getWarpSize() return 32; } #ifdef HAVE_CUDA template< typename Index > __device__ Index tnlCuda::getGlobalThreadIdx( const Index gridIdx ) { return ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; } #endif #ifdef HAVE_CUDA __host__ __device__ #endif Loading
src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h +94 −4 Original line number Diff line number Diff line Loading @@ -18,11 +18,14 @@ #ifndef TNLCHUNKEDELLPACKMATRIX_IMPL_H_ #define TNLCHUNKEDELLPACKMATRIX_IMPL_H_ #include <matrices/tnlChunkedEllpackMatrix.h> #include <core/vectors/tnlVector.h> #include <core/mfuncs.h> #ifdef HAVE_CUDA #include <cuda.h> #endif template< typename Real, typename Device, typename Index > Loading Loading @@ -179,7 +182,6 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setSlice( const RowLengthsV cerr << " maxChunkInSlice = " << maxChunkInSlice << endl ); #endif /**** * Set-up the slice info. */ Loading Loading @@ -221,8 +223,41 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setRowLengths( const RowLen return false; this->rowPointers.setElement( 0, 0 ); if( DeviceType::DeviceType == tnlHostDevice ) { for( IndexType sliceIndex = 0; sliceIndex < numberOfSlices; sliceIndex++ ) this->setSlice( rowLengths, sliceIndex, elementsToAllocation ); } if( DeviceType::DeviceType == tnlCudaDevice ) { #ifdef HAVE_CUDA typedef tnlChunkedEllpackMatrix< Real, Device, Index > Matrix; Matrix* kernel_matrix = tnlCuda::passToDevice( *this ); RowLengthsVector* kernel_rowLengths = tnlCuda::passToDevice( rowLengths ); Index* kernel_elementsToAllocation = tnlCuda::passToDevice( elementsToAllocation ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); const Index cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlChunkedEllpackMatrix_setSlices_CudaKernel< Real, Index, 256 > <<< cudaGridSize, cudaBlockSize, cudaBlockSize.x * sizeof( Index ) >>> ( kernel_matrix, kernel_rowLengths, numberOfSlices, kernel_elementsToAllocation, gridIdx ); } elementsToAllocation = tnlCuda::passFromDevice( *kernel_elementsToAllocation ); tnlCuda::freeFromDevice( kernel_matrix ); tnlCuda::freeFromDevice( kernel_rowLengths ); tnlCuda::freeFromDevice( kernel_elementsToAllocation ); checkCudaDevice; #endif } this->rowPointers.computePrefixSum(); Loading Loading @@ -874,4 +909,59 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlCuda > }; #ifdef HAVE_CUDA template< typename Real, typename Index, int blockSize > __global__ void tnlChunkedEllpackMatrix_setSlices_CudaKernel( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix, const typename tnlChunkedEllpackMatrix< Real, tnlCuda, Index >::RowLengthsVector* rowLengths, const Index numberOfSlices, Index* elementsToAllocation, const Index gridIdx ) { Index* threadElementsToAllocation = getSharedMemory< Index >(); const Index sliceIdx = tnlCuda::getGlobalThreadIdx< Index >( gridIdx ); if( sliceIdx < numberOfSlices ) matrix->setSlice( *rowLengths, sliceIdx, threadElementsToAllocation[ threadIdx.x ] ); else threadElementsToAllocation[ threadIdx.x ] = 0; /**** * Reduce elements to allocation from each thread to whole block */ if( blockSize >= 512 ) { if( threadIdx.x < 256 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 256 ]; __syncthreads(); } if( blockSize >= 256 ) { if( threadIdx.x < 128 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 128 ]; __syncthreads(); } if( blockSize >= 128 ) { if( threadIdx.x < 64 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 64 ]; __syncthreads(); } /*** * This runs in one warp so it is synchronised implicitly. */ if ( threadIdx.x < 32) { if( blockSize >= 64 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 32 ]; if( blockSize >= 32 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 16 ]; if( blockSize >= 16 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 8 ]; if( blockSize >= 8 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 4 ]; if( blockSize >= 4 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 2 ]; if( blockSize >= 2 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 1 ]; } atomicAdd( elementsToAllocation, threadElementsToAllocation[ 0 ] ); } #endif #endif /* TNLCHUNKEDELLPACKMATRIX_IMPL_H_ */
src/matrices/tnlChunkedEllpackMatrix.h +22 −0 Original line number Diff line number Diff line Loading @@ -25,6 +25,22 @@ template< typename Device > class tnlChunkedEllpackMatrixDeviceDependentCode; template< typename Real, typename Device = tnlHost, typename Index = int > class tnlChunkedEllpackMatrix; #ifdef HAVE_CUDA template< typename Real, typename Index, int blockSize > __global__ void tnlChunkedEllpackMatrix_setSlices_CudaKernel( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix, const typename tnlChunkedEllpackMatrix< Real, tnlCuda, Index >::RowLengthsVector* rowLengths, const Index numberOfSlices, Index* elementsToAllocation, const Index gridIdx ); #endif template< typename Real, typename Device, typename Index > class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > { public: Loading Loading @@ -209,7 +225,13 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > typedef tnlChunkedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode; friend class tnlChunkedEllpackMatrixDeviceDependentCode< DeviceType >; #ifdef HAVE_CUDA friend void tnlChunkedEllpackMatrix_setSlices_CudaKernel< Real, Index, 256 >( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix, const RowLengthsVector* rowLengths, const Index numberOfSlices, Index* elementsToAllocation, const Index gridIdx ); #endif Loading