Implementing the Chunked Ellpack matrix in CUDA. (331c083a) · Commits · TNL / tnl-dev

src/core/tnlCuda.h

+5 −1

Original line number	Diff line number	Diff line
		@@ -52,6 +52,10 @@ class tnlCuda
		#endif
		static inline int getWarpSize();

		template< typename Index >
		#ifdef HAVE_CUDA
		__device__ static Index getGlobalThreadIdx( const Index gridIdx = 0 );
		#endif

		#ifdef HAVE_CUDA
		__host__ __device__

src/implementation/core/cuda/cuda-reduction_impl.h

+2 −1

Original line number	Diff line number	Diff line
		@@ -55,6 +55,7 @@ __device__ void reduceAligned( const Operation& operation,
		}
		}


		/***
		* For each thread in block with thread ID smaller then s this function reduces
		* data elements with indices tid and tid + s. This is a modified version of
		@@ -143,6 +144,7 @@ __global__ void tnlCUDAReductionKernel( const Operation operation,
		sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, sdata, deviceInput, deviceInput2 );
		__syncthreads();

		unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. x;

		/***
		* Perform the parallel reduction.
		@@ -153,7 +155,6 @@ __global__ void tnlCUDAReductionKernel( const Operation operation,
		* We also separate the case when the blockDim. x is power of 2 and the algorithm
		* can be written in more efficient way without some conditions.
		*/
		unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. x;
		if( n == 128 \|\| n == 64 \|\| n == 32 \|\| n == 16 \|\|
		n == 8 \|\| n == 4 \|\| n == 2 \|\| n == 256 \|\|
		n == 512 )

src/implementation/core/tnlCuda_impl.h

+9 −0

Original line number	Diff line number	Diff line
		@@ -55,6 +55,15 @@ inline int tnlCuda::getWarpSize()
		return 32;
		}

		#ifdef HAVE_CUDA
		template< typename Index >
		__device__ Index tnlCuda::getGlobalThreadIdx( const Index gridIdx )
		{
		return ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		}
		#endif


		#ifdef HAVE_CUDA
		__host__ __device__
		#endif

src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h

+94 −4

Original line number	Diff line number	Diff line
		@@ -18,11 +18,14 @@
		#ifndef TNLCHUNKEDELLPACKMATRIX_IMPL_H_
		#define TNLCHUNKEDELLPACKMATRIX_IMPL_H_


		#include <matrices/tnlChunkedEllpackMatrix.h>
		#include <core/vectors/tnlVector.h>
		#include <core/mfuncs.h>

		#ifdef HAVE_CUDA
		#include <cuda.h>
		#endif

		template< typename Real,
		typename Device,
		typename Index >
		@@ -179,7 +182,6 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setSlice( const RowLengthsV
		cerr << " maxChunkInSlice = " << maxChunkInSlice << endl );
		#endif


		/****
		* Set-up the slice info.
		*/
		@@ -221,8 +223,41 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setRowLengths( const RowLen
		return false;

		this->rowPointers.setElement( 0, 0 );
		if( DeviceType::DeviceType == tnlHostDevice )
		{
		for( IndexType sliceIndex = 0; sliceIndex < numberOfSlices; sliceIndex++ )
		this->setSlice( rowLengths, sliceIndex, elementsToAllocation );
		}
		if( DeviceType::DeviceType == tnlCudaDevice )
		{
		#ifdef HAVE_CUDA
		typedef tnlChunkedEllpackMatrix< Real, Device, Index > Matrix;
		Matrix* kernel_matrix = tnlCuda::passToDevice( *this );
		RowLengthsVector* kernel_rowLengths = tnlCuda::passToDevice( rowLengths );
		Index* kernel_elementsToAllocation = tnlCuda::passToDevice( elementsToAllocation );
		dim3 cudaBlockSize( 256 ),
		cudaGridSize( tnlCuda::getMaxGridSize() );
		const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
		const Index cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		if( gridIdx == cudaGrids - 1 )
		cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
		tnlChunkedEllpackMatrix_setSlices_CudaKernel< Real, Index, 256 >
		<<< cudaGridSize, cudaBlockSize, cudaBlockSize.x * sizeof( Index ) >>>
		( kernel_matrix,
		kernel_rowLengths,
		numberOfSlices,
		kernel_elementsToAllocation,
		gridIdx );
		}
		elementsToAllocation = tnlCuda::passFromDevice( *kernel_elementsToAllocation );
		tnlCuda::freeFromDevice( kernel_matrix );
		tnlCuda::freeFromDevice( kernel_rowLengths );
		tnlCuda::freeFromDevice( kernel_elementsToAllocation );
		checkCudaDevice;
		#endif
		}

		this->rowPointers.computePrefixSum();

		@@ -874,4 +909,59 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlCuda >

		};

		#ifdef HAVE_CUDA
		template< typename Real,
		typename Index,
		int blockSize >
		__global__ void tnlChunkedEllpackMatrix_setSlices_CudaKernel( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
		const typename tnlChunkedEllpackMatrix< Real, tnlCuda, Index >::RowLengthsVector* rowLengths,
		const Index numberOfSlices,
		Index* elementsToAllocation,
		const Index gridIdx )
		{
		Index* threadElementsToAllocation = getSharedMemory< Index >();
		const Index sliceIdx = tnlCuda::getGlobalThreadIdx< Index >( gridIdx );
		if( sliceIdx < numberOfSlices )
		matrix->setSlice( *rowLengths,
		sliceIdx,
		threadElementsToAllocation[ threadIdx.x ] );
		else
		threadElementsToAllocation[ threadIdx.x ] = 0;

		/****
		* Reduce elements to allocation from each thread to whole block
		*/
		if( blockSize >= 512 )
		{
		if( threadIdx.x < 256 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 256 ];
		__syncthreads();
		}
		if( blockSize >= 256 )
		{
		if( threadIdx.x < 128 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 128 ];
		__syncthreads();
		}
		if( blockSize >= 128 )
		{
		if( threadIdx.x < 64 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 64 ];
		__syncthreads();
		}

		/***
		* This runs in one warp so it is synchronised implicitly.
		*/
		if ( threadIdx.x < 32)
		{
		if( blockSize >= 64 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 32 ];
		if( blockSize >= 32 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 16 ];
		if( blockSize >= 16 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 8 ];
		if( blockSize >= 8 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 4 ];
		if( blockSize >= 4 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 2 ];
		if( blockSize >= 2 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 1 ];
		}
		atomicAdd( elementsToAllocation, threadElementsToAllocation[ 0 ] );
		}

		#endif

		#endif /* TNLCHUNKEDELLPACKMATRIX_IMPL_H_ */

src/matrices/tnlChunkedEllpackMatrix.h

+22 −0

Original line number	Diff line number	Diff line
		@@ -25,6 +25,22 @@ template< typename Device >
		class tnlChunkedEllpackMatrixDeviceDependentCode;

		template< typename Real, typename Device = tnlHost, typename Index = int >
		class tnlChunkedEllpackMatrix;

		#ifdef HAVE_CUDA
		template< typename Real,
		typename Index,
		int blockSize >
		__global__ void tnlChunkedEllpackMatrix_setSlices_CudaKernel( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
		const typename tnlChunkedEllpackMatrix< Real, tnlCuda, Index >::RowLengthsVector* rowLengths,
		const Index numberOfSlices,
		Index* elementsToAllocation,
		const Index gridIdx );
		#endif



		template< typename Real, typename Device, typename Index >
		class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
		{
		public:
		@@ -209,7 +225,13 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >

		typedef tnlChunkedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode;
		friend class tnlChunkedEllpackMatrixDeviceDependentCode< DeviceType >;

		#ifdef HAVE_CUDA
		friend void tnlChunkedEllpackMatrix_setSlices_CudaKernel< Real, Index, 256 >( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
		const RowLengthsVector* rowLengths,
		const Index numberOfSlices,
		Index* elementsToAllocation,
		const Index gridIdx );
		#endif