Added CWYGMRES solver (GMRES using Hauseholder orthogonalization and the... (caae8ef4) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/CMakeLists.txt

+5 −1

Original line number	Diff line number	Diff line
		@@ -6,7 +6,11 @@ set( headers cuda-prefix-sum.h
		CudaReduction.h
		CudaReduction_impl.h
		CudaReductionBuffer.h
		CublasWrapper.h )
		CublasWrapper.h
		CudaMultireductionKernel.h
		Multireduction.h
		Multireduction_impl.h
		)

		SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL/Containers/Algorithms )
		IF( BUILD_CUDA )

src/TNL/Containers/Algorithms/CudaMultireductionKernel.h

0 → 100644

+155 −0

Original line number	Diff line number	Diff line
		#pragma once

		namespace TNL {
		namespace Containers {
		namespace Algorithms {

		#ifdef HAVE_CUDA
		/****
		* The performance of this kernel is very sensitive to register usage.
		* Compile with --ptxas-options=-v and configure these constants for given
		* architecture so that there are no local memory spills.
		*/
		static constexpr int Multireduction_maxThreadsPerBlock = 256; // must be a power of 2
		#if (__CUDA_ARCH__ >= 300 )
		static constexpr int Multireduction_minBlocksPerMultiprocessor = 6;
		#else
		static constexpr int Multireduction_minBlocksPerMultiprocessor = 4;
		#endif

		template< typename Operation, int blockSizeX >
		__global__ void
		__launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor )
		CudaMultireductionKernel( Operation& operation,
		const typename Operation::IndexType n,
		const typename Operation::IndexType size,
		const typename Operation::RealType* input1,
		const typename Operation::IndexType ldInput1,
		const typename Operation::RealType* input2,
		typename Operation::ResultType* output )
		{
		typedef typename Operation::IndexType IndexType;
		typedef typename Operation::ResultType ResultType;

		extern __shared__ __align__ ( 8 ) char __sdata[];

		ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );

		/***
		* Get thread id (tid) and global element id (gid).
		* gridSizeX is the number of elements in the direction of x-axis
		* processed by all blocks at the same time.
		*/
		const IndexType tid = threadIdx.y * blockDim.x + threadIdx.x;
		IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;
		const IndexType gridSizeX = blockDim.x * gridDim.x;

		/***
		* Shift input1 and output pointers.
		*/
		const IndexType y = blockIdx.y * blockDim.y + threadIdx.y;
		if( y < n ) {
		input1 += y * ldInput1;
		output += y * gridDim.x;
		}
		else
		return;

		/***
		* Start with the sequential reduction and push the
		* result into the shared memory.
		*/
		sdata[ tid ] = operation.initialValue();
		while( gid + 4 * gridSizeX < size )
		{
		operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 );
		operation.cudaFirstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 );
		operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 );
		operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 );
		gid += 4 * gridSizeX;
		}
		while( gid + 2 * gridSizeX < size )
		{
		operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 );
		operation.cudaFirstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 );
		gid += 2 * gridSizeX;
		}
		while( gid < size )
		{
		operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 );
		gid += gridSizeX;
		}
		__syncthreads();


		//printf( "1: tid %d data %f \n", tid, sdata[ tid ] );

		/***
		* Perform the parallel reduction.
		*/
		if( blockSizeX >= 1024 ) {
		if( threadIdx.x < 512 ) {
		operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] );
		}
		__syncthreads();
		}
		if( blockSizeX >= 512 ) {
		if( threadIdx.x < 256 ) {
		operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] );
		}
		__syncthreads();
		}
		if( blockSizeX >= 256 ) {
		if( threadIdx.x < 128 ) {
		operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] );
		}
		__syncthreads();
		}
		if( blockSizeX >= 128 ) {
		if( threadIdx.x < 64 ) {
		operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] );
		}
		__syncthreads();
		}

		/***
		* This runs in one warp so it is synchronized implicitly.
		*
		* When the blockSizeX is less then or equal to the warp size, the shared memory
		* must be at least 2 * blockSizeX elements per block, otherwise unallocated memory
		* will be accessed !!!
		*/
		if( threadIdx.x < 32 ) {
		volatile ResultType* vsdata = sdata;
		if( blockSizeX >= 64 ) {
		operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] );
		}
		if( blockSizeX >= 32 ) {
		operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] );
		}
		if( blockSizeX >= 16 ) {
		operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] );
		}
		if( blockSizeX >= 8 ) {
		operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] );
		}
		if( blockSizeX >= 4 ) {
		operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] );
		}
		if( blockSizeX >= 2 ) {
		operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] );
		}
		}

		/***
		* Store the result back in the global memory.
		*/
		if( threadIdx.x == 0 ) {
		output[ blockIdx.x ] = sdata[ tid ];
		}
		}
		#endif

		} // namespace Algorithms
		} // namespace Containers
		} // namespace TNL

src/TNL/Containers/Algorithms/Multireduction.h

0 → 100644

+27 −0

Original line number	Diff line number	Diff line
		#pragma once

		namespace TNL {
		namespace Containers {
		namespace Algorithms {

		template< typename Operation >
		bool multireductionOnCudaDevice( const Operation& operation,
		int n,
		const typename Operation::IndexType size,
		const typename Operation::RealType* deviceInput1,
		const typename Operation::RealType* deviceInput2,
		typename Operation::ResultType* hostResult );

		template< typename Operation >
		bool multireductionOnHostDevice( const Operation& operation,
		int n,
		const typename Operation::IndexType size,
		const typename Operation::RealType* deviceInput1,
		const typename Operation::RealType* deviceInput2,
		typename Operation::ResultType* hostResult );

		} // namespace Algorithms
		} // namespace Containers
		} // namespace TNL

		#include "Multireduction_impl.h"

src/TNL/Containers/Algorithms/Multireduction_impl.h

0 → 100644

+315 −0

Original line number	Diff line number	Diff line
		#pragma once

		//#define CUDA_REDUCTION_PROFILING

		#ifdef HAVE_CUDA
		#include <cuda.h>
		#endif
		#include <TNL/Assert.h>
		#include <TNL/Containers/Algorithms/reduction-operations.h>
		#include <TNL/Containers/ArrayOperations.h>
		#include <TNL/Math.h>
		#include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
		#include <TNL/Containers/Algorithms/CudaMultireductionKernel.h>
		#include <TNL/Devices/CudaDeviceInfo.h>

		#ifdef CUDA_REDUCTION_PROFILING
		#include <TNL/Timer.h>
		#include <iostream>
		#endif

		namespace TNL {
		namespace Containers {
		namespace Algorithms {

		/****
		* Arrays smaller than the following constant are reduced on CPU.
		*/
		//static constexpr int Multireduction_minGpuDataSize = 16384;//65536; //16384;//1024;//256;
		// TODO: benchmarks with different values
		static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024;//256;

		#ifdef HAVE_CUDA
		template< typename Operation >
		typename Operation::IndexType
		multireduceOnCudaDevice( Operation& operation,
		int n,
		const typename Operation::IndexType size,
		const typename Operation::RealType* input1,
		const typename Operation::IndexType ldInput1,
		const typename Operation::RealType* input2,
		typename Operation::ResultType*& output )
		{
		typedef typename Operation::IndexType IndexType;
		typedef typename Operation::RealType RealType;
		typedef typename Operation::ResultType ResultType;

		// The number of blocks should be a multiple of the number of multiprocessors
		// to ensure optimum balancing of the load. This is very important, because
		// we run the kernel with a fixed number of blocks, so the amount of work per
		// block increases with enlarging the problem, so even small imbalance can
		// cost us dearly.
		// On Tesla K40c, desGridSizeX = 4 * 6 * 15 = 360.
		// const IndexType desGridSizeX = 4 * Multireduction_minBlocksPerMultiprocessor
		// * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
		// On Tesla K40c, desGridSizeX = 6 * 15 = 90.
		const IndexType desGridSizeX = Multireduction_minBlocksPerMultiprocessor
		* Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
		dim3 blockSize, gridSize;

		// version A: max 16 rows of threads
		blockSize.y = min( n, 16 );

		// version B: up to 16 rows of threads, then "minimize" number of inactive rows
		// if( n <= 16 )
		// blockSize.y = n;
		// else {
		// int r = (n - 1) % 16 + 1;
		// if( r > 12 )
		// blockSize.y = 16;
		// else if( r > 8 )
		// blockSize.y = 4;
		// else if( r > 4 )
		// blockSize.y = 8;
		// else
		// blockSize.y = 4;
		// }

		// blockSize.x has to be a power of 2
		blockSize.x = Multireduction_maxThreadsPerBlock;
		while( blockSize.x * blockSize.y > Multireduction_maxThreadsPerBlock )
		blockSize.x /= 2;

		gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSizeX );
		gridSize.y = Devices::Cuda::getNumberOfBlocks( n, blockSize.y );

		if( gridSize.y > Devices::Cuda::getMaxGridSize() ) {
		std::cerr << "Maximum gridSize.y limit exceeded (limit is 65535, attempted " << gridSize.y << ")." << std::endl;
		throw 1;
		}

		// create reference to the reduction buffer singleton and set default size
		// (make an overestimate to avoid reallocation on every call if n is increased by 1 each time)
		const size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( ResultType );
		CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance();
		if( ! cudaReductionBuffer.setSize( buf_size ) )
		throw 1;
		output = cudaReductionBuffer.template getData< ResultType >();

		// when there is only one warp per blockSize.x, we need to allocate two warps
		// worth of shared memory so that we don't index shared memory out of bounds
		const IndexType shmem = (blockSize.x <= 32)
		? 2 * blockSize.x * blockSize.y * sizeof( ResultType )
		: blockSize.x * blockSize.y * sizeof( ResultType );

		//cout << "Multireduction of " << n << " datasets, block size (" << blockSize.x << "," << blockSize.y << "), grid size (" << gridSize.x << "," << gridSize.y << "), shmem " << shmem << endl;

		/***
		* Depending on the blockSize we generate appropriate template instance.
		*/
		switch( blockSize.x )
		{
		case 512:
		CudaMultireductionKernel< Operation, 512 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 256:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 256 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 256 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 128:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 128 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 128 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 64:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 64 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 64 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 32:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 32 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 32 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 16:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 16 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 16 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 8:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 8 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 8 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 4:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 4 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 4 >
		<<< gridSize, blockSize, shmem >>>( operation, n,size, input1, ldInput1, input2, output);
		break;
		case 2:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 2 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 2 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 1:
		Assert( false, std::cerr << "blockSize should not be 1." << std::endl );
		default:
		Assert( false, std::cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." << std::endl );
		}
		checkCudaDevice;
		return gridSize.x;
		}
		#endif

		/*
		* Parameters:
		* operation: the operation used for reduction
		* n: number of datasets to be reduced
		* size: the size of each dataset
		* deviceInput1: input array of size = n * ldInput1
		* ldInput1: leading dimension of the deviceInput1 array
		* deviceInput2: either nullptr or input array of size = size
		* hostResult: output array of size = n
		*/
		template< typename Operation >
		bool multireductionOnCudaDevice( Operation& operation,
		int n,
		const typename Operation::IndexType size,
		const typename Operation::RealType* deviceInput1,
		const typename Operation::IndexType ldInput1,
		const typename Operation::RealType* deviceInput2,
		typename Operation::ResultType* hostResult )
		{
		#ifdef HAVE_CUDA
		Assert( n > 0, );

		typedef typename Operation::IndexType IndexType;
		typedef typename Operation::RealType RealType;
		typedef typename Operation::ResultType ResultType;
		typedef typename Operation::LaterReductionOperation LaterReductionOperation;

		/***
		* First check if the input array(s) is/are large enough for the multireduction on GPU.
		* Otherwise copy it/them to host and multireduce on CPU.
		*/
		if( n * ldInput1 < Multireduction_minGpuDataSize ) {
		RealType hostArray1[ Multireduction_minGpuDataSize ];
		// FIXME: hostArray2 is left undefined if deviceInput2 is nullptr
		RealType hostArray2[ Multireduction_minGpuDataSize ];
		if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray1, deviceInput1, n * ldInput1 ) )
		return false;
		if( deviceInput2 &&
		! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, n * size ) )
		return false;
		return multireductionOnHostDevice( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult );
		}

		#ifdef CUDA_REDUCTION_PROFILING
		Timer timer;
		timer.reset();
		timer.start();
		#endif

		/****
		* Reduce the data on the CUDA device.
		*/
		ResultType* deviceAux1 = nullptr;
		const IndexType reducedSize = multireduceOnCudaDevice( operation,
		n,
		size,
		deviceInput1,
		ldInput1,
		deviceInput2,
		deviceAux1 );
		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << endl;
		timer.reset();
		timer.start();
		#endif

		/***
		* Transfer the reduced data from device to host.
		*/
		ResultType resultArray[ n * reducedSize ];
		if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, n * reducedSize ) )
		return false;

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << endl;
		timer.reset();
		timer.start();
		#endif

		// cout << "resultArray = [";
		// for( int i = 0; i < n * reducedSize; i++ ) {
		// cout << resultArray[ i ];
		// if( i < n * reducedSize - 1 )
		// cout << ", ";
		// }
		// cout << "]" << endl;

		/***
		* Reduce the data on the host system.
		*/
		LaterReductionOperation laterReductionOperation;
		multireductionOnHostDevice( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (RealType*) nullptr, hostResult );

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << endl;
		#endif

		return checkCudaDevice;
		#else
		CudaSupportMissingMessage;
		return false;
		#endif
		};

		/*
		* Parameters:
		* operation: the operation used for reduction
		* n: number of datasets to be reduced
		* size: the size of each dataset
		* input1: input array of size = n * ldInput1
		* ldInput1: leading dimension of the input1 array
		* input2: either nullptr or input array of size = size
		* hostResult: output array of size = n
		*/
		template< typename Operation >
		bool multireductionOnHostDevice( Operation& operation,
		int n,
		const typename Operation::IndexType size,
		const typename Operation::RealType* input1,
		const typename Operation::IndexType ldInput1,
		const typename Operation::RealType* input2,
		typename Operation::ResultType* result )
		{
		typedef typename Operation::IndexType IndexType;
		typedef typename Operation::RealType RealType;

		for( int k = 0; k < n; k++ ) {
		result[ k ] = operation.initialValue();
		const RealType* _input1 = input1 + k * ldInput1;
		for( IndexType i = 0; i < size; i++ )
		result[ k ] = operation.reduceOnHost( i, result[ k ], _input1, input2 );
		}

		return true;
		}

		} // namespace Algorithms
		} // namespace Containers
		} // namespace TNL

src/TNL/Matrices/CMakeLists.txt

+2 −1

Original line number	Diff line number	Diff line
		@@ -16,6 +16,7 @@ SET( headers Matrix.h
		ChunkedEllpack_impl.h
		CSR.h
		CSR_impl.h
		MatrixOperations.h
		MatrixReader.h
		MatrixReader_impl.h
		MatrixWriter.h