[WIP] New lambda function based reduction compiles but does not work in CUDA. (8c1c80e2) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/ArrayAssignment.h

+0 −2

Original line number	Diff line number	Diff line
		@@ -86,8 +86,6 @@ struct ArrayAssignment< Array, T, false >

		};



		} // namespace Algorithms
		} // namespace Containers
		} // namespace TNL

src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp

+26 −8

Original line number	Diff line number	Diff line
		@@ -205,8 +205,19 @@ compareMemory( const Element1* destination,
		{
		TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." );
		TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
		Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities;
		return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );

		Element1* d;
		cudaMalloc( ( void** ) &d, size * sizeof( Element1 ) );
		auto fetch = [=] __cuda_callable__ ( Index i ) { return d[ 0 ]; }; //( destination[ i ] == source[ i ] ); };
		auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; };
		return Reduction< Devices::Cuda >::reduce(
		size,
		reduction, //[=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; },
		fetch, //[=] __cuda_callable__ ( Index i ) { return destination[ i ]; },
		true );

		/*Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities;
		return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );*/
		}

		template< typename Element,
		@@ -219,11 +230,11 @@ containsValue( const Element* data,
		{
		TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
		TNL_ASSERT_GE( size, 0, "" );

		if( size == 0 ) return false;
		//Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue;
		//reductionContainsValue.setValue( value );
		auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); };
		return Reduction< Devices::Cuda >::reduce( size, TNL::sum, fetch, 0.0 );
		auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a \|\| b; };
		return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false );
		}

		template< typename Element,
		@@ -237,9 +248,16 @@ containsOnlyValue( const Element* data,
		TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
		TNL_ASSERT_GE( size, 0, "" );
		if( size == 0 ) return false;
		Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue;

		if( size == 0 ) return false;
		auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); };
		auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; };
		return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true );


		/*Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue;
		reductionContainsOnlyValue.setValue( value );
		return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );
		return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );*/
		}

src/TNL/Containers/Algorithms/CommonVectorOperations.h

0 → 100644

+79 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		CommonVectorOperations.h - description
		-------------------
		begin : Apr 12, 2019
		copyright : (C) 2019 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		#pragma once

		namespace TNL {
		namespace Containers {
		namespace Algorithms {

		template< typename Device >
		struct CommonVectorOperations
		{
		using DeviceType = Device;

		template< typename Vector, typename ResultType = typename Vector::RealType >
		static ResultType getVectorMax( const Vector& v );

		template< typename Vector, typename ResultType = typename Vector::RealType >
		static ResultType getVectorMin( const Vector& v );

		template< typename Vector, typename ResultType = typename Vector::RealType >
		static ResultType getVectorAbsMax( const Vector& v );

		template< typename Vector, typename ResultType = typename Vector::RealType >
		static ResultType getVectorAbsMin( const Vector& v );

		template< typename Vector, typename ResultType = typename Vector::RealType >
		static ResultType getVectorL1Norm( const Vector& v );

		template< typename Vector, typename ResultType = typename Vector::RealType >
		static ResultType getVectorL2Norm( const Vector& v );

		template< typename Vector, typename ResultType = typename Vector::RealType, typename Scalar >
		static ResultType getVectorLpNorm( const Vector& v, const Scalar p );

		template< typename Vector, typename ResultType = typename Vector::RealType >
		static ResultType getVectorSum( const Vector& v );

		template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
		static ResultType getVectorDifferenceMax( const Vector1& v1, const Vector2& v2 );

		template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
		static ResultType getVectorDifferenceMin( const Vector1& v1, const Vector2& v2 );

		template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
		static ResultType getVectorDifferenceAbsMax( const Vector1& v1, const Vector2& v2 );

		template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
		static ResultType getVectorDifferenceAbsMin( const Vector1& v1, const Vector2& v2 );

		template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
		static ResultType getVectorDifferenceL1Norm( const Vector1& v1, const Vector2& v2 );

		template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
		static ResultType getVectorDifferenceL2Norm( const Vector1& v1, const Vector2& v2 );

		template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType, typename Scalar >
		static ResultType getVectorDifferenceLpNorm( const Vector1& v1, const Vector2& v2, const Scalar p );

		template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
		static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 );

		template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
		static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 );

		};

		} // namespace Algorithms
		} // namespace Containers
		} // namespace TNL

		#include <TNL/Containers/Algorithms/CommonVectorOperations.hpp>

src/TNL/Containers/Algorithms/CudaReductionKernel.h

+29 −33

Original line number	Diff line number	Diff line
		@@ -40,20 +40,18 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi
		#endif

		template< int blockSize,
		typename Real,
		typename Result,
		typename DataFetcher,
		typename Reduction,
		typename Index,
		typename Result = decltype( std::declval< DataFetcher >( 0,0 ) ) >
		typename Index >
		__global__ void
		__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
		CudaReductionKernel( const Real& zero,
		CudaReductionKernel( const Result zero,
		const DataFetcher& dataFetcher,
		const Reduction& reduction,
		const Index size,
		Result* output )
		{
		using RealType = Real;
		using IndexType = Index;
		using ResultType = Result;

		@@ -69,11 +67,12 @@ CudaReductionKernel( const Real& zero,
		const IndexType gridSize = blockDim.x * gridDim.x;

		sdata[ tid ] = zero;

		/***
		* Read data into the shared memory. We start with the
		* sequential reduction.
		*/
		while( gid + 4 * gridSize < size )
		/while( gid + 4 gridSize < size )
		{
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
		@@ -86,15 +85,14 @@ CudaReductionKernel( const Real& zero,
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
		gid += 2 * gridSize;
		}
		}*/
		while( gid < size )
		{
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
		sdata[ tid ] = dataFetcher( gid ); //reduction( sdata[ tid ], dataFetcher( gid ) );
		gid += gridSize;
		}
		__syncthreads();


		return;
		//printf( "1: tid %d data %f \n", tid, sdata[ tid ] );

		//return;
		@@ -104,19 +102,19 @@ CudaReductionKernel( const Real& zero,
		if( blockSize >= 1024 )
		{
		if( tid < 512 )
		reduction( sdata[ tid ], sdata[ tid + 512 ] );
		sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] );
		__syncthreads();
		}
		if( blockSize >= 512 )
		{
		if( tid < 256 )
		reduction( sdata[ tid ], sdata[ tid + 256 ] );
		sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] );
		__syncthreads();
		}
		if( blockSize >= 256 )
		{
		if( tid < 128 )
		reduction( sdata[ tid ], sdata[ tid + 128 ] );
		sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] );
		__syncthreads();
		//printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
		}
		@@ -124,7 +122,7 @@ CudaReductionKernel( const Real& zero,
		if( blockSize >= 128 )
		{
		if( tid < 64 )
		reduction( sdata[ tid ], sdata[ tid + 64 ] );
		sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] );
		__syncthreads();
		//printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
		}
		@@ -138,34 +136,34 @@ CudaReductionKernel( const Real& zero,
		volatile ResultType* vsdata = sdata;
		if( blockSize >= 64 )
		{
		reduction( vsdata[ tid ], vsdata[ tid + 32 ] );
		vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 32 ] );
		//printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
		}
		// TODO: If blocksize == 32, the following does not work
		// We do not check if tid < 16. Fix it!!!
		if( blockSize >= 32 )
		{
		reduction( vsdata[ tid ], vsdata[ tid + 16 ] );
		vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 16 ] );
		//printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
		}
		if( blockSize >= 16 )
		{
		reduction( vsdata[ tid ], vsdata[ tid + 8 ] );
		vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 8 ] );
		//printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
		}
		if( blockSize >= 8 )
		{
		reduction( vsdata[ tid ], vsdata[ tid + 4 ] );
		vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 4 ] );
		//printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
		}
		if( blockSize >= 4 )
		{
		reduction( vsdata[ tid ], vsdata[ tid + 2 ] );
		vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 2 ] );
		//printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
		}
		if( blockSize >= 2 )
		{
		reduction( vsdata[ tid ], vsdata[ tid + 1 ] );
		vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 1 ] );
		//printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
		}
		}
		@@ -181,8 +179,7 @@ CudaReductionKernel( const Real& zero,

		}

		template< typename Real,
		typename DataFetcher,
		template< typename DataFetcher,
		typename Reduction,
		typename Index,
		typename Result >
		@@ -190,10 +187,9 @@ int
		CudaReductionKernelLauncher( const Index size,
		const Reduction& reduction,
		const DataFetcher& dataFetcher,
		const Real& zero,
		const Result& zero,
		Result*& output )
		{
		using RealType = Real;
		using IndexType = Index;
		using ResultType = Result;

		@@ -236,49 +232,49 @@ CudaReductionKernelLauncher( const Index size,
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 256:
		cudaFuncSetCacheConfig(CudaReductionKernel< 256, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 256 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 128:
		cudaFuncSetCacheConfig(CudaReductionKernel< 128, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 128 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 64:
		cudaFuncSetCacheConfig(CudaReductionKernel< 64, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 64 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 32:
		cudaFuncSetCacheConfig(CudaReductionKernel< 32, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 32 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 16:
		cudaFuncSetCacheConfig(CudaReductionKernel< 16, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 16 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 8:
		cudaFuncSetCacheConfig(CudaReductionKernel< 8, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 8 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 4:
		cudaFuncSetCacheConfig(CudaReductionKernel< 4, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 4 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 2:
		cudaFuncSetCacheConfig(CudaReductionKernel< 2, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 2 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);

src/TNL/Containers/Algorithms/Reduction.h

+6 −6

Original line number	Diff line number	Diff line
		@@ -30,9 +30,9 @@ class Reduction< Devices::Cuda >
		{
		public:
		template< typename Index,
		typename Result,
		typename ReductionOperation,
		typename DataFetcher,
		typename Result = decltype( DataFetcher::operator() ) >
		typename DataFetcher >
		static Result
		reduce( const Index size,
		ReductionOperation& reduction,
		@@ -45,9 +45,9 @@ class Reduction< Devices::Host >
		{
		public:
		template< typename Index,
		typename Result,
		typename ReductionOperation,
		typename DataFetcher,
		typename Result = decltype( DataFetcher::operator() ) >
		typename DataFetcher >
		static Result
		reduce( const Index size,
		ReductionOperation& reduction,
		@@ -60,9 +60,9 @@ class Reduction< Devices::MIC >
		{
		public:
		template< typename Index,
		typename Result,
		typename ReductionOperation,
		typename DataFetcher,
		typename Result = decltype( DataFetcher::operator() ) >
		typename DataFetcher >
		static Result
		reduce( const Index size,
		ReductionOperation& reduction,