src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp  +1 −5

@@ -207,11 +207,7 @@ compareMemory( const Element1* destination,
    auto fetch = [=] __cuda_callable__ ( Index i ) { return ( destination[ i ] == source[ i ] ); };
    auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; };
-   return Reduction< Devices::Cuda >::reduce( size,
-                                              reduction, //[=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; },
-                                              fetch, //[=] __cuda_callable__ ( Index i ) { return destination[ i ]; },
-                                              true );
-
+   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true );
    /*Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities;
    return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );*/
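The one-line call above relies on TNL's fetch-and-reduce pattern: fetch maps an index to a per-element value and reduction folds those values together, starting from the identity element true. A minimal serial sketch of that pattern, mirroring only the call shape visible in the diff (the free function reduce and the helper equalArrays below are illustrative, not TNL code):

#include <cstddef>

// Fold fetched values one by one, starting from the reduction's identity element.
template< typename Result, typename Reduction, typename DataFetcher >
Result reduce( std::size_t size, Reduction reduction, DataFetcher fetch, Result identity )
{
   Result result = identity;
   for( std::size_t i = 0; i < size; i++ )
      result = reduction( result, fetch( i ) );
   return result;
}

// Element-wise equality expressed as a reduction, as in compareMemory above.
bool equalArrays( const double* a, const double* b, std::size_t size )
{
   auto fetch = [=]( std::size_t i ) { return a[ i ] == b[ i ]; };
   auto reduction = []( bool x, bool y ) { return x && y; };
   return reduce( size, reduction, fetch, true );
}

Expressing the comparison as a reduction lets the CUDA backend reuse a single parallel-reduction kernel for equality tests, sums, scalar products, and similar operations, instead of keeping a bespoke functor class such as the commented-out ParallelReductionEqualities.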
src/TNL/Containers/Algorithms/CudaReductionKernel.h  +2 −5

@@ -47,8 +47,8 @@ template< int blockSize,
 __global__ void
 __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
 CudaReductionKernel( const Result zero,
-                     const DataFetcher& dataFetcher,
-                     const Reduction& reduction,
+                     const DataFetcher dataFetcher,
+                     const Reduction reduction,
                      const Index size,
                      Result* output )
 {

@@ -94,8 +94,6 @@ CudaReductionKernel( const Result zero,
    __syncthreads();

-   //printf( "1: tid %d data %f \n", tid, sdata[ tid ] );
-   //return;
    /***
    * Perform the parallel reduction.
    */

@@ -127,7 +125,6 @@ CudaReductionKernel( const Result zero,
-      //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
    }

    /***
    * This runs in one warp so it is synchronized implicitly.
    */
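Besides deleting the commented-out debug printf lines, the substantive change is taking dataFetcher and reduction by value instead of by const reference. A __global__ kernel's arguments are copied into device-accessible parameter space at launch, so a reference parameter would point back at host memory, which device code cannot dereference. A self-contained sketch of the by-value rule (the kernel and functor names are illustrative, not TNL's):

#include <cstdio>
#include <cuda_runtime.h>

struct Square
{
   __device__ double operator()( double x ) const { return x * x; }
};

// OK: the functor is copied by value into the kernel's parameter space,
// so device code can call it directly.
template< typename F >
__global__ void applyByValue( F f, const double* in, double* out, int size )
{
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   if( i < size )
      out[ i ] = f( in[ i ] );
}

// Broken for contrast: `const F& f` would be a reference to host memory.
// template< typename F >
// __global__ void applyByReference( const F& f, const double* in, double* out, int size );

int main()
{
   const int size = 4;
   double *in, *out;
   cudaMallocManaged( &in, size * sizeof( double ) );
   cudaMallocManaged( &out, size * sizeof( double ) );
   for( int i = 0; i < size; i++ )
      in[ i ] = i;
   applyByValue<<< 1, size >>>( Square{}, in, out, size );
   cudaDeviceSynchronize();
   for( int i = 0; i < size; i++ )
      printf( "%g ", out[ i ] );   // prints: 0 1 4 9
   printf( "\n" );
   cudaFree( in );
   cudaFree( out );
   return 0;
}

The same rule covers the __cuda_callable__ lambdas built in ArrayOperationsCuda.hpp: they must reach the kernel by value as well.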
src/UnitTests/Containers/ArrayOperationsTest.h  +0 −16

@@ -445,22 +445,6 @@ TYPED_TEST( ArrayOperationsTest, containsOnlyValue_cuda )
 #include "../GtestMissingError.h"

 int main( int argc, char* argv[] )
 {
-   using ValueType = double;
-   int size = 1000;
-   ValueType *hostData, *deviceData, *deviceData2;
-   ArrayOperations< Devices::Host >::allocateMemory( hostData, size );
-   ArrayOperations< Devices::Cuda >::allocateMemory( deviceData, size );
-   ArrayOperations< Devices::Cuda >::allocateMemory( deviceData2, size );
-   ArrayOperations< Devices::Host >::setMemory( hostData, (ValueType) 7, size );
-   ArrayOperations< Devices::Cuda >::setMemory( deviceData, (ValueType) 8, size );
-   ArrayOperations< Devices::Cuda >::setMemory( deviceData2, (ValueType) 9, size );
-   EXPECT_FALSE(( ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< ValueType, ValueType >( hostData, deviceData, size ) ));
-   EXPECT_FALSE(( ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< ValueType, ValueType >( deviceData, hostData, size ) ));
-   EXPECT_FALSE(( ArrayOperations< Devices::Cuda >::compareMemory< ValueType, ValueType >( deviceData, deviceData2, size ) ));
-   return 0;
 #ifdef HAVE_GTEST
    ::testing::InitGoogleTest( &argc, argv );
    return RUN_ALL_TESTS();
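The deleted block ran ad-hoc checks straight from main() and returned before the GoogleTest path below it could ever execute; it also used EXPECT_FALSE outside a test body and never freed its buffers. Under GoogleTest the same checks belong in a test case. A hypothetical sketch reusing the identifiers from the deleted code (the test-suite and test names are invented, and freeMemory is assumed to be the counterpart of the allocateMemory seen above):

#ifdef HAVE_GTEST
TEST( ArrayOperationsCompareTest, differentValuesCompareUnequal )
{
   using ValueType = double;
   const int size = 1000;
   ValueType *hostData, *deviceData, *deviceData2;
   ArrayOperations< Devices::Host >::allocateMemory( hostData, size );
   ArrayOperations< Devices::Cuda >::allocateMemory( deviceData, size );
   ArrayOperations< Devices::Cuda >::allocateMemory( deviceData2, size );
   ArrayOperations< Devices::Host >::setMemory( hostData, (ValueType) 7, size );
   ArrayOperations< Devices::Cuda >::setMemory( deviceData, (ValueType) 8, size );
   ArrayOperations< Devices::Cuda >::setMemory( deviceData2, (ValueType) 9, size );

   // Buffers filled with different values must compare unequal.
   EXPECT_FALSE(( ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< ValueType, ValueType >( hostData, deviceData, size ) ));
   EXPECT_FALSE(( ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< ValueType, ValueType >( deviceData, hostData, size ) ));
   EXPECT_FALSE(( ArrayOperations< Devices::Cuda >::compareMemory< ValueType, ValueType >( deviceData, deviceData2, size ) ));

   // Release the buffers the ad-hoc version leaked (freeMemory is assumed here).
   ArrayOperations< Devices::Host >::freeMemory( hostData );
   ArrayOperations< Devices::Cuda >::freeMemory( deviceData );
   ArrayOperations< Devices::Cuda >::freeMemory( deviceData2 );
}
#endif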