Implementing CUDA memory operations and unit tests. (d0b3ee26) · Commits · TNL / tnl-dev

src/implementation/core/cuda/cuda-reduction_impl.h

+0 −2

Original line number	Diff line number	Diff line
		@@ -343,10 +343,8 @@ typename Operation :: IndexType reduceOnCudaDevice( const Operation& operation,
		break;
		case 1:
		tnlAssert( false, cerr << "blockSize should not be 1." << endl );
		break;
		default:
		tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
		break;
		}
		return gridSize. x;
		}

src/implementation/core/memory-operations.h

+69 −22

Original line number	Diff line number	Diff line
		@@ -108,13 +108,11 @@ bool setMemoryCuda( Element* data,
		Index blocksNumber = ceil( ( double ) size / ( double ) blockSize. x );
		gridSize. x = Min( blocksNumber, maxGridSize );
		setVectorValueCudaKernel<<< gridSize, blockSize >>>( data, size, value );

		return checkCudaDevice;
		#else
		cerr << "CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
		return false;
		#endif

		}

		template< typename DestinationElement, typename SourceElement, typename Index >
		@@ -166,7 +164,6 @@ bool copyMemoryHostToCuda( DestinationElement* destination,
		const SourceElement* source,
		const Index size )
		{
		#ifdef HAVE_CUDA
		DestinationElement* buffer = new DestinationElement[ tnlGPUvsCPUTransferBufferSize ];
		if( ! buffer )
		{
		@@ -179,8 +176,8 @@ bool copyMemoryHostToCuda( DestinationElement* destination,
		Index j( 0 );
		while( j < tnlGPUvsCPUTransferBufferSize && i + j < size )
		buffer[ j ] = source[ i + j++ ];
		if( ! copyMemoryHostTuCuda( buffer,
		&destination[ i ],
		if( ! copyMemoryHostToCuda( &destination[ i ],
		buffer,
		j ) )
		{
		delete[] buffer;
		@@ -190,10 +187,6 @@ bool copyMemoryHostToCuda( DestinationElement* destination,
		}
		delete[] buffer;
		return true;
		#else
		cerr << "CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
		return false;
		#endif
		}

		template< typename Element, typename Index >
		@@ -234,8 +227,8 @@ bool copyMemoryCudaToHost( DestinationElement* destination,
		Index i( 0 );
		while( i < size )
		{
		if( ! copyMemoryCudaToHost( &source[ i ],
		buffer,
		if( ! copyMemoryCudaToHost( buffer,
		&source[ i ],
		Min( size - i, tnlGPUvsCPUTransferBufferSize ) ) )
		{
		delete[] buffer;
		@@ -268,10 +261,63 @@ bool copyMemoryCudaToCuda( Element* destination,
		#endif
		}

		template< typename Element, typename Index >
		#ifdef HAVE_CUDA
		template< typename DestinationElement,
		typename SourceElement,
		typename Index >
		__global__ void copyMemoryCudaToCudaKernel( DestinationElement* destination,
		const SourceElement* source,
		const Index size )
		{
		Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
		const Index maxGridSize = blockDim. x * gridDim. x;
		while( elementIdx < size )
		{
		destination[ elementIdx ] = source[ elementIdx ];
		elementIdx += maxGridSize;
		}
		}
		#endif


		template< typename DestinationElement,
		typename SourceElement,
		typename Index >
		bool copyMemoryCudaToCuda( DestinationElement* destination,
		const SourceElement* source,
		const Index size,
		const Index maxGridSize )
		{
		#ifdef HAVE_CUDA
		dim3 blockSize( 0 ), gridSize( 0 );
		blockSize. x = 256;
		Index blocksNumber = ceil( ( double ) size / ( double ) blockSize. x );
		gridSize. x = Min( blocksNumber, maxGridSize );
		copyMemoryCudaToCudaKernel<<< gridSize, blockSize >>>( destination, source, size );
		return checkCudaDevice;
		#else
		cerr << "CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
		return false;
		#endif
		}

		template< typename Element,
		typename Index >
		bool compareMemoryHost( const Element* data1,
		const Element* data2,
		const Index size )
		{
		if( memcmp( data1, data2, size * sizeof( Element ) ) != 0 )
		return false;
		return true;
		}

		template< typename Element1,
		typename Element2,
		typename Index >
		bool compareMemoryHost( const Element1* data1,
		const Element2* data2,
		const Index size )
		{
		for( Index i = 0; i < size; i ++ )
		if( data1[ i ] != data2[ i ] )
		@@ -279,15 +325,15 @@ bool compareMemoryHost( const Element* data1,
		return true;
		}

		template< typename Element, typename Index >
		bool compareMemoryHostCuda( const Element* hostData,
		const Element* deviceData,
		template< typename Element1,
		typename Element2,
		typename Index >
		bool compareMemoryHostCuda( const Element1* hostData,
		const Element2* deviceData,
		const Index size )
		{
		#ifdef HAVE_CUDA
		Index host_buffer_size = :: Min( ( Index ) ( tnlGPUvsCPUTransferBufferSize / sizeof( Element ) ),
		size );
		Element* host_buffer = new Element[ host_buffer_size ];
		Element2* host_buffer = new Element2[ tnlGPUvsCPUTransferBufferSize ];
		if( ! host_buffer )
		{
		cerr << "I am sorry but I cannot allocate supporting buffer on the host for comparing data between CUDA GPU and CPU." << endl;
		@@ -296,10 +342,10 @@ bool compareMemoryHostCuda( const Element* hostData,
		Index compared( 0 );
		while( compared < size )
		{
		Index transfer = Min( size - compared, host_buffer_size );
		Index transfer = Min( size - compared, tnlGPUvsCPUTransferBufferSize );
		if( cudaMemcpy( ( void* ) host_buffer,
		( void* ) & ( deviceData[ compared ] ),
		transfer * sizeof( Element ),
		transfer * sizeof( Element2 ),
		cudaMemcpyDeviceToHost ) != cudaSuccess )
		{
		cerr << "Transfer of data from the device failed." << endl;
		@@ -328,7 +374,8 @@ bool compareMemoryHostCuda( const Element* hostData,
		#endif
		}

		template< typename Element, typename Index >
		template< typename Element,
		typename Index >
		bool compareMemoryCuda( const Element* deviceData1,
		const Element* deviceData2,
		const Index size )

tests/unit-tests/core/CMakeLists.txt

+0 −12

Original line number	Diff line number	Diff line
		@@ -19,17 +19,5 @@ TARGET_LINK_LIBRARIES( tnlArrayTest${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
		tnl${mpiExt}${debugExt}-0.1 )

		if( BUILD_CUDA )
		# CUDA_ADD_EXECUTABLE( tnl-device-check-test${mpiExt}${debugExt} ${headers} device-check-test.cu )
		# TARGET_LINK_LIBRARIES( tnl-device-check-test${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
		# tnl${mpiExt}${debugExt}-0.1 )
		#
		# CUDA_ADD_EXECUTABLE( tnl-memory-operations-test${mpiExt}${debugExt} ${headers} memory-operations-test.cu )
		# TARGET_LINK_LIBRARIES( tnl-memory-operations-test${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
		# tnl${mpiExt}${debugExt}-0.1 )
		#
		# CUDA_ADD_EXECUTABLE( tnl-reduction-test${mpiExt}${debugExt} ${headers} reduction-test.cu )
		# TARGET_LINK_LIBRARIES( tnl-reduction-test${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
		# tnl${mpiExt}${debugExt}-0.1 )
		#
		endif()

		No newline at end of file

tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h

+125 −22

Original line number	Diff line number	Diff line
		@@ -46,10 +46,6 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase
		"allocationTest",
		&tnlCudaMemoryOperationsTester :: allocationTest )
		);
		suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >(
		"copyTest",
		&tnlCudaMemoryOperationsTester :: allocationTest )
		);
		suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >(
		"smallMemorySetTest",
		&tnlCudaMemoryOperationsTester :: smallMemorySetTest )
		@@ -58,6 +54,30 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase
		"bigMemorySetTest",
		&tnlCudaMemoryOperationsTester :: bigMemorySetTest )
		);
		suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >(
		"copyMemoryTest",
		&tnlCudaMemoryOperationsTester :: copyMemoryTest )
		);
		suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >(
		"copyMemoryWithConversionHostToCudaTest",
		&tnlCudaMemoryOperationsTester :: copyMemoryWithConversionHostToCudaTest )
		);
		suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >(
		"copyMemoryWithConversionCudaToHostTest",
		&tnlCudaMemoryOperationsTester :: copyMemoryWithConversionCudaToHostTest )
		);
		suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >(
		"copyMemoryWithConversionCudaToCudaTest",
		&tnlCudaMemoryOperationsTester :: copyMemoryWithConversionCudaToCudaTest )
		);
		suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >(
		"compareMemoryHostCudaTest",
		&tnlCudaMemoryOperationsTester :: compareMemoryHostCudaTest )
		);
		suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >(
		"compareMemoryWithConevrsionHostCudaTest",
		&tnlCudaMemoryOperationsTester :: compareMemoryWithConversionHostCudaTest )
		);

		return suiteOfTests;
		};
		@@ -72,22 +92,6 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase
		CPPUNIT_ASSERT( checkCudaDevice );
		};

		void copyTest()
		{
		const int size( 1 << 22 );
		int hostData1, hostData2, *deviceData;
		allocateMemoryHost( hostData1, size );
		allocateMemoryHost( hostData2, size );
		allocateMemoryCuda( deviceData, size );
		setMemoryHost( hostData1, 13, size );
		copyMemoryHostToCuda( deviceData, hostData1, size );
		copyMemoryCudaToHost( hostData2, deviceData, size );
		CPPUNIT_ASSERT( compareMemoryHost( hostData1, hostData2, size) );
		freeMemoryHost( hostData1 );
		freeMemoryHost( hostData2 );
		freeMemoryCuda( deviceData );
		};

		void smallMemorySetTest()
		{
		const int size( 1024 );
		@@ -108,7 +112,6 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase
		void bigMemorySetTest()
		{
		const int size( 1.1 * maxCudaGridSize * maxCudaBlockSize );
		cout << "Size = " << size << endl;
		int hostData, deviceData;
		allocateMemoryHost( hostData, size );
		allocateMemoryCuda( deviceData, size );
		@@ -120,13 +123,113 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase
		for( int i = 0; i < size; i += 100 )
		{
		if( hostData[ i ] != 13 )
		cout << " i = " << i << " " << hostData[ i ] << endl;
		CPPUNIT_ASSERT( hostData[ i ] == 13 );
		}
		freeMemoryHost( hostData );
		freeMemoryCuda( deviceData );
		};

		void copyMemoryTest()
		{
		const int size( 1 << 22 );
		int hostData1, hostData2, *deviceData;
		allocateMemoryHost( hostData1, size );
		allocateMemoryHost( hostData2, size );
		allocateMemoryCuda( deviceData, size );
		setMemoryHost( hostData1, 13, size );
		copyMemoryHostToCuda( deviceData, hostData1, size );
		copyMemoryCudaToHost( hostData2, deviceData, size );
		CPPUNIT_ASSERT( compareMemoryHost( hostData1, hostData2, size) );
		freeMemoryHost( hostData1 );
		freeMemoryHost( hostData2 );
		freeMemoryCuda( deviceData );
		};

		void copyMemoryWithConversionHostToCudaTest()
		{
		const int size( 1 << 22 );
		int *hostData1;
		float hostData2, deviceData;
		allocateMemoryHost( hostData1, size );
		allocateMemoryHost( hostData2, size );
		allocateMemoryCuda( deviceData, size );
		setMemoryHost( hostData1, 13, size );
		copyMemoryHostToCuda( deviceData, hostData1, size );
		copyMemoryCudaToHost( hostData2, deviceData, size );
		for( int i = 0; i < size; i ++ )
		CPPUNIT_ASSERT( hostData1[ i ] == hostData2[ i ] );
		freeMemoryHost( hostData1 );
		freeMemoryHost( hostData2 );
		freeMemoryCuda( deviceData );
		};

		void copyMemoryWithConversionCudaToHostTest()
		{
		const int size( 1 << 22 );
		int hostData1, deviceData;
		float *hostData2;
		allocateMemoryHost( hostData1, size );
		allocateMemoryHost( hostData2, size );
		allocateMemoryCuda( deviceData, size );
		setMemoryHost( hostData1, 13, size );
		copyMemoryHostToCuda( deviceData, hostData1, size );
		copyMemoryCudaToHost( hostData2, deviceData, size );
		for( int i = 0; i < size; i ++ )
		CPPUNIT_ASSERT( hostData1[ i ] == hostData2[ i ] );
		freeMemoryHost( hostData1 );
		freeMemoryHost( hostData2 );
		freeMemoryCuda( deviceData );
		};

		void copyMemoryWithConversionCudaToCudaTest()
		{
		const int size( 1 << 22 );
		int hostData1, deviceData1;
		float hostData2, deviceData2;
		allocateMemoryHost( hostData1, size );
		allocateMemoryHost( hostData2, size );
		allocateMemoryCuda( deviceData1, size );
		allocateMemoryCuda( deviceData2, size );
		setMemoryHost( hostData1, 13, size );
		copyMemoryHostToCuda( deviceData1, hostData1, size );
		copyMemoryCudaToCuda( deviceData2, deviceData1, size, maxCudaGridSize );
		copyMemoryCudaToHost( hostData2, deviceData2, size );
		for( int i = 0; i < size; i ++ )
		CPPUNIT_ASSERT( hostData1[ i ] == hostData2[ i ] );
		freeMemoryHost( hostData1 );
		freeMemoryHost( hostData2 );
		freeMemoryCuda( deviceData1 );
		freeMemoryCuda( deviceData2 );
		};

		void compareMemoryHostCudaTest()
		{
		const int size( 1 << 22 );
		int hostData, deviceData;
		allocateMemoryHost( hostData, size );
		allocateMemoryCuda( deviceData, size );
		setMemoryHost( hostData, 7, size );
		setMemoryCuda( deviceData, 8, size, maxCudaGridSize );
		CPPUNIT_ASSERT( ! compareMemoryHostCuda( hostData, deviceData, size ) );
		setMemoryCuda( deviceData, 7, size, maxCudaGridSize );
		CPPUNIT_ASSERT( compareMemoryHostCuda( hostData, deviceData, size ) );
		};

		void compareMemoryWithConversionHostCudaTest()
		{
		const int size( 1 << 22 );
		int *hostData;
		float *deviceData;
		allocateMemoryHost( hostData, size );
		allocateMemoryCuda( deviceData, size );
		setMemoryHost( hostData, 7, size );
		setMemoryCuda( deviceData, ( float ) 8.0, size, maxCudaGridSize );
		CPPUNIT_ASSERT( ! compareMemoryHostCuda( hostData, deviceData, size ) );
		setMemoryCuda( deviceData, ( float ) 7.0, size, maxCudaGridSize );
		CPPUNIT_ASSERT( compareMemoryHostCuda( hostData, deviceData, size ) );
		};


		};

		#else

tests/unit-tests/core/tnlArrayTest.cpp

+5 −2

Original line number	Diff line number	Diff line
		@@ -15,14 +15,17 @@
		* *
		***************************************************************************/

		#include <tnlConfig.h>
		#include <core/tnlHost.h>
		#include <cstdlib>

		#include "tnlArrayTester.h"
		#include "../tnlUnitTestStarter.h"

		int main( int argc, char* argv[] )
		{
		#ifdef HAVE_CPPUNIT
		/* if( ! tnlUnitTestStarter :: run< tnlArrayTester< char, tnlHost, int > >() \|\|
		if( ! tnlUnitTestStarter :: run< tnlArrayTester< char, tnlHost, int > >() \|\|
		! tnlUnitTestStarter :: run< tnlArrayTester< int, tnlHost, int > >() \|\|
		! tnlUnitTestStarter :: run< tnlArrayTester< long int, tnlHost, int > >() \|\|
		! tnlUnitTestStarter :: run< tnlArrayTester< float, tnlHost, int > >() \|\|
		@@ -34,7 +37,7 @@ int main( int argc, char* argv[] )
		! tnlUnitTestStarter :: run< tnlArrayTester< float, tnlHost, long int > >() \|\|
		! tnlUnitTestStarter :: run< tnlArrayTester< double, tnlHost, long int > >() \|\|
		! tnlUnitTestStarter :: run< tnlArrayTester< long double, tnlHost, long int > >() )
		return EXIT_FAILURE;*/
		return EXIT_FAILURE;
		return EXIT_SUCCESS;
		#else
		return EXIT_FAILURE;