Loading src/UnitTests/Containers/ArrayTest.h +33 −17 Original line number Diff line number Diff line Loading @@ -16,9 +16,6 @@ #include <TNL/Containers/Array.h> #include <TNL/Containers/Vector.h> #include <TNL/Pointers/DevicePointer.h> #include <TNL/Pointers/SharedPointer.h> #include <TNL/Pointers/SmartPointersRegister.h> #include <TNL/Algorithms/ParallelFor.h> #include "gtest/gtest.h" Loading Loading @@ -405,21 +402,40 @@ TYPED_TEST( ArrayTest, elementwiseAccess ) testArrayElementwiseAccess( ArrayType() ); } template< typename ArrayType > void test_setElement() template< typename Value, typename Index > void test_setElement_on_device( const Array< Value, Devices::Host, Index >& ) { Pointers::SharedPointer< ArrayType > a( 10, 0 ), b( 10, 0 ); auto set = [=] __cuda_callable__ ( int i ) mutable { a->setElement( i, i ); b->setElement( i, a->getElement( i ) ); }; Pointers::synchronizeSmartPointersOnDevice< typename ArrayType::DeviceType >(); Algorithms::ParallelFor< typename ArrayType::DeviceType >::exec( 0, 10, set ); for( int i = 0; i < 10; i++ ) } #ifdef HAVE_CUDA template< typename ValueType, typename IndexType > __global__ void test_setElement_on_device_kernel( Array< ValueType, Devices::Cuda, IndexType >* a, Array< ValueType, Devices::Cuda, IndexType >* b ) { EXPECT_EQ( a->getElement( i ), i ); EXPECT_EQ( b->getElement( i ), i ); if( threadIdx.x < a->getSize() ) { a->setElement( threadIdx.x, threadIdx.x ); b->setElement( threadIdx.x, a->getElement( threadIdx.x ) ); } } #endif /* HAVE_CUDA */ template< typename Value, typename Index > void test_setElement_on_device( const Array< Value, Devices::Cuda, Index >& ) { #ifdef HAVE_CUDA using ArrayType = Array< Value, Devices::Cuda, Index >; ArrayType a( 10, 0 ), b( 10, 0 ); Pointers::DevicePointer< ArrayType > kernel_a( a ); Pointers::DevicePointer< ArrayType > kernel_b( b ); test_setElement_on_device_kernel<<< 1, 16 >>>( &kernel_a.template modifyData< Devices::Cuda >(), &kernel_b.template modifyData< Devices::Cuda >() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; for( int i = 0; i < 10; i++ ) { EXPECT_EQ( a.getElement( i ), i ); EXPECT_EQ( b.getElement( i ), i ); } #endif } TYPED_TEST( ArrayTest, setElement ) Loading @@ -433,7 +449,7 @@ TYPED_TEST( ArrayTest, setElement ) for( int i = 0; i < 10; i++ ) EXPECT_EQ( a.getElement( i ), i ); test_setElement< ArrayType >(); test_setElement_on_device( a ); } // test must be in a plain function because nvcc sucks (extended lambdas are Loading Loading
src/UnitTests/Containers/ArrayTest.h +33 −17 Original line number Diff line number Diff line Loading @@ -16,9 +16,6 @@ #include <TNL/Containers/Array.h> #include <TNL/Containers/Vector.h> #include <TNL/Pointers/DevicePointer.h> #include <TNL/Pointers/SharedPointer.h> #include <TNL/Pointers/SmartPointersRegister.h> #include <TNL/Algorithms/ParallelFor.h> #include "gtest/gtest.h" Loading Loading @@ -405,21 +402,40 @@ TYPED_TEST( ArrayTest, elementwiseAccess ) testArrayElementwiseAccess( ArrayType() ); } template< typename ArrayType > void test_setElement() template< typename Value, typename Index > void test_setElement_on_device( const Array< Value, Devices::Host, Index >& ) { Pointers::SharedPointer< ArrayType > a( 10, 0 ), b( 10, 0 ); auto set = [=] __cuda_callable__ ( int i ) mutable { a->setElement( i, i ); b->setElement( i, a->getElement( i ) ); }; Pointers::synchronizeSmartPointersOnDevice< typename ArrayType::DeviceType >(); Algorithms::ParallelFor< typename ArrayType::DeviceType >::exec( 0, 10, set ); for( int i = 0; i < 10; i++ ) } #ifdef HAVE_CUDA template< typename ValueType, typename IndexType > __global__ void test_setElement_on_device_kernel( Array< ValueType, Devices::Cuda, IndexType >* a, Array< ValueType, Devices::Cuda, IndexType >* b ) { EXPECT_EQ( a->getElement( i ), i ); EXPECT_EQ( b->getElement( i ), i ); if( threadIdx.x < a->getSize() ) { a->setElement( threadIdx.x, threadIdx.x ); b->setElement( threadIdx.x, a->getElement( threadIdx.x ) ); } } #endif /* HAVE_CUDA */ template< typename Value, typename Index > void test_setElement_on_device( const Array< Value, Devices::Cuda, Index >& ) { #ifdef HAVE_CUDA using ArrayType = Array< Value, Devices::Cuda, Index >; ArrayType a( 10, 0 ), b( 10, 0 ); Pointers::DevicePointer< ArrayType > kernel_a( a ); Pointers::DevicePointer< ArrayType > kernel_b( b ); test_setElement_on_device_kernel<<< 1, 16 >>>( &kernel_a.template modifyData< Devices::Cuda >(), &kernel_b.template modifyData< Devices::Cuda >() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; for( int i = 0; i < 10; i++ ) { EXPECT_EQ( a.getElement( i ), i ); EXPECT_EQ( b.getElement( i ), i ); } #endif } TYPED_TEST( ArrayTest, setElement ) Loading @@ -433,7 +449,7 @@ TYPED_TEST( ArrayTest, setElement ) for( int i = 0; i < 10; i++ ) EXPECT_EQ( a.getElement( i ), i ); test_setElement< ArrayType >(); test_setElement_on_device( a ); } // test must be in a plain function because nvcc sucks (extended lambdas are Loading