Loading src/TNL/Algorithms/AtomicOperations.h +22 −51 Original line number Diff line number Diff line Loading @@ -8,19 +8,13 @@ #pragma once #ifdef HAVE_CUDA #include <cuda.h> #endif #include <TNL/Devices/Sequential.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> #include <TNL/Atomic.h> namespace TNL { namespace Algorithms { template< typename Device > struct AtomicOperations {}; struct AtomicOperations; template<> struct AtomicOperations< Devices::Host > Loading @@ -31,12 +25,17 @@ struct AtomicOperations< Devices::Host > TNL_NVCC_HD_WARNING_DISABLE template< typename Value > __cuda_callable__ static void static Value add( Value& v, const Value& a ) { #pragma omp atomic update Value old; #pragma omp atomic capture { old = v; v += a; } return old; } }; template<> Loading @@ -48,10 +47,12 @@ struct AtomicOperations< Devices::Sequential > TNL_NVCC_HD_WARNING_DISABLE template< typename Value > __cuda_callable__ static void static Value add( Value& v, const Value& a ) { const Value old = v; v += a; return old; } }; Loading @@ -60,56 +61,26 @@ struct AtomicOperations< Devices::Cuda > { template< typename Value > __cuda_callable__ static void static Value add( Value& v, const Value& a ) { #ifdef HAVE_CUDA atomicAdd( &v, a ); #endif // HAVE_CUDA } #ifdef HAVE_CUDA __device__ static void add( double& v, const double& a ) { #if __CUDA_ARCH__ < 600 unsigned long long int* v_as_ull = (unsigned long long int*) &v; unsigned long long int old = *v_as_ull, assumed; do { assumed = old; old = atomicCAS( v_as_ull, assumed, __double_as_longlong( a + __longlong_as_double( assumed ) ) ); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while( assumed != old ); #else // __CUDA_ARCH__ < 600 atomicAdd( &v, a ); #endif //__CUDA_ARCH__ < 600 } #else // HAVE_CUDA static void add( double& v, const double& a ) {} #endif // HAVE_CUDA __cuda_callable__ static void add( long int& v, const long int& a ) { #ifdef HAVE_CUDA TNL_ASSERT_TRUE( false, "Atomic add for long int is not supported on CUDA." ); #endif // HAVE_CUDA return atomicAdd( &v, a ); #else return 0; #endif } __cuda_callable__ static void static short int add( short int& v, const short int& a ) { #ifdef HAVE_CUDA TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." ); #endif // HAVE_CUDA #endif return 0; } }; } // namespace Algorithms } // namespace TNL src/TNL/Atomic.h +25 −4 Original line number Diff line number Diff line Loading @@ -14,11 +14,12 @@ #include <TNL/Devices/Sequential.h> #include <TNL/Devices/Cuda.h> // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ < 600 __device__ double atomicAdd( double* address, double val ) Loading @@ -35,8 +36,28 @@ atomicAdd( double* address, double val ) return __longlong_as_double( old ); } } // namespace #endif __device__ long int atomicAdd( long int* address, long int val ) { unsigned long long int* address_as_unsigned = reinterpret_cast< unsigned long long int* >( address ); long int old = *address; long int assumed; do { assumed = old; long int sum = val + assumed; old = atomicCAS( address_as_unsigned, *reinterpret_cast< unsigned long long int* >( &assumed ), *reinterpret_cast< unsigned long long int* >( &sum ) ); } while( assumed != old ); return old; } } // namespace #endif namespace TNL { Loading src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h +4 −3 Original line number Diff line number Diff line Loading @@ -1054,11 +1054,12 @@ public: void addMatrix( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix, const RealType& matrixMultiplicator = 1.0, const RealType& thisMatrixMultiplicator = 1.0 ); */ template< typename Real2, typename Index2 > void getTransposition( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix, void getTransposition( const SparseSandboxMatrix< Real2, Device, Index2, MatrixType >& matrix, const RealType& matrixMultiplicator = 1.0 ); */ template< typename Vector1, typename Vector2 > bool Loading src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp +59 −14 Original line number Diff line number Diff line Loading @@ -112,7 +112,7 @@ SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAlloca this->getColumns(), this->getValues().getConstView(), this->columnIndexes.getConstView(), this->segments.getConstView() ); const_cast< SparseSandboxMatrix* >( this )->rowPointers.getView() ); } template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator > Loading Loading @@ -479,7 +479,7 @@ template< typename Function > void SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::forAllRows( Function&& function ) const { this->getConsView().forAllRows( function ); this->getConstView().forAllRows( function ); } template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator > Loading Loading @@ -521,7 +521,8 @@ SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAlloca this->sequentialForRows( 0, this->getRows(), function ); } /*template< typename Real, /* template< typename Real, template< typename, typename, typename > class Segments, typename Device, typename Index, Loading @@ -532,23 +533,67 @@ IndexAllocator2 > void SparseSandboxMatrix< Real, Device, Index, MatrixType, Rea SparseSandboxMatrix< Real2, Segments2, Device, Index2, RealAllocator2, IndexAllocator2 >& matrix, const RealType& matrixMultiplicator, const RealType& thisMatrixMultiplicator ) { } */ template< typename Real, template< typename, typename, typename > class Segments, typename Device, typename Index, typename RealAllocator, typename IndexAllocator > template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator > template< typename Real2, typename Index2 > void SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >:: getTransposition( const SparseSandboxMatrix< Real2, Device, Index2 >& matrix, SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::getTransposition( const SparseSandboxMatrix< Real2, Device, Index2, MatrixType >& matrix, const RealType& matrixMultiplicator ) { }*/ // set transposed dimensions setDimensions( matrix.getColumns(), matrix.getRows() ); // stage 1: compute row capacities for the transposition RowsCapacitiesType capacities; capacities.resize( this->getRows(), 0 ); auto capacities_view = capacities.getView(); using MatrixRowView = typename SparseSandboxMatrix< Real2, Device, Index2, MatrixType >::ConstRowView; matrix.forAllRows( [ = ] __cuda_callable__( const MatrixRowView& row ) mutable { for( IndexType c = 0; c < row.getSize(); c++ ) { // row index of the transpose = column index of the input const IndexType& transRowIdx = row.getColumnIndex( c ); if( transRowIdx < 0 ) continue; // increment the capacity for the row in the transpose Algorithms::AtomicOperations< DeviceType >::add( capacities_view[ row.getColumnIndex( c ) ], IndexType( 1 ) ); } } ); // set the row capacities setRowCapacities( capacities ); capacities.reset(); // index of the first unwritten element per row RowsCapacitiesType offsets; offsets.resize( this->getRows(), 0 ); auto offsets_view = offsets.getView(); // stage 2: copy and transpose the data auto trans_view = getView(); matrix.forAllRows( [ = ] __cuda_callable__( const MatrixRowView& row ) mutable { // row index of the input = column index of the transpose const IndexType& rowIdx = row.getRowIndex(); for( IndexType c = 0; c < row.getSize(); c++ ) { // row index of the transpose = column index of the input const IndexType& transRowIdx = row.getColumnIndex( c ); if( transRowIdx < 0 ) continue; // local index in the row of the transpose const IndexType transLocalIdx = Algorithms::AtomicOperations< DeviceType >::add( offsets_view[ transRowIdx ], IndexType( 1 ) ); // get the row in the transposed matrix and set the value auto transRow = trans_view.getRow( transRowIdx ); transRow.setElement( transLocalIdx, rowIdx, row.getValue( c ) * matrixMultiplicator ); } } ); } template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator > template< typename Vector1, typename Vector2 > Loading src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.h +1 −1 Original line number Diff line number Diff line Loading @@ -83,7 +83,7 @@ public: /** * \brief The type of matrix elements. */ using RealType = Real; using RealType = std::remove_const_t< Real >; // using ComputeRealType = ComputeReal; Loading Loading
src/TNL/Algorithms/AtomicOperations.h +22 −51 Original line number Diff line number Diff line Loading @@ -8,19 +8,13 @@ #pragma once #ifdef HAVE_CUDA #include <cuda.h> #endif #include <TNL/Devices/Sequential.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> #include <TNL/Atomic.h> namespace TNL { namespace Algorithms { template< typename Device > struct AtomicOperations {}; struct AtomicOperations; template<> struct AtomicOperations< Devices::Host > Loading @@ -31,12 +25,17 @@ struct AtomicOperations< Devices::Host > TNL_NVCC_HD_WARNING_DISABLE template< typename Value > __cuda_callable__ static void static Value add( Value& v, const Value& a ) { #pragma omp atomic update Value old; #pragma omp atomic capture { old = v; v += a; } return old; } }; template<> Loading @@ -48,10 +47,12 @@ struct AtomicOperations< Devices::Sequential > TNL_NVCC_HD_WARNING_DISABLE template< typename Value > __cuda_callable__ static void static Value add( Value& v, const Value& a ) { const Value old = v; v += a; return old; } }; Loading @@ -60,56 +61,26 @@ struct AtomicOperations< Devices::Cuda > { template< typename Value > __cuda_callable__ static void static Value add( Value& v, const Value& a ) { #ifdef HAVE_CUDA atomicAdd( &v, a ); #endif // HAVE_CUDA } #ifdef HAVE_CUDA __device__ static void add( double& v, const double& a ) { #if __CUDA_ARCH__ < 600 unsigned long long int* v_as_ull = (unsigned long long int*) &v; unsigned long long int old = *v_as_ull, assumed; do { assumed = old; old = atomicCAS( v_as_ull, assumed, __double_as_longlong( a + __longlong_as_double( assumed ) ) ); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while( assumed != old ); #else // __CUDA_ARCH__ < 600 atomicAdd( &v, a ); #endif //__CUDA_ARCH__ < 600 } #else // HAVE_CUDA static void add( double& v, const double& a ) {} #endif // HAVE_CUDA __cuda_callable__ static void add( long int& v, const long int& a ) { #ifdef HAVE_CUDA TNL_ASSERT_TRUE( false, "Atomic add for long int is not supported on CUDA." ); #endif // HAVE_CUDA return atomicAdd( &v, a ); #else return 0; #endif } __cuda_callable__ static void static short int add( short int& v, const short int& a ) { #ifdef HAVE_CUDA TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." ); #endif // HAVE_CUDA #endif return 0; } }; } // namespace Algorithms } // namespace TNL
src/TNL/Atomic.h +25 −4 Original line number Diff line number Diff line Loading @@ -14,11 +14,12 @@ #include <TNL/Devices/Sequential.h> #include <TNL/Devices/Cuda.h> // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ < 600 __device__ double atomicAdd( double* address, double val ) Loading @@ -35,8 +36,28 @@ atomicAdd( double* address, double val ) return __longlong_as_double( old ); } } // namespace #endif __device__ long int atomicAdd( long int* address, long int val ) { unsigned long long int* address_as_unsigned = reinterpret_cast< unsigned long long int* >( address ); long int old = *address; long int assumed; do { assumed = old; long int sum = val + assumed; old = atomicCAS( address_as_unsigned, *reinterpret_cast< unsigned long long int* >( &assumed ), *reinterpret_cast< unsigned long long int* >( &sum ) ); } while( assumed != old ); return old; } } // namespace #endif namespace TNL { Loading
src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h +4 −3 Original line number Diff line number Diff line Loading @@ -1054,11 +1054,12 @@ public: void addMatrix( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix, const RealType& matrixMultiplicator = 1.0, const RealType& thisMatrixMultiplicator = 1.0 ); */ template< typename Real2, typename Index2 > void getTransposition( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix, void getTransposition( const SparseSandboxMatrix< Real2, Device, Index2, MatrixType >& matrix, const RealType& matrixMultiplicator = 1.0 ); */ template< typename Vector1, typename Vector2 > bool Loading
src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp +59 −14 Original line number Diff line number Diff line Loading @@ -112,7 +112,7 @@ SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAlloca this->getColumns(), this->getValues().getConstView(), this->columnIndexes.getConstView(), this->segments.getConstView() ); const_cast< SparseSandboxMatrix* >( this )->rowPointers.getView() ); } template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator > Loading Loading @@ -479,7 +479,7 @@ template< typename Function > void SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::forAllRows( Function&& function ) const { this->getConsView().forAllRows( function ); this->getConstView().forAllRows( function ); } template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator > Loading Loading @@ -521,7 +521,8 @@ SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAlloca this->sequentialForRows( 0, this->getRows(), function ); } /*template< typename Real, /* template< typename Real, template< typename, typename, typename > class Segments, typename Device, typename Index, Loading @@ -532,23 +533,67 @@ IndexAllocator2 > void SparseSandboxMatrix< Real, Device, Index, MatrixType, Rea SparseSandboxMatrix< Real2, Segments2, Device, Index2, RealAllocator2, IndexAllocator2 >& matrix, const RealType& matrixMultiplicator, const RealType& thisMatrixMultiplicator ) { } */ template< typename Real, template< typename, typename, typename > class Segments, typename Device, typename Index, typename RealAllocator, typename IndexAllocator > template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator > template< typename Real2, typename Index2 > void SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >:: getTransposition( const SparseSandboxMatrix< Real2, Device, Index2 >& matrix, SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::getTransposition( const SparseSandboxMatrix< Real2, Device, Index2, MatrixType >& matrix, const RealType& matrixMultiplicator ) { }*/ // set transposed dimensions setDimensions( matrix.getColumns(), matrix.getRows() ); // stage 1: compute row capacities for the transposition RowsCapacitiesType capacities; capacities.resize( this->getRows(), 0 ); auto capacities_view = capacities.getView(); using MatrixRowView = typename SparseSandboxMatrix< Real2, Device, Index2, MatrixType >::ConstRowView; matrix.forAllRows( [ = ] __cuda_callable__( const MatrixRowView& row ) mutable { for( IndexType c = 0; c < row.getSize(); c++ ) { // row index of the transpose = column index of the input const IndexType& transRowIdx = row.getColumnIndex( c ); if( transRowIdx < 0 ) continue; // increment the capacity for the row in the transpose Algorithms::AtomicOperations< DeviceType >::add( capacities_view[ row.getColumnIndex( c ) ], IndexType( 1 ) ); } } ); // set the row capacities setRowCapacities( capacities ); capacities.reset(); // index of the first unwritten element per row RowsCapacitiesType offsets; offsets.resize( this->getRows(), 0 ); auto offsets_view = offsets.getView(); // stage 2: copy and transpose the data auto trans_view = getView(); matrix.forAllRows( [ = ] __cuda_callable__( const MatrixRowView& row ) mutable { // row index of the input = column index of the transpose const IndexType& rowIdx = row.getRowIndex(); for( IndexType c = 0; c < row.getSize(); c++ ) { // row index of the transpose = column index of the input const IndexType& transRowIdx = row.getColumnIndex( c ); if( transRowIdx < 0 ) continue; // local index in the row of the transpose const IndexType transLocalIdx = Algorithms::AtomicOperations< DeviceType >::add( offsets_view[ transRowIdx ], IndexType( 1 ) ); // get the row in the transposed matrix and set the value auto transRow = trans_view.getRow( transRowIdx ); transRow.setElement( transLocalIdx, rowIdx, row.getValue( c ) * matrixMultiplicator ); } } ); } template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator > template< typename Vector1, typename Vector2 > Loading
src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.h +1 −1 Original line number Diff line number Diff line Loading @@ -83,7 +83,7 @@ public: /** * \brief The type of matrix elements. */ using RealType = Real; using RealType = std::remove_const_t< Real >; // using ComputeRealType = ComputeReal; Loading