Commit e1f7fc0f authored by Jakub Klinkovský
Browse files

Merge branch 'JK/transposition' into 'develop'

Sparse matrix transposition

See merge request !142
parents 399fe034 4464a9dd
Loading
Loading
Loading
Loading
+22 −51
Original line number Diff line number Diff line
@@ -8,19 +8,13 @@

#pragma once

#ifdef HAVE_CUDA
   #include <cuda.h>
#endif
#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Atomic.h>

namespace TNL {
namespace Algorithms {

template< typename Device >
struct AtomicOperations
{};
struct AtomicOperations;

template<>
struct AtomicOperations< Devices::Host >
@@ -31,12 +25,17 @@ struct AtomicOperations< Devices::Host >
   TNL_NVCC_HD_WARNING_DISABLE
   template< typename Value >
   __cuda_callable__
   static void
   static Value
   add( Value& v, const Value& a )
   {
      #pragma omp atomic update
      Value old;
      #pragma omp atomic capture
      {
         old = v;
         v += a;
      }
      return old;
   }
};

template<>
@@ -48,10 +47,12 @@ struct AtomicOperations< Devices::Sequential >
   TNL_NVCC_HD_WARNING_DISABLE
   template< typename Value >
   __cuda_callable__
   static void
   static Value
   add( Value& v, const Value& a )
   {
      const Value old = v;
      v += a;
      return old;
   }
};

@@ -60,56 +61,26 @@ struct AtomicOperations< Devices::Cuda >
{
   template< typename Value >
   __cuda_callable__
   static void
   static Value
   add( Value& v, const Value& a )
   {
#ifdef HAVE_CUDA
      atomicAdd( &v, a );
#endif  // HAVE_CUDA
   }

#ifdef HAVE_CUDA
   __device__
   static void
   add( double& v, const double& a )
   {
   #if __CUDA_ARCH__ < 600
      unsigned long long int* v_as_ull = (unsigned long long int*) &v;
      unsigned long long int old = *v_as_ull, assumed;

      do {
         assumed = old;
         old = atomicCAS( v_as_ull, assumed, __double_as_longlong( a + __longlong_as_double( assumed ) ) );

         // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
      } while( assumed != old );
   #else   // __CUDA_ARCH__ < 600
      atomicAdd( &v, a );
   #endif  //__CUDA_ARCH__ < 600
   }
#else   // HAVE_CUDA
   static void
   add( double& v, const double& a )
   {}
#endif  // HAVE_CUDA

   __cuda_callable__
   static void
   add( long int& v, const long int& a )
   {
#ifdef HAVE_CUDA
      TNL_ASSERT_TRUE( false, "Atomic add for long int is not supported on CUDA." );
#endif  // HAVE_CUDA
      return atomicAdd( &v, a );
#else
      return 0;
#endif
   }

   __cuda_callable__
   static void
   static short int
   add( short int& v, const short int& a )
   {
#ifdef HAVE_CUDA
      TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." );
#endif  // HAVE_CUDA
#endif
      return 0;
   }
};

}  // namespace Algorithms
}  // namespace TNL
+25 −4
Original line number Diff line number Diff line
@@ -14,11 +14,12 @@
#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Cuda.h>

// double-precision atomicAdd function for Maxwell and older GPUs
// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
#ifdef HAVE_CUDA
   #if __CUDA_ARCH__ < 600
namespace {

// double-precision atomicAdd function for Maxwell and older GPUs
// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
   #if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ < 600
__device__
double
atomicAdd( double* address, double val )
@@ -35,8 +36,28 @@ atomicAdd( double* address, double val )

   return __longlong_as_double( old );
}
}  // namespace
   #endif

__device__
long int
atomicAdd( long int* address, long int val )
{
   unsigned long long int* address_as_unsigned = reinterpret_cast< unsigned long long int* >( address );
   long int old = *address;
   long int assumed;

   do {
      assumed = old;
      long int sum = val + assumed;
      old = atomicCAS( address_as_unsigned,
                       *reinterpret_cast< unsigned long long int* >( &assumed ),
                       *reinterpret_cast< unsigned long long int* >( &sum ) );
   } while( assumed != old );

   return old;
}

}  // namespace
#endif

namespace TNL {
+4 −3
Original line number Diff line number Diff line
@@ -1054,11 +1054,12 @@ public:
   void addMatrix( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
                   const RealType& matrixMultiplicator = 1.0,
                   const RealType& thisMatrixMultiplicator = 1.0 );
   */

   template< typename Real2, typename Index2 >
   void getTransposition( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
   void
   getTransposition( const SparseSandboxMatrix< Real2, Device, Index2, MatrixType >& matrix,
                     const RealType& matrixMultiplicator = 1.0 );
    */

   template< typename Vector1, typename Vector2 >
   bool
+59 −14
Original line number Diff line number Diff line
@@ -112,7 +112,7 @@ SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAlloca
                         this->getColumns(),
                         this->getValues().getConstView(),
                         this->columnIndexes.getConstView(),
                         this->segments.getConstView() );
                         const_cast< SparseSandboxMatrix* >( this )->rowPointers.getView() );
}

template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
@@ -479,7 +479,7 @@ template< typename Function >
void
SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::forAllRows( Function&& function ) const
{
   this->getConsView().forAllRows( function );
   this->getConstView().forAllRows( function );
}

template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
@@ -521,7 +521,8 @@ SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAlloca
   this->sequentialForRows( 0, this->getRows(), function );
}

/*template< typename Real,
/*
template< typename Real,
          template< typename, typename, typename > class Segments,
          typename Device,
          typename Index,
@@ -532,23 +533,67 @@ IndexAllocator2 > void SparseSandboxMatrix< Real, Device, Index, MatrixType, Rea
SparseSandboxMatrix< Real2, Segments2, Device, Index2, RealAllocator2, IndexAllocator2 >& matrix, const RealType&
matrixMultiplicator, const RealType& thisMatrixMultiplicator )
{

}
*/

template< typename Real,
          template< typename, typename, typename > class Segments,
          typename Device,
          typename Index,
          typename RealAllocator,
          typename IndexAllocator >
template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
template< typename Real2, typename Index2 >
void
SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
getTransposition( const SparseSandboxMatrix< Real2, Device, Index2 >& matrix,
SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::getTransposition(
   const SparseSandboxMatrix< Real2, Device, Index2, MatrixType >& matrix,
   const RealType& matrixMultiplicator )
{

}*/
   // set transposed dimensions
   setDimensions( matrix.getColumns(), matrix.getRows() );

   // stage 1: compute row capacities for the transposition
   RowsCapacitiesType capacities;
   capacities.resize( this->getRows(), 0 );
   auto capacities_view = capacities.getView();
   using MatrixRowView = typename SparseSandboxMatrix< Real2, Device, Index2, MatrixType >::ConstRowView;
   matrix.forAllRows(
      [ = ] __cuda_callable__( const MatrixRowView& row ) mutable
      {
         for( IndexType c = 0; c < row.getSize(); c++ ) {
            // row index of the transpose = column index of the input
            const IndexType& transRowIdx = row.getColumnIndex( c );
            if( transRowIdx < 0 )
               continue;
            // increment the capacity for the row in the transpose
            Algorithms::AtomicOperations< DeviceType >::add( capacities_view[ row.getColumnIndex( c ) ], IndexType( 1 ) );
         }
      } );

   // set the row capacities
   setRowCapacities( capacities );
   capacities.reset();

   // index of the first unwritten element per row
   RowsCapacitiesType offsets;
   offsets.resize( this->getRows(), 0 );
   auto offsets_view = offsets.getView();

   // stage 2: copy and transpose the data
   auto trans_view = getView();
   matrix.forAllRows(
      [ = ] __cuda_callable__( const MatrixRowView& row ) mutable
      {
         // row index of the input = column index of the transpose
         const IndexType& rowIdx = row.getRowIndex();
         for( IndexType c = 0; c < row.getSize(); c++ ) {
            // row index of the transpose = column index of the input
            const IndexType& transRowIdx = row.getColumnIndex( c );
            if( transRowIdx < 0 )
               continue;
            // local index in the row of the transpose
            const IndexType transLocalIdx =
               Algorithms::AtomicOperations< DeviceType >::add( offsets_view[ transRowIdx ], IndexType( 1 ) );
            // get the row in the transposed matrix and set the value
            auto transRow = trans_view.getRow( transRowIdx );
            transRow.setElement( transLocalIdx, rowIdx, row.getValue( c ) * matrixMultiplicator );
         }
      } );
}

template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
template< typename Vector1, typename Vector2 >
+1 −1
Original line number Diff line number Diff line
@@ -83,7 +83,7 @@ public:
   /**
    * \brief The type of matrix elements.
    */
   using RealType = Real;
   using RealType = std::remove_const_t< Real >;

   // using ComputeRealType = ComputeReal;

Loading