Commit a813195e authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Refactoring: added SparseOperations.h containing the copySparseMatrix function

parent 4b5548b1
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -14,6 +14,8 @@ SET( headers AdEllpack.h
             Multidiagonal_impl.h
             Sparse.h
             Sparse_impl.h
             SparseOperations.h
             SparseOperations_impl.h
             Ellpack.h
             Ellpack_impl.h
             SlicedEllpack.h
+2 −8
Original line number Diff line number Diff line
@@ -62,14 +62,8 @@ class Sparse : public Matrix< Real, Device, Index >
   Index maxRowLength;
};


// This cannot be a method of the Sparse class, because the implementation uses
// methods (marked with __cuda_callable__) which are defined only on the
// subclasses, but are not virtual methods of Sparse.
template< typename Matrix1, typename Matrix2 >
void copySparseMatrix( Matrix1& A, const Matrix2& B );

} // namespace Matrices
} // namespace TNL

#include <TNL/Matrices/Sparse_impl.h>
#include <TNL/Matrices/SparseOperations.h>
+28 −0
Original line number Diff line number Diff line
/***************************************************************************
                          SparseOperations.h  -  description
                             -------------------
    begin                : Oct 4, 2018
    copyright            : (C) 2018 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovský

// Note that these functions cannot be methods of the Sparse class, because
// their implementation uses methods (marked with __cuda_callable__) which are
// defined only on the subclasses, but are not virtual methods of Sparse.

#pragma once

namespace TNL {
namespace Matrices {

template< typename Matrix1, typename Matrix2 >
void copySparseMatrix( Matrix1& A, const Matrix2& B );

} // namespace Matrices
} // namespace TNL

#include "SparseOperations_impl.h"
+161 −0
Original line number Diff line number Diff line
/***************************************************************************
                          SparseOperations_impl.h  -  description
                             -------------------
    begin                : Oct 4, 2018
    copyright            : (C) 2018 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovský

#pragma once

#include <TNL/Pointers/DevicePointer.h>

namespace TNL {
namespace Matrices {

#ifdef HAVE_CUDA
template< typename Vector, typename Matrix >
__global__ void
SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
                                       const Matrix* matrix,
                                       typename Matrix::IndexType rows,
                                       typename Matrix::IndexType cols )
{
   using IndexType = typename Matrix::IndexType;

   IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   const IndexType gridSize = blockDim.x * gridDim.x;

   while( rowIdx < rows ) {
      const auto max_length = matrix->getRowLengthFast( rowIdx );
      const auto row = matrix->getRow( rowIdx );
      IndexType length = 0;
      for( IndexType c_j = 0; c_j < max_length; c_j++ )
         if( row.getElementColumn( c_j ) < cols )
            length++;
         else
            break;
      rowLengths[ rowIdx ] = length;
      rowIdx += gridSize;
   }
}

template< typename Matrix1, typename Matrix2 >
__global__ void
SparseMatrixCopyKernel( Matrix1* A,
                        const Matrix2* B,
                        const typename Matrix2::IndexType* rowLengths,
                        typename Matrix2::IndexType rows )
{
   using IndexType = typename Matrix2::IndexType;

   IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   const IndexType gridSize = blockDim.x * gridDim.x;

   while( rowIdx < rows ) {
      const auto length = rowLengths[ rowIdx ];
      const auto rowB = B->getRow( rowIdx );
      auto rowA = A->getRow( rowIdx );
      for( IndexType c = 0; c < length; c++ )
         rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
      rowIdx += gridSize;
   }
}
#endif


template< typename Matrix1, typename Matrix2 >
void
copySparseMatrix( Matrix1& A, const Matrix2& B )
{
   static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value,
                  "The matrices must have the same RealType." );
   static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value,
                  "The matrices must be allocated on the same device." );
   static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value,
                  "The matrices must have the same IndexType." );

   using RealType = typename Matrix1::RealType;
   using DeviceType = typename Matrix1::DeviceType;
   using IndexType = typename Matrix1::IndexType;

   const IndexType rows = B.getRows();
   const IndexType cols = B.getColumns();

   A.setDimensions( rows, cols );

   if( std::is_same< DeviceType, Devices::Host >::value ) {
      // set row lengths
      typename Matrix1::CompressedRowLengthsVector rowLengths;
      rowLengths.setSize( rows );
#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
      for( IndexType i = 0; i < rows; i++ ) {
         const auto max_length = B.getRowLength( i );
         const auto row = B.getRow( i );
         IndexType length = 0;
         for( IndexType c_j = 0; c_j < max_length; c_j++ )
            if( row.getElementColumn( c_j ) < cols )
               length++;
            else
               break;
         rowLengths[ i ] = length;
      }
      A.setCompressedRowLengths( rowLengths );

#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
      for( IndexType i = 0; i < rows; i++ ) {
         const auto length = rowLengths[ i ];
         const auto rowB = B.getRow( i );
         auto rowA = A.getRow( i );
         for( IndexType c = 0; c < length; c++ )
            rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
      }
   }

   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
#ifdef HAVE_CUDA
      dim3 blockSize( 256 );
      dim3 gridSize;
      const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
      gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );

      typename Matrix1::CompressedRowLengthsVector rowLengths;
      rowLengths.setSize( rows );

      Pointers::DevicePointer< Matrix1 > Apointer( A );
      const Pointers::DevicePointer< const Matrix2 > Bpointer( B );

      // set row lengths
      Devices::Cuda::synchronizeDevice();
      SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
            rowLengths.getData(),
            &Bpointer.template getData< TNL::Devices::Cuda >(),
            rows,
            cols );
      TNL_CHECK_CUDA_DEVICE;
      Apointer->setCompressedRowLengths( rowLengths );

      // copy rows
      Devices::Cuda::synchronizeDevice();
      SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
            &Apointer.template modifyData< TNL::Devices::Cuda >(),
            &Bpointer.template getData< TNL::Devices::Cuda >(),
            rowLengths.getData(),
            rows );
      TNL_CHECK_CUDA_DEVICE;
#else
      throw Exceptions::CudaSupportMissing();
#endif
   }
}

} // namespace Matrices
} // namespace TNL
+0 −141
Original line number Diff line number Diff line
@@ -11,7 +11,6 @@
#pragma once

#include "Sparse.h"
#include <TNL/Pointers/DevicePointer.h>

namespace TNL {
namespace Matrices {
@@ -134,145 +133,5 @@ void Sparse< Real, Device, Index >::printStructure( std::ostream& str ) const
   TNL_ASSERT_TRUE( false, "Not implemented yet." );
}


#ifdef HAVE_CUDA
template< typename Vector, typename Matrix >
__global__ void
SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
                                       const Matrix* matrix,
                                       typename Matrix::IndexType rows,
                                       typename Matrix::IndexType cols )
{
   using IndexType = typename Matrix::IndexType;

   IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   const IndexType gridSize = blockDim.x * gridDim.x;

   while( rowIdx < rows ) {
      const auto max_length = matrix->getRowLengthFast( rowIdx );
      const auto row = matrix->getRow( rowIdx );
      IndexType length = 0;
      for( IndexType c_j = 0; c_j < max_length; c_j++ )
         if( row.getElementColumn( c_j ) < cols )
            length++;
         else
            break;
      rowLengths[ rowIdx ] = length;
      rowIdx += gridSize;
   }
}

template< typename Matrix1, typename Matrix2 >
__global__ void
SparseMatrixCopyKernel( Matrix1* A,
                        const Matrix2* B,
                        const typename Matrix2::IndexType* rowLengths,
                        typename Matrix2::IndexType rows )
{
   using IndexType = typename Matrix2::IndexType;

   IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   const IndexType gridSize = blockDim.x * gridDim.x;

   while( rowIdx < rows ) {
      const auto length = rowLengths[ rowIdx ];
      const auto rowB = B->getRow( rowIdx );
      auto rowA = A->getRow( rowIdx );
      for( IndexType c = 0; c < length; c++ )
         rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
      rowIdx += gridSize;
   }
}
#endif

template< typename Matrix1, typename Matrix2 >
void
copySparseMatrix( Matrix1& A, const Matrix2& B )
{
   static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value,
                  "The matrices must have the same RealType." );
   static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value,
                  "The matrices must be allocated on the same device." );
   static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value,
                  "The matrices must have the same IndexType." );

   using RealType = typename Matrix1::RealType;
   using DeviceType = typename Matrix1::DeviceType;
   using IndexType = typename Matrix1::IndexType;

   const IndexType rows = B.getRows();
   const IndexType cols = B.getColumns();

   A.setDimensions( rows, cols );

   if( std::is_same< DeviceType, Devices::Host >::value ) {
      // set row lengths
      typename Matrix1::CompressedRowLengthsVector rowLengths;
      rowLengths.setSize( rows );
#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
      for( IndexType i = 0; i < rows; i++ ) {
         const auto max_length = B.getRowLength( i );
         const auto row = B.getRow( i );
         IndexType length = 0;
         for( IndexType c_j = 0; c_j < max_length; c_j++ )
            if( row.getElementColumn( c_j ) < cols )
               length++;
            else
               break;
         rowLengths[ i ] = length;
      }
      A.setCompressedRowLengths( rowLengths );

#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
      for( IndexType i = 0; i < rows; i++ ) {
         const auto length = rowLengths[ i ];
         const auto rowB = B.getRow( i );
         auto rowA = A.getRow( i );
         for( IndexType c = 0; c < length; c++ )
            rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
      }
   }

   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
#ifdef HAVE_CUDA
      dim3 blockSize( 256 );
      dim3 gridSize;
      const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
      gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );

      typename Matrix1::CompressedRowLengthsVector rowLengths;
      rowLengths.setSize( rows );

      Pointers::DevicePointer< Matrix1 > Apointer( A );
      const Pointers::DevicePointer< const Matrix2 > Bpointer( B );

      // set row lengths
      Devices::Cuda::synchronizeDevice();
      SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
            rowLengths.getData(),
            &Bpointer.template getData< TNL::Devices::Cuda >(),
            rows,
            cols );
      TNL_CHECK_CUDA_DEVICE;
      Apointer->setCompressedRowLengths( rowLengths );

      // copy rows
      Devices::Cuda::synchronizeDevice();
      SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
            &Apointer.template modifyData< TNL::Devices::Cuda >(),
            &Bpointer.template getData< TNL::Devices::Cuda >(),
            rowLengths.getData(),
            rows );
      TNL_CHECK_CUDA_DEVICE;
#else
      throw Exceptions::CudaSupportMissing();
#endif
   }
}

} // namespace Matrices
} // namespace TNL