Commit a813195e authored by Jakub Klinkovský

Refactoring: added SparseOperations.h containing the copySparseMatrix function

parent 4b5548b1
Part of merge request !8 (Distributed linear solvers).
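For context: the new free function copySparseMatrix( A, B ) copies the contents of sparse matrix B into A, which may use a different storage format, provided both share the same RealType, DeviceType and IndexType (enforced by the static_asserts in the implementation below). The following is a minimal usage sketch, not part of the commit; the CSR and Ellpack matrix types, their headers, and the host-side setup calls are assumptions based on the TNL API of that period.

// Hypothetical example of converting a CSR matrix into an Ellpack matrix
// using the new copySparseMatrix free function.
#include <TNL/Matrices/CSR.h>              // assumed header for the CSR format
#include <TNL/Matrices/Ellpack.h>          // Ellpack.h is listed in the CMake hunk below
#include <TNL/Matrices/SparseOperations.h> // added by this commit

int main()
{
   using CSR     = TNL::Matrices::CSR< double, TNL::Devices::Host, int >;
   using Ellpack = TNL::Matrices::Ellpack< double, TNL::Devices::Host, int >;

   // build a small 3x3 diagonal matrix in the CSR format
   CSR B;
   B.setDimensions( 3, 3 );
   typename CSR::CompressedRowLengthsVector rowLengths;
   rowLengths.setSize( 3 );
   rowLengths.setValue( 1 );                // one non-zero per row
   B.setCompressedRowLengths( rowLengths );
   for( int i = 0; i < 3; i++ )
      B.setElement( i, i, 1.0 );

   // copy it into an Ellpack matrix with the same Real/Device/Index types;
   // copySparseMatrix sets the dimensions and row lengths of A itself
   Ellpack A;
   TNL::Matrices::copySparseMatrix( A, B );
}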
@@ -14,6 +14,8 @@ SET( headers AdEllpack.h
Multidiagonal_impl.h
Sparse.h
Sparse_impl.h
SparseOperations.h
SparseOperations_impl.h
Ellpack.h
Ellpack_impl.h
SlicedEllpack.h
@@ -14,7 +14,7 @@
#include <TNL/Matrices/SparseRow.h>
namespace TNL {
namespace Matrices {
template< typename Real,
typename Device,
@@ -62,14 +62,8 @@ class Sparse : public Matrix< Real, Device, Index >
Index maxRowLength;
};
// This cannot be a method of the Sparse class, because the implementation uses
// methods (marked with __cuda_callable__) which are defined only on the
// subclasses, but are not virtual methods of Sparse.
template< typename Matrix1, typename Matrix2 >
void copySparseMatrix( Matrix1& A, const Matrix2& B );
} // namespace Matrices
} // namespace TNL
#include <TNL/Matrices/Sparse_impl.h>
#include <TNL/Matrices/SparseOperations.h>
/***************************************************************************
SparseOperations.h - description
-------------------
begin : Oct 4, 2018
copyright : (C) 2018 by Tomas Oberhuber et al.
email : tomas.oberhuber@fjfi.cvut.cz
***************************************************************************/
/* See Copyright Notice in tnl/Copyright */
// Implemented by: Jakub Klinkovský
// Note that these functions cannot be methods of the Sparse class, because
// their implementation uses methods (marked with __cuda_callable__) which are
// defined only on the subclasses, but are not virtual methods of Sparse.
#pragma once
namespace TNL {
namespace Matrices {
template< typename Matrix1, typename Matrix2 >
void copySparseMatrix( Matrix1& A, const Matrix2& B );
} // namespace Matrices
} // namespace TNL
#include "SparseOperations_impl.h"
/***************************************************************************
SparseOperations_impl.h - description
-------------------
begin : Oct 4, 2018
copyright : (C) 2018 by Tomas Oberhuber et al.
email : tomas.oberhuber@fjfi.cvut.cz
***************************************************************************/
/* See Copyright Notice in tnl/Copyright */
// Implemented by: Jakub Klinkovský
#pragma once
#include <TNL/Pointers/DevicePointer.h>
namespace TNL {
namespace Matrices {
#ifdef HAVE_CUDA
template< typename Vector, typename Matrix >
__global__ void
SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
const Matrix* matrix,
typename Matrix::IndexType rows,
typename Matrix::IndexType cols )
{
using IndexType = typename Matrix::IndexType;
IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
const IndexType gridSize = blockDim.x * gridDim.x;
while( rowIdx < rows ) {
const auto max_length = matrix->getRowLengthFast( rowIdx );
const auto row = matrix->getRow( rowIdx );
IndexType length = 0;
for( IndexType c_j = 0; c_j < max_length; c_j++ )
if( row.getElementColumn( c_j ) < cols )
length++;
else
break;
rowLengths[ rowIdx ] = length;
rowIdx += gridSize;
}
}
template< typename Matrix1, typename Matrix2 >
__global__ void
SparseMatrixCopyKernel( Matrix1* A,
const Matrix2* B,
const typename Matrix2::IndexType* rowLengths,
typename Matrix2::IndexType rows )
{
using IndexType = typename Matrix2::IndexType;
IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
const IndexType gridSize = blockDim.x * gridDim.x;
while( rowIdx < rows ) {
const auto length = rowLengths[ rowIdx ];
const auto rowB = B->getRow( rowIdx );
auto rowA = A->getRow( rowIdx );
for( IndexType c = 0; c < length; c++ )
rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
rowIdx += gridSize;
}
}
#endif
template< typename Matrix1, typename Matrix2 >
void
copySparseMatrix( Matrix1& A, const Matrix2& B )
{
static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value,
"The matrices must have the same RealType." );
static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value,
"The matrices must be allocated on the same device." );
static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value,
"The matrices must have the same IndexType." );
using RealType = typename Matrix1::RealType;
using DeviceType = typename Matrix1::DeviceType;
using IndexType = typename Matrix1::IndexType;
const IndexType rows = B.getRows();
const IndexType cols = B.getColumns();
A.setDimensions( rows, cols );
if( std::is_same< DeviceType, Devices::Host >::value ) {
// set row lengths
typename Matrix1::CompressedRowLengthsVector rowLengths;
rowLengths.setSize( rows );
#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
for( IndexType i = 0; i < rows; i++ ) {
const auto max_length = B.getRowLength( i );
const auto row = B.getRow( i );
IndexType length = 0;
for( IndexType c_j = 0; c_j < max_length; c_j++ )
if( row.getElementColumn( c_j ) < cols )
length++;
else
break;
rowLengths[ i ] = length;
}
A.setCompressedRowLengths( rowLengths );
#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
for( IndexType i = 0; i < rows; i++ ) {
const auto length = rowLengths[ i ];
const auto rowB = B.getRow( i );
auto rowA = A.getRow( i );
for( IndexType c = 0; c < length; c++ )
rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
}
}
if( std::is_same< DeviceType, Devices::Cuda >::value ) {
#ifdef HAVE_CUDA
dim3 blockSize( 256 );
dim3 gridSize;
const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );
typename Matrix1::CompressedRowLengthsVector rowLengths;
rowLengths.setSize( rows );
Pointers::DevicePointer< Matrix1 > Apointer( A );
const Pointers::DevicePointer< const Matrix2 > Bpointer( B );
// set row lengths
Devices::Cuda::synchronizeDevice();
SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
rowLengths.getData(),
&Bpointer.template getData< TNL::Devices::Cuda >(),
rows,
cols );
TNL_CHECK_CUDA_DEVICE;
Apointer->setCompressedRowLengths( rowLengths );
// copy rows
Devices::Cuda::synchronizeDevice();
SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
&Apointer.template modifyData< TNL::Devices::Cuda >(),
&Bpointer.template getData< TNL::Devices::Cuda >(),
rowLengths.getData(),
rows );
TNL_CHECK_CUDA_DEVICE;
#else
throw Exceptions::CudaSupportMissing();
#endif
}
}
} // namespace Matrices
} // namespace TNL
......@@ -11,7 +11,6 @@
#pragma once
#include "Sparse.h"
#include <TNL/Pointers/DevicePointer.h>
namespace TNL {
namespace Matrices {
@@ -134,145 +133,5 @@ void Sparse< Real, Device, Index >::printStructure( std::ostream& str ) const
TNL_ASSERT_TRUE( false, "Not implemented yet." );
}
#ifdef HAVE_CUDA
template< typename Vector, typename Matrix >
__global__ void
SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
const Matrix* matrix,
typename Matrix::IndexType rows,
typename Matrix::IndexType cols )
{
using IndexType = typename Matrix::IndexType;
IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
const IndexType gridSize = blockDim.x * gridDim.x;
while( rowIdx < rows ) {
const auto max_length = matrix->getRowLengthFast( rowIdx );
const auto row = matrix->getRow( rowIdx );
IndexType length = 0;
for( IndexType c_j = 0; c_j < max_length; c_j++ )
if( row.getElementColumn( c_j ) < cols )
length++;
else
break;
rowLengths[ rowIdx ] = length;
rowIdx += gridSize;
}
}
template< typename Matrix1, typename Matrix2 >
__global__ void
SparseMatrixCopyKernel( Matrix1* A,
const Matrix2* B,
const typename Matrix2::IndexType* rowLengths,
typename Matrix2::IndexType rows )
{
using IndexType = typename Matrix2::IndexType;
IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
const IndexType gridSize = blockDim.x * gridDim.x;
while( rowIdx < rows ) {
const auto length = rowLengths[ rowIdx ];
const auto rowB = B->getRow( rowIdx );
auto rowA = A->getRow( rowIdx );
for( IndexType c = 0; c < length; c++ )
rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
rowIdx += gridSize;
}
}
#endif
template< typename Matrix1, typename Matrix2 >
void
copySparseMatrix( Matrix1& A, const Matrix2& B )
{
static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value,
"The matrices must have the same RealType." );
static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value,
"The matrices must be allocated on the same device." );
static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value,
"The matrices must have the same IndexType." );
using RealType = typename Matrix1::RealType;
using DeviceType = typename Matrix1::DeviceType;
using IndexType = typename Matrix1::IndexType;
const IndexType rows = B.getRows();
const IndexType cols = B.getColumns();
A.setDimensions( rows, cols );
if( std::is_same< DeviceType, Devices::Host >::value ) {
// set row lengths
typename Matrix1::CompressedRowLengthsVector rowLengths;
rowLengths.setSize( rows );
#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
for( IndexType i = 0; i < rows; i++ ) {
const auto max_length = B.getRowLength( i );
const auto row = B.getRow( i );
IndexType length = 0;
for( IndexType c_j = 0; c_j < max_length; c_j++ )
if( row.getElementColumn( c_j ) < cols )
length++;
else
break;
rowLengths[ i ] = length;
}
A.setCompressedRowLengths( rowLengths );
#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
for( IndexType i = 0; i < rows; i++ ) {
const auto length = rowLengths[ i ];
const auto rowB = B.getRow( i );
auto rowA = A.getRow( i );
for( IndexType c = 0; c < length; c++ )
rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
}
}
if( std::is_same< DeviceType, Devices::Cuda >::value ) {
#ifdef HAVE_CUDA
dim3 blockSize( 256 );
dim3 gridSize;
const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );
typename Matrix1::CompressedRowLengthsVector rowLengths;
rowLengths.setSize( rows );
Pointers::DevicePointer< Matrix1 > Apointer( A );
const Pointers::DevicePointer< const Matrix2 > Bpointer( B );
// set row lengths
Devices::Cuda::synchronizeDevice();
SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
rowLengths.getData(),
&Bpointer.template getData< TNL::Devices::Cuda >(),
rows,
cols );
TNL_CHECK_CUDA_DEVICE;
Apointer->setCompressedRowLengths( rowLengths );
// copy rows
Devices::Cuda::synchronizeDevice();
SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
&Apointer.template modifyData< TNL::Devices::Cuda >(),
&Bpointer.template getData< TNL::Devices::Cuda >(),
rowLengths.getData(),
rows );
TNL_CHECK_CUDA_DEVICE;
#else
throw Exceptions::CudaSupportMissing();
#endif
}
}
} // namespace Matrices
} // namespace TNL