Commit a813195e authored by Jakub Klinkovský

Refactoring: added SparseOperations.h containing the copySparseMatrix function

parent 4b5548b1
Part of merge request !8 (Distributed linear solvers).
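For context: the new free function copySparseMatrix( A, B ) copies the contents of sparse matrix B into A, which may use a different storage format, provided both share the same RealType, DeviceType and IndexType (enforced by the static_asserts in the implementation below). The following is a minimal usage sketch, not part of the commit; the CSR and Ellpack matrix types, their headers, and the host-side setup calls are assumptions based on the TNL API of that period.

// Hypothetical example of converting a CSR matrix into an Ellpack matrix
// using the new copySparseMatrix free function.
#include <TNL/Matrices/CSR.h>              // assumed header for the CSR format
#include <TNL/Matrices/Ellpack.h>          // Ellpack.h is listed in the CMake hunk below
#include <TNL/Matrices/SparseOperations.h> // added by this commit

int main()
{
   using CSR     = TNL::Matrices::CSR< double, TNL::Devices::Host, int >;
   using Ellpack = TNL::Matrices::Ellpack< double, TNL::Devices::Host, int >;

   // build a small 3x3 diagonal matrix in the CSR format
   CSR B;
   B.setDimensions( 3, 3 );
   typename CSR::CompressedRowLengthsVector rowLengths;
   rowLengths.setSize( 3 );
   rowLengths.setValue( 1 );                // one non-zero per row
   B.setCompressedRowLengths( rowLengths );
   for( int i = 0; i < 3; i++ )
      B.setElement( i, i, 1.0 );

   // copy it into an Ellpack matrix with the same Real/Device/Index types;
   // copySparseMatrix sets the dimensions and row lengths of A itself
   Ellpack A;
   TNL::Matrices::copySparseMatrix( A, B );
}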
@@ -14,6 +14,8 @@ SET( headers AdEllpack.h
Multidiagonal_impl.h
Sparse.h
Sparse_impl.h
SparseOperations.h
SparseOperations_impl.h
Ellpack.h
Ellpack_impl.h
SlicedEllpack.h
@@ -14,7 +14,7 @@
#include <TNL/Matrices/SparseRow.h>
namespace TNL {
namespace Matrices {
template< typename Real,
typename Device,
@@ -62,14 +62,8 @@ class Sparse : public Matrix< Real, Device, Index >
Index maxRowLength;
};
// This cannot be a method of the Sparse class, because the implementation uses
// methods (marked with __cuda_callable__) which are defined only on the
// subclasses, but are not virtual methods of Sparse.
template< typename Matrix1, typename Matrix2 >
void copySparseMatrix( Matrix1& A, const Matrix2& B );
} // namespace Matrices
} // namespace TNL
#include <TNL/Matrices/Sparse_impl.h>
#include <TNL/Matrices/SparseOperations.h>
/***************************************************************************
SparseOperations.h - description
-------------------
begin : Oct 4, 2018
copyright : (C) 2018 by Tomas Oberhuber et al.
email : tomas.oberhuber@fjfi.cvut.cz
***************************************************************************/
/* See Copyright Notice in tnl/Copyright */
// Implemented by: Jakub Klinkovský
// Note that these functions cannot be methods of the Sparse class, because
// their implementation uses methods (marked with __cuda_callable__) which are
// defined only on the subclasses, but are not virtual methods of Sparse.
#pragma once
namespace TNL {
namespace Matrices {
template< typename Matrix1, typename Matrix2 >
void copySparseMatrix( Matrix1& A, const Matrix2& B );
} // namespace Matrices
} // namespace TNL
#include "SparseOperations_impl.h"
/***************************************************************************
SparseOperations_impl.h - description
-------------------
begin : Oct 4, 2018
copyright : (C) 2018 by Tomas Oberhuber et al.
email : tomas.oberhuber@fjfi.cvut.cz
***************************************************************************/
/* See Copyright Notice in tnl/Copyright */
// Implemented by: Jakub Klinkovský
#pragma once
#include <TNL/Pointers/DevicePointer.h>
namespace TNL {
namespace Matrices {
#ifdef HAVE_CUDA
template< typename Vector, typename Matrix >
__global__ void
SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
const Matrix* matrix,
typename Matrix::IndexType rows,
typename Matrix::IndexType cols )
{
using IndexType = typename Matrix::IndexType;
IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
const IndexType gridSize = blockDim.x * gridDim.x;
while( rowIdx < rows ) {
const auto max_length = matrix->getRowLengthFast( rowIdx );
const auto row = matrix->getRow( rowIdx );
IndexType length = 0;
for( IndexType c_j = 0; c_j < max_length; c_j++ )
if( row.getElementColumn( c_j ) < cols )
length++;
else
break;
rowLengths[ rowIdx ] = length;
rowIdx += gridSize;
}
}
template< typename Matrix1, typename Matrix2 >
__global__ void
SparseMatrixCopyKernel( Matrix1* A,
const Matrix2* B,
const typename Matrix2::IndexType* rowLengths,
typename Matrix2::IndexType rows )
{
using IndexType = typename Matrix2::IndexType;
IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
const IndexType gridSize = blockDim.x * gridDim.x;
while( rowIdx < rows ) {
const auto length = rowLengths[ rowIdx ];
const auto rowB = B->getRow( rowIdx );
auto rowA = A->getRow( rowIdx );
for( IndexType c = 0; c < length; c++ )
rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
rowIdx += gridSize;
}
}
#endif
template< typename Matrix1, typename Matrix2 >
void
copySparseMatrix( Matrix1& A, const Matrix2& B )
{
static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value,
"The matrices must have the same RealType." );
static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value,
"The matrices must be allocated on the same device." );
static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value,
"The matrices must have the same IndexType." );
using RealType = typename Matrix1::RealType;
using DeviceType = typename Matrix1::DeviceType;
using IndexType = typename Matrix1::IndexType;
const IndexType rows = B.getRows();
const IndexType cols = B.getColumns();
A.setDimensions( rows, cols );
if( std::is_same< DeviceType, Devices::Host >::value ) {
// set row lengths
typename Matrix1::CompressedRowLengthsVector rowLengths;
rowLengths.setSize( rows );
#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
for( IndexType i = 0; i < rows; i++ ) {
const auto max_length = B.getRowLength( i );
const auto row = B.getRow( i );
IndexType length = 0;
for( IndexType c_j = 0; c_j < max_length; c_j++ )
if( row.getElementColumn( c_j ) < cols )
length++;
else
break;
rowLengths[ i ] = length;
}
A.setCompressedRowLengths( rowLengths );
#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
for( IndexType i = 0; i < rows; i++ ) {
const auto length = rowLengths[ i ];
const auto rowB = B.getRow( i );
auto rowA = A.getRow( i );
for( IndexType c = 0; c < length; c++ )
rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
}
}
if( std::is_same< DeviceType, Devices::Cuda >::value ) {
#ifdef HAVE_CUDA
dim3 blockSize( 256 );
dim3 gridSize;
const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );
typename Matrix1::CompressedRowLengthsVector rowLengths;
rowLengths.setSize( rows );
Pointers::DevicePointer< Matrix1 > Apointer( A );
const Pointers::DevicePointer< const Matrix2 > Bpointer( B );
// set row lengths
Devices::Cuda::synchronizeDevice();
SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
rowLengths.getData(),
&Bpointer.template getData< TNL::Devices::Cuda >(),
rows,
cols );
TNL_CHECK_CUDA_DEVICE;
Apointer->setCompressedRowLengths( rowLengths );
// copy rows
Devices::Cuda::synchronizeDevice();
SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
&Apointer.template modifyData< TNL::Devices::Cuda >(),
&Bpointer.template getData< TNL::Devices::Cuda >(),
rowLengths.getData(),
rows );
TNL_CHECK_CUDA_DEVICE;
#else
throw Exceptions::CudaSupportMissing();
#endif
}
}
} // namespace Matrices
} // namespace TNL
......@@ -11,7 +11,6 @@
#pragma once
#include "Sparse.h"
#include <TNL/Pointers/DevicePointer.h>
namespace TNL {
namespace Matrices {
@@ -134,145 +133,5 @@ void Sparse< Real, Device, Index >::printStructure( std::ostream& str ) const
TNL_ASSERT_TRUE( false, "Not implemented yet." );
}
#ifdef HAVE_CUDA
template< typename Vector, typename Matrix >
__global__ void
SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
const Matrix* matrix,
typename Matrix::IndexType rows,
typename Matrix::IndexType cols )
{
using IndexType = typename Matrix::IndexType;
IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
const IndexType gridSize = blockDim.x * gridDim.x;
while( rowIdx < rows ) {
const auto max_length = matrix->getRowLengthFast( rowIdx );
const auto row = matrix->getRow( rowIdx );
IndexType length = 0;
for( IndexType c_j = 0; c_j < max_length; c_j++ )
if( row.getElementColumn( c_j ) < cols )
length++;
else
break;
rowLengths[ rowIdx ] = length;
rowIdx += gridSize;
}
}
template< typename Matrix1, typename Matrix2 >
__global__ void
SparseMatrixCopyKernel( Matrix1* A,
const Matrix2* B,
const typename Matrix2::IndexType* rowLengths,
typename Matrix2::IndexType rows )
{
using IndexType = typename Matrix2::IndexType;
IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
const IndexType gridSize = blockDim.x * gridDim.x;
while( rowIdx < rows ) {
const auto length = rowLengths[ rowIdx ];
const auto rowB = B->getRow( rowIdx );
auto rowA = A->getRow( rowIdx );
for( IndexType c = 0; c < length; c++ )
rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
rowIdx += gridSize;
}
}
#endif
template< typename Matrix1, typename Matrix2 >
void
copySparseMatrix( Matrix1& A, const Matrix2& B )
{
static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value,
"The matrices must have the same RealType." );
static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value,
"The matrices must be allocated on the same device." );
static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value,
"The matrices must have the same IndexType." );
using RealType = typename Matrix1::RealType;
using DeviceType = typename Matrix1::DeviceType;
using IndexType = typename Matrix1::IndexType;
const IndexType rows = B.getRows();
const IndexType cols = B.getColumns();
A.setDimensions( rows, cols );
if( std::is_same< DeviceType, Devices::Host >::value ) {
// set row lengths
typename Matrix1::CompressedRowLengthsVector rowLengths;
rowLengths.setSize( rows );
#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
for( IndexType i = 0; i < rows; i++ ) {
const auto max_length = B.getRowLength( i );
const auto row = B.getRow( i );
IndexType length = 0;
for( IndexType c_j = 0; c_j < max_length; c_j++ )
if( row.getElementColumn( c_j ) < cols )
length++;
else
break;
rowLengths[ i ] = length;
}
A.setCompressedRowLengths( rowLengths );
#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
for( IndexType i = 0; i < rows; i++ ) {
const auto length = rowLengths[ i ];
const auto rowB = B.getRow( i );
auto rowA = A.getRow( i );
for( IndexType c = 0; c < length; c++ )
rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
}
}
if( std::is_same< DeviceType, Devices::Cuda >::value ) {
#ifdef HAVE_CUDA
dim3 blockSize( 256 );
dim3 gridSize;
const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );
typename Matrix1::CompressedRowLengthsVector rowLengths;
rowLengths.setSize( rows );
Pointers::DevicePointer< Matrix1 > Apointer( A );
const Pointers::DevicePointer< const Matrix2 > Bpointer( B );
// set row lengths
Devices::Cuda::synchronizeDevice();
SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
rowLengths.getData(),
&Bpointer.template getData< TNL::Devices::Cuda >(),
rows,
cols );
TNL_CHECK_CUDA_DEVICE;
Apointer->setCompressedRowLengths( rowLengths );
// copy rows
Devices::Cuda::synchronizeDevice();
SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
&Apointer.template modifyData< TNL::Devices::Cuda >(),
&Bpointer.template getData< TNL::Devices::Cuda >(),
rowLengths.getData(),
rows );
TNL_CHECK_CUDA_DEVICE;
#else
throw Exceptions::CudaSupportMissing();
#endif
}
}
} // namespace Matrices
} // namespace TNL