Loading src/Benchmarks/DistSpMV/ordering.h +0 −112 Original line number Diff line number Diff line #pragma once #include <algorithm> #include <TNL/Devices/Host.h> #include <TNL/ParallelFor.h> using namespace TNL; template< typename Matrix, typename PermutationVector > void getTrivialOrdering( const Matrix& matrix, PermutationVector& perm, PermutationVector& iperm ) Loading @@ -26,108 +19,3 @@ getTrivialOrdering( const Matrix& matrix, PermutationVector& perm, PermutationVe } } template< typename Vector, typename PermutationVector > void reorderVector( const Vector& src, Vector& dest, const PermutationVector& perm ) { TNL_ASSERT_EQ( src.getSize(), perm.getSize(), "Source vector and permutation must have the same size." ); using RealType = typename Vector::RealType; using DeviceType = typename Vector::DeviceType; using IndexType = typename Vector::IndexType; auto kernel = [] __cuda_callable__ ( IndexType i, const RealType* src, RealType* dest, const typename PermutationVector::RealType* perm ) { dest[ i ] = src[ perm[ i ] ]; }; dest.setLike( src ); ParallelFor< DeviceType >::exec( (IndexType) 0, src.getSize(), kernel, src.getData(), dest.getData(), perm.getData() ); } template< typename Matrix, typename PermutationVector > void reorderMatrix( const Matrix& matrix1, Matrix& matrix2, const PermutationVector& _perm, const PermutationVector& _iperm ) { // TODO: implement on GPU static_assert( std::is_same< typename Matrix::DeviceType, Devices::Host >::value, "matrix reordering is implemented only for host" ); static_assert( std::is_same< typename PermutationVector::DeviceType, Devices::Host >::value, "matrix reordering is implemented only for host" ); using namespace TNL; using IndexType = typename Matrix::IndexType; matrix2.setLike( matrix1 ); // general multidimensional accessors for permutation indices // TODO: this depends on the specific layout of dofs, general reordering of NDArray is needed auto perm = [&]( IndexType dof ) { TNL_ASSERT_LT( dof, matrix1.getRows(), "invalid dof index" ); const IndexType i = dof / _perm.getSize(); return i * _perm.getSize() + _perm[ dof % _perm.getSize() ]; }; auto iperm = [&]( IndexType dof ) { TNL_ASSERT_LT( dof, matrix1.getRows(), "invalid dof index" ); const IndexType i = dof / _iperm.getSize(); return i * _iperm.getSize() + _iperm[ dof % _iperm.getSize() ]; }; // set row lengths typename Matrix::CompressedRowLengthsVector rowLengths; rowLengths.setSize( matrix1.getRows() ); for( IndexType i = 0; i < matrix1.getRows(); i++ ) { const IndexType maxLength = matrix1.getRowLength( perm( i ) ); const auto row = matrix1.getRow( perm( i ) ); IndexType length = 0; for( IndexType j = 0; j < maxLength; j++ ) if( row.getElementColumn( j ) < matrix1.getColumns() ) length++; rowLengths[ i ] = length; } matrix2.setCompressedRowLengths( rowLengths ); // set row elements for( IndexType i = 0; i < matrix2.getRows(); i++ ) { const IndexType rowLength = rowLengths[ i ]; // extract sparse row const auto row1 = matrix1.getRow( perm( i ) ); // permute typename Matrix::IndexType columns[ rowLength ]; typename Matrix::RealType values[ rowLength ]; for( IndexType j = 0; j < rowLength; j++ ) { columns[ j ] = iperm( row1.getElementColumn( j ) ); values[ j ] = row1.getElementValue( j ); } // sort IndexType indices[ rowLength ]; for( IndexType j = 0; j < rowLength; j++ ) indices[ j ] = j; // nvcc does not allow lambdas to capture VLAs, even in host code (WTF!?) // error: a variable captured by a lambda cannot have a type involving a variable-length array IndexType* _columns = columns; auto comparator = [=]( IndexType a, IndexType b ) { return _columns[ a ] < _columns[ b ]; }; std::sort( indices, indices + rowLength, comparator ); typename Matrix::IndexType sortedColumns[ rowLength ]; typename Matrix::RealType sortedValues[ rowLength ]; for( IndexType j = 0; j < rowLength; j++ ) { sortedColumns[ j ] = columns[ indices[ j ] ]; sortedValues[ j ] = values[ indices[ j ] ]; } matrix2.setRow( i, sortedColumns, sortedValues, rowLength ); } } src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h +1 −1 Original line number Diff line number Diff line Loading @@ -190,7 +190,7 @@ struct SpmvBenchmark PermutationVector perm, iperm; getTrivialOrdering( matrix, perm, iperm ); MatrixType matrix_perm; reorderMatrix( matrix, matrix_perm, perm, iperm ); Matrices::reorderSparseMatrix( matrix, matrix_perm, perm, iperm ); if( CommunicatorType::isDistributed() ) runDistributed( benchmark, metadata, parameters, matrix_perm, vector ); else Loading src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h +3 −3 Original line number Diff line number Diff line Loading @@ -361,9 +361,9 @@ struct LinearSolversBenchmark getTrivialOrdering( *matrixPointer, perm, iperm ); SharedPointer< MatrixType > matrix_perm; VectorType x0_perm, b_perm; reorderMatrix( *matrixPointer, *matrix_perm, perm, iperm ); reorderVector( x0, x0_perm, perm ); reorderVector( b, b_perm, perm ); Matrices::reorderSparseMatrix( *matrixPointer, *matrix_perm, perm, iperm ); Matrices::reorderVector( x0, x0_perm, perm ); Matrices::reorderVector( b, b_perm, perm ); if( CommunicatorType::isDistributed() ) runDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm ); else Loading src/TNL/Matrices/SparseOperations.h +14 −0 Original line number Diff line number Diff line Loading @@ -31,6 +31,20 @@ copyAdjacencyStructure( const Matrix& A, AdjacencyMatrix& B, bool has_symmetric_pattern = false, bool ignore_diagonal = true ); // Applies a permutation to the rows of a sparse matrix and its inverse // permutation to the columns of the matrix, i.e. A_perm = P*A*P^{-1}, where // P is the permutation matrix represented by the perm vector and P^{-1} is the // inverse permutation represented by the iperm vector. template< typename Matrix1, typename Matrix2, typename PermutationVector > void reorderSparseMatrix( const Matrix1& A, Matrix2& A_perm, const PermutationVector& perm, const PermutationVector& iperm ); // TODO: the method does not belong here, but there is no better place... template< typename Vector, typename PermutationVector > void reorderVector( const Vector& src, Vector& dest, const PermutationVector& perm ); } // namespace Matrices } // namespace TNL Loading src/TNL/Matrices/SparseOperations_impl.h +96 −0 Original line number Diff line number Diff line Loading @@ -14,8 +14,10 @@ #include <type_traits> #include <stdexcept> #include <algorithm> #include <TNL/Pointers/DevicePointer.h> #include <TNL/ParallelFor.h> namespace TNL { namespace Matrices { Loading Loading @@ -260,5 +262,99 @@ copyAdjacencyStructure( const Matrix& A, AdjacencyMatrix& B, } } template< typename Matrix1, typename Matrix2, typename PermutationVector > void reorderSparseMatrix( const Matrix1& matrix1, Matrix2& matrix2, const PermutationVector& perm, const PermutationVector& iperm ) { // TODO: implement on GPU static_assert( std::is_same< typename Matrix1::DeviceType, Devices::Host >::value, "matrix reordering is implemented only for host" ); static_assert( std::is_same< typename Matrix2::DeviceType, Devices::Host >::value, "matrix reordering is implemented only for host" ); static_assert( std::is_same< typename PermutationVector::DeviceType, Devices::Host >::value, "matrix reordering is implemented only for host" ); using IndexType = typename Matrix1::IndexType; matrix2.setDimensions( matrix1.getRows(), matrix1.getColumns() ); // set row lengths typename Matrix2::CompressedRowLengthsVector rowLengths; rowLengths.setSize( matrix1.getRows() ); for( IndexType i = 0; i < matrix1.getRows(); i++ ) { const IndexType maxLength = matrix1.getRowLength( perm[ i ] ); const auto row = matrix1.getRow( perm[ i ] ); IndexType length = 0; for( IndexType j = 0; j < maxLength; j++ ) if( row.getElementColumn( j ) < matrix1.getColumns() ) length++; rowLengths[ i ] = length; } matrix2.setCompressedRowLengths( rowLengths ); // set row elements for( IndexType i = 0; i < matrix2.getRows(); i++ ) { const IndexType rowLength = rowLengths[ i ]; // extract sparse row const auto row1 = matrix1.getRow( perm[ i ] ); // permute typename Matrix2::IndexType columns[ rowLength ]; typename Matrix2::RealType values[ rowLength ]; for( IndexType j = 0; j < rowLength; j++ ) { columns[ j ] = iperm[ row1.getElementColumn( j ) ]; values[ j ] = row1.getElementValue( j ); } // sort IndexType indices[ rowLength ]; for( IndexType j = 0; j < rowLength; j++ ) indices[ j ] = j; // nvcc does not allow lambdas to capture VLAs, even in host code (WTF!?) // error: a variable captured by a lambda cannot have a type involving a variable-length array IndexType* _columns = columns; auto comparator = [=]( IndexType a, IndexType b ) { return _columns[ a ] < _columns[ b ]; }; std::sort( indices, indices + rowLength, comparator ); typename Matrix2::IndexType sortedColumns[ rowLength ]; typename Matrix2::RealType sortedValues[ rowLength ]; for( IndexType j = 0; j < rowLength; j++ ) { sortedColumns[ j ] = columns[ indices[ j ] ]; sortedValues[ j ] = values[ indices[ j ] ]; } matrix2.setRow( i, sortedColumns, sortedValues, rowLength ); } } template< typename Vector, typename PermutationVector > void reorderVector( const Vector& src, Vector& dest, const PermutationVector& perm ) { TNL_ASSERT_EQ( src.getSize(), perm.getSize(), "Source vector and permutation must have the same size." ); using RealType = typename Vector::RealType; using DeviceType = typename Vector::DeviceType; using IndexType = typename Vector::IndexType; auto kernel = [] __cuda_callable__ ( IndexType i, const RealType* src, RealType* dest, const typename PermutationVector::RealType* perm ) { dest[ i ] = src[ perm[ i ] ]; }; dest.setLike( src ); ParallelFor< DeviceType >::exec( (IndexType) 0, src.getSize(), kernel, src.getData(), dest.getData(), perm.getData() ); } } // namespace Matrices } // namespace TNL Loading
src/Benchmarks/DistSpMV/ordering.h +0 −112 Original line number Diff line number Diff line #pragma once #include <algorithm> #include <TNL/Devices/Host.h> #include <TNL/ParallelFor.h> using namespace TNL; template< typename Matrix, typename PermutationVector > void getTrivialOrdering( const Matrix& matrix, PermutationVector& perm, PermutationVector& iperm ) Loading @@ -26,108 +19,3 @@ getTrivialOrdering( const Matrix& matrix, PermutationVector& perm, PermutationVe } } template< typename Vector, typename PermutationVector > void reorderVector( const Vector& src, Vector& dest, const PermutationVector& perm ) { TNL_ASSERT_EQ( src.getSize(), perm.getSize(), "Source vector and permutation must have the same size." ); using RealType = typename Vector::RealType; using DeviceType = typename Vector::DeviceType; using IndexType = typename Vector::IndexType; auto kernel = [] __cuda_callable__ ( IndexType i, const RealType* src, RealType* dest, const typename PermutationVector::RealType* perm ) { dest[ i ] = src[ perm[ i ] ]; }; dest.setLike( src ); ParallelFor< DeviceType >::exec( (IndexType) 0, src.getSize(), kernel, src.getData(), dest.getData(), perm.getData() ); } template< typename Matrix, typename PermutationVector > void reorderMatrix( const Matrix& matrix1, Matrix& matrix2, const PermutationVector& _perm, const PermutationVector& _iperm ) { // TODO: implement on GPU static_assert( std::is_same< typename Matrix::DeviceType, Devices::Host >::value, "matrix reordering is implemented only for host" ); static_assert( std::is_same< typename PermutationVector::DeviceType, Devices::Host >::value, "matrix reordering is implemented only for host" ); using namespace TNL; using IndexType = typename Matrix::IndexType; matrix2.setLike( matrix1 ); // general multidimensional accessors for permutation indices // TODO: this depends on the specific layout of dofs, general reordering of NDArray is needed auto perm = [&]( IndexType dof ) { TNL_ASSERT_LT( dof, matrix1.getRows(), "invalid dof index" ); const IndexType i = dof / _perm.getSize(); return i * _perm.getSize() + _perm[ dof % _perm.getSize() ]; }; auto iperm = [&]( IndexType dof ) { TNL_ASSERT_LT( dof, matrix1.getRows(), "invalid dof index" ); const IndexType i = dof / _iperm.getSize(); return i * _iperm.getSize() + _iperm[ dof % _iperm.getSize() ]; }; // set row lengths typename Matrix::CompressedRowLengthsVector rowLengths; rowLengths.setSize( matrix1.getRows() ); for( IndexType i = 0; i < matrix1.getRows(); i++ ) { const IndexType maxLength = matrix1.getRowLength( perm( i ) ); const auto row = matrix1.getRow( perm( i ) ); IndexType length = 0; for( IndexType j = 0; j < maxLength; j++ ) if( row.getElementColumn( j ) < matrix1.getColumns() ) length++; rowLengths[ i ] = length; } matrix2.setCompressedRowLengths( rowLengths ); // set row elements for( IndexType i = 0; i < matrix2.getRows(); i++ ) { const IndexType rowLength = rowLengths[ i ]; // extract sparse row const auto row1 = matrix1.getRow( perm( i ) ); // permute typename Matrix::IndexType columns[ rowLength ]; typename Matrix::RealType values[ rowLength ]; for( IndexType j = 0; j < rowLength; j++ ) { columns[ j ] = iperm( row1.getElementColumn( j ) ); values[ j ] = row1.getElementValue( j ); } // sort IndexType indices[ rowLength ]; for( IndexType j = 0; j < rowLength; j++ ) indices[ j ] = j; // nvcc does not allow lambdas to capture VLAs, even in host code (WTF!?) // error: a variable captured by a lambda cannot have a type involving a variable-length array IndexType* _columns = columns; auto comparator = [=]( IndexType a, IndexType b ) { return _columns[ a ] < _columns[ b ]; }; std::sort( indices, indices + rowLength, comparator ); typename Matrix::IndexType sortedColumns[ rowLength ]; typename Matrix::RealType sortedValues[ rowLength ]; for( IndexType j = 0; j < rowLength; j++ ) { sortedColumns[ j ] = columns[ indices[ j ] ]; sortedValues[ j ] = values[ indices[ j ] ]; } matrix2.setRow( i, sortedColumns, sortedValues, rowLength ); } }
src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h +1 −1 Original line number Diff line number Diff line Loading @@ -190,7 +190,7 @@ struct SpmvBenchmark PermutationVector perm, iperm; getTrivialOrdering( matrix, perm, iperm ); MatrixType matrix_perm; reorderMatrix( matrix, matrix_perm, perm, iperm ); Matrices::reorderSparseMatrix( matrix, matrix_perm, perm, iperm ); if( CommunicatorType::isDistributed() ) runDistributed( benchmark, metadata, parameters, matrix_perm, vector ); else Loading
src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h +3 −3 Original line number Diff line number Diff line Loading @@ -361,9 +361,9 @@ struct LinearSolversBenchmark getTrivialOrdering( *matrixPointer, perm, iperm ); SharedPointer< MatrixType > matrix_perm; VectorType x0_perm, b_perm; reorderMatrix( *matrixPointer, *matrix_perm, perm, iperm ); reorderVector( x0, x0_perm, perm ); reorderVector( b, b_perm, perm ); Matrices::reorderSparseMatrix( *matrixPointer, *matrix_perm, perm, iperm ); Matrices::reorderVector( x0, x0_perm, perm ); Matrices::reorderVector( b, b_perm, perm ); if( CommunicatorType::isDistributed() ) runDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm ); else Loading
src/TNL/Matrices/SparseOperations.h +14 −0 Original line number Diff line number Diff line Loading @@ -31,6 +31,20 @@ copyAdjacencyStructure( const Matrix& A, AdjacencyMatrix& B, bool has_symmetric_pattern = false, bool ignore_diagonal = true ); // Applies a permutation to the rows of a sparse matrix and its inverse // permutation to the columns of the matrix, i.e. A_perm = P*A*P^{-1}, where // P is the permutation matrix represented by the perm vector and P^{-1} is the // inverse permutation represented by the iperm vector. template< typename Matrix1, typename Matrix2, typename PermutationVector > void reorderSparseMatrix( const Matrix1& A, Matrix2& A_perm, const PermutationVector& perm, const PermutationVector& iperm ); // TODO: the method does not belong here, but there is no better place... template< typename Vector, typename PermutationVector > void reorderVector( const Vector& src, Vector& dest, const PermutationVector& perm ); } // namespace Matrices } // namespace TNL Loading
src/TNL/Matrices/SparseOperations_impl.h +96 −0 Original line number Diff line number Diff line Loading @@ -14,8 +14,10 @@ #include <type_traits> #include <stdexcept> #include <algorithm> #include <TNL/Pointers/DevicePointer.h> #include <TNL/ParallelFor.h> namespace TNL { namespace Matrices { Loading Loading @@ -260,5 +262,99 @@ copyAdjacencyStructure( const Matrix& A, AdjacencyMatrix& B, } } template< typename Matrix1, typename Matrix2, typename PermutationVector > void reorderSparseMatrix( const Matrix1& matrix1, Matrix2& matrix2, const PermutationVector& perm, const PermutationVector& iperm ) { // TODO: implement on GPU static_assert( std::is_same< typename Matrix1::DeviceType, Devices::Host >::value, "matrix reordering is implemented only for host" ); static_assert( std::is_same< typename Matrix2::DeviceType, Devices::Host >::value, "matrix reordering is implemented only for host" ); static_assert( std::is_same< typename PermutationVector::DeviceType, Devices::Host >::value, "matrix reordering is implemented only for host" ); using IndexType = typename Matrix1::IndexType; matrix2.setDimensions( matrix1.getRows(), matrix1.getColumns() ); // set row lengths typename Matrix2::CompressedRowLengthsVector rowLengths; rowLengths.setSize( matrix1.getRows() ); for( IndexType i = 0; i < matrix1.getRows(); i++ ) { const IndexType maxLength = matrix1.getRowLength( perm[ i ] ); const auto row = matrix1.getRow( perm[ i ] ); IndexType length = 0; for( IndexType j = 0; j < maxLength; j++ ) if( row.getElementColumn( j ) < matrix1.getColumns() ) length++; rowLengths[ i ] = length; } matrix2.setCompressedRowLengths( rowLengths ); // set row elements for( IndexType i = 0; i < matrix2.getRows(); i++ ) { const IndexType rowLength = rowLengths[ i ]; // extract sparse row const auto row1 = matrix1.getRow( perm[ i ] ); // permute typename Matrix2::IndexType columns[ rowLength ]; typename Matrix2::RealType values[ rowLength ]; for( IndexType j = 0; j < rowLength; j++ ) { columns[ j ] = iperm[ row1.getElementColumn( j ) ]; values[ j ] = row1.getElementValue( j ); } // sort IndexType indices[ rowLength ]; for( IndexType j = 0; j < rowLength; j++ ) indices[ j ] = j; // nvcc does not allow lambdas to capture VLAs, even in host code (WTF!?) // error: a variable captured by a lambda cannot have a type involving a variable-length array IndexType* _columns = columns; auto comparator = [=]( IndexType a, IndexType b ) { return _columns[ a ] < _columns[ b ]; }; std::sort( indices, indices + rowLength, comparator ); typename Matrix2::IndexType sortedColumns[ rowLength ]; typename Matrix2::RealType sortedValues[ rowLength ]; for( IndexType j = 0; j < rowLength; j++ ) { sortedColumns[ j ] = columns[ indices[ j ] ]; sortedValues[ j ] = values[ indices[ j ] ]; } matrix2.setRow( i, sortedColumns, sortedValues, rowLength ); } } template< typename Vector, typename PermutationVector > void reorderVector( const Vector& src, Vector& dest, const PermutationVector& perm ) { TNL_ASSERT_EQ( src.getSize(), perm.getSize(), "Source vector and permutation must have the same size." ); using RealType = typename Vector::RealType; using DeviceType = typename Vector::DeviceType; using IndexType = typename Vector::IndexType; auto kernel = [] __cuda_callable__ ( IndexType i, const RealType* src, RealType* dest, const typename PermutationVector::RealType* perm ) { dest[ i ] = src[ perm[ i ] ]; }; dest.setLike( src ); ParallelFor< DeviceType >::exec( (IndexType) 0, src.getSize(), kernel, src.getData(), dest.getData(), perm.getData() ); } } // namespace Matrices } // namespace TNL