Loading src/TNL/Matrices/Matrix.h +1 −8 Original line number Diff line number Diff line Loading @@ -92,18 +92,11 @@ public: [[deprecated]] void computeColorsVector(Containers::Vector<Index, Device, Index> &colorsVector); // TODO: copy should be done in the operator= and it should work the other way too void copyFromHostToCuda( Matrices::Matrix< Real, Devices::Host, Index >& matrix ); // TODO: missing implementation! //__cuda_callable__ //Index getValuesSize() const; protected: IndexType rows, columns; // TODO: remove1 // TODO: remove IndexType numberOfColors; ValuesVectorType values; Loading src/TNL/Matrices/Matrix.hpp +0 −15 Original line number Diff line number Diff line Loading @@ -250,21 +250,6 @@ computeColorsVector(Containers::Vector<Index, Device, Index> &colorsVector) } } template< typename Real, typename Device, typename Index, typename RealAllocator > void Matrix< Real, Device, Index, RealAllocator >:: copyFromHostToCuda( Matrix< Real, Devices::Host, Index >& matrix ) { this->numberOfColors = matrix.getNumberOfColors(); this->columns = matrix.getColumns(); this->rows = matrix.getRows(); this->values.setSize( matrix.getValuesSize() ); } #ifdef HAVE_CUDA template< typename Matrix, typename InVector, Loading src/TNL/Matrices/MatrixView.h +0 −35 Original line number Diff line number Diff line Loading @@ -49,14 +49,6 @@ public: __cuda_callable__ MatrixView( const MatrixView& view ) = default; virtual IndexType getRowLength( const IndexType row ) const = 0; // TODO: implementation is not parallel // TODO: it would be nice if padding zeros could be stripped void getCompressedRowLengths( CompressedRowLengthsVector& rowLengths ) const; virtual void getCompressedRowLengths( CompressedRowLengthsVectorView rowLengths ) const; IndexType getAllocatedElementsCount() const; virtual IndexType getNumberOfNonzeroMatrixElements() const; Loading @@ -67,24 +59,6 @@ public: __cuda_callable__ IndexType getColumns() const; /**** * TODO: The fast variants of the following methods cannot be virtual. * If they were, they could not be used in the CUDA kernels. If CUDA allows it * in the future and it does not slow down, declare them as virtual here. */ virtual void setElement( const IndexType row, const IndexType column, const RealType& value ) = 0; virtual void addElement( const IndexType row, const IndexType column, const RealType& value, const RealType& thisElementMultiplicator = 1.0 ) = 0; virtual Real getElement( const IndexType row, const IndexType column ) const = 0; __cuda_callable__ const ValuesView& getValues() const; Loading Loading @@ -135,15 +109,6 @@ std::ostream& operator << ( std::ostream& str, const MatrixView< Real, Device, I return str; } /* template< typename Matrix, typename InVector, typename OutVector > void MatrixVectorProductCuda( const Matrix& matrix, const InVector& inVector, OutVector& outVector ); */ } // namespace Matrices } // namespace TNL Loading src/TNL/Matrices/MatrixView.hpp +0 −74 Original line number Diff line number Diff line Loading @@ -42,29 +42,6 @@ MatrixView( const IndexType rows_, { } template< typename Real, typename Device, typename Index > void MatrixView< Real, Device, Index >:: getCompressedRowLengths( CompressedRowLengthsVector& rowLengths ) const { rowLengths.setSize( this->getRows() ); getCompressedRowLengths( rowLengths.getView() ); } template< typename Real, typename Device, typename Index > void MatrixView< Real, Device, Index >:: getCompressedRowLengths( CompressedRowLengthsVectorView rowLengths ) const { TNL_ASSERT_EQ( rowLengths.getSize(), this->getRows(), "invalid size of the rowLengths vector" ); for( IndexType row = 0; row < this->getRows(); row++ ) rowLengths.setElement( row, this->getRowLength( row ) ); } template< typename Real, typename Device, typename Index > Loading Loading @@ -244,56 +221,5 @@ computeColorsVector(Containers::Vector<Index, Device, Index> &colorsVector) } } /* #ifdef HAVE_CUDA template< typename Matrix, typename InVector, typename OutVector > __global__ void MatrixVectorProductCudaKernel( const Matrix* matrix, const InVector* inVector, OutVector* outVector, int gridIdx ) { static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx < matrix->getRows() ) ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); } #endif template< typename Matrix, typename InVector, typename OutVector > void MatrixVectorProductCuda( const Matrix& matrix, const InVector& inVector, OutVector& outVector ) { #ifdef HAVE_CUDA typedef typename Matrix::IndexType IndexType; Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); OutVector* kernel_outVector = Cuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); MatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); TNL_CHECK_CUDA_DEVICE; } Cuda::freeFromDevice( kernel_this ); Cuda::freeFromDevice( kernel_inVector ); Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; #endif } */ } // namespace Matrices } // namespace TNL Loading
src/TNL/Matrices/Matrix.h +1 −8 Original line number Diff line number Diff line Loading @@ -92,18 +92,11 @@ public: [[deprecated]] void computeColorsVector(Containers::Vector<Index, Device, Index> &colorsVector); // TODO: copy should be done in the operator= and it should work the other way too void copyFromHostToCuda( Matrices::Matrix< Real, Devices::Host, Index >& matrix ); // TODO: missing implementation! //__cuda_callable__ //Index getValuesSize() const; protected: IndexType rows, columns; // TODO: remove1 // TODO: remove IndexType numberOfColors; ValuesVectorType values; Loading
src/TNL/Matrices/Matrix.hpp +0 −15 Original line number Diff line number Diff line Loading @@ -250,21 +250,6 @@ computeColorsVector(Containers::Vector<Index, Device, Index> &colorsVector) } } template< typename Real, typename Device, typename Index, typename RealAllocator > void Matrix< Real, Device, Index, RealAllocator >:: copyFromHostToCuda( Matrix< Real, Devices::Host, Index >& matrix ) { this->numberOfColors = matrix.getNumberOfColors(); this->columns = matrix.getColumns(); this->rows = matrix.getRows(); this->values.setSize( matrix.getValuesSize() ); } #ifdef HAVE_CUDA template< typename Matrix, typename InVector, Loading
src/TNL/Matrices/MatrixView.h +0 −35 Original line number Diff line number Diff line Loading @@ -49,14 +49,6 @@ public: __cuda_callable__ MatrixView( const MatrixView& view ) = default; virtual IndexType getRowLength( const IndexType row ) const = 0; // TODO: implementation is not parallel // TODO: it would be nice if padding zeros could be stripped void getCompressedRowLengths( CompressedRowLengthsVector& rowLengths ) const; virtual void getCompressedRowLengths( CompressedRowLengthsVectorView rowLengths ) const; IndexType getAllocatedElementsCount() const; virtual IndexType getNumberOfNonzeroMatrixElements() const; Loading @@ -67,24 +59,6 @@ public: __cuda_callable__ IndexType getColumns() const; /**** * TODO: The fast variants of the following methods cannot be virtual. * If they were, they could not be used in the CUDA kernels. If CUDA allows it * in the future and it does not slow down, declare them as virtual here. */ virtual void setElement( const IndexType row, const IndexType column, const RealType& value ) = 0; virtual void addElement( const IndexType row, const IndexType column, const RealType& value, const RealType& thisElementMultiplicator = 1.0 ) = 0; virtual Real getElement( const IndexType row, const IndexType column ) const = 0; __cuda_callable__ const ValuesView& getValues() const; Loading Loading @@ -135,15 +109,6 @@ std::ostream& operator << ( std::ostream& str, const MatrixView< Real, Device, I return str; } /* template< typename Matrix, typename InVector, typename OutVector > void MatrixVectorProductCuda( const Matrix& matrix, const InVector& inVector, OutVector& outVector ); */ } // namespace Matrices } // namespace TNL Loading
src/TNL/Matrices/MatrixView.hpp +0 −74 Original line number Diff line number Diff line Loading @@ -42,29 +42,6 @@ MatrixView( const IndexType rows_, { } template< typename Real, typename Device, typename Index > void MatrixView< Real, Device, Index >:: getCompressedRowLengths( CompressedRowLengthsVector& rowLengths ) const { rowLengths.setSize( this->getRows() ); getCompressedRowLengths( rowLengths.getView() ); } template< typename Real, typename Device, typename Index > void MatrixView< Real, Device, Index >:: getCompressedRowLengths( CompressedRowLengthsVectorView rowLengths ) const { TNL_ASSERT_EQ( rowLengths.getSize(), this->getRows(), "invalid size of the rowLengths vector" ); for( IndexType row = 0; row < this->getRows(); row++ ) rowLengths.setElement( row, this->getRowLength( row ) ); } template< typename Real, typename Device, typename Index > Loading Loading @@ -244,56 +221,5 @@ computeColorsVector(Containers::Vector<Index, Device, Index> &colorsVector) } } /* #ifdef HAVE_CUDA template< typename Matrix, typename InVector, typename OutVector > __global__ void MatrixVectorProductCudaKernel( const Matrix* matrix, const InVector* inVector, OutVector* outVector, int gridIdx ) { static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx < matrix->getRows() ) ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); } #endif template< typename Matrix, typename InVector, typename OutVector > void MatrixVectorProductCuda( const Matrix& matrix, const InVector& inVector, OutVector& outVector ) { #ifdef HAVE_CUDA typedef typename Matrix::IndexType IndexType; Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); OutVector* kernel_outVector = Cuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); MatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); TNL_CHECK_CUDA_DEVICE; } Cuda::freeFromDevice( kernel_this ); Cuda::freeFromDevice( kernel_inVector ); Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; #endif } */ } // namespace Matrices } // namespace TNL