From f4144f9cef2d78557d845aa8937279cf778a8e21 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz> Date: Sun, 2 Sep 2018 15:21:49 +0200 Subject: [PATCH] Replacing __device__ __host__ with __cuda_callable__. --- src/TNL/Matrices/BiEllpack.h | 40 ++++--------- src/TNL/Matrices/BiEllpackSymmetric.h | 40 ++++--------- src/TNL/Matrices/BiEllpackSymmetric_impl.h | 28 +++------ src/TNL/Matrices/BiEllpack_impl.h | 32 +++------- src/TNL/Matrices/EllpackSymmetric.h | 34 +++-------- src/TNL/Matrices/EllpackSymmetricGraph.h | 70 ++++++---------------- 6 files changed, 64 insertions(+), 180 deletions(-) diff --git a/src/TNL/Matrices/BiEllpack.h b/src/TNL/Matrices/BiEllpack.h index 0ee7962fb6..8b0f54979e 100644 --- a/src/TNL/Matrices/BiEllpack.h +++ b/src/TNL/Matrices/BiEllpack.h @@ -66,9 +66,7 @@ public: const IndexType column, const RealType& value ); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ bool setElementFast( const IndexType row, const IndexType column, const RealType& value ); @@ -78,9 +76,7 @@ public: const RealType& value, const RealType& thisElementMultiplicator = 1.0 ); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ bool addElementFast( const IndexType row, const IndexType column, const RealType& value, @@ -100,9 +96,7 @@ public: RealType getElement( const IndexType row, const IndexType column ) const; -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ RealType getElementFast( const IndexType row, const IndexType column ) const; @@ -111,9 +105,7 @@ public: IndexType* columns, RealType* values ) const; -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ IndexType getGroupLength( const IndexType strip, const IndexType group ) const; @@ -129,9 +121,7 @@ public: void setVirtualRows(const IndexType rows); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ IndexType getNumberOfGroups( const IndexType row ) const; bool vectorProductTest() const; @@ -155,36 +145,26 @@ public: template< typename InVector, typename OutVector > -#ifdef HAVE_CUDA - __device__ -#endif + __cuda_callable__ void spmvCuda( const InVector& inVector, OutVector& outVector, /*const IndexType warpStart, const IndexType inWarpIdx*/ int globalIdx ) const; -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ IndexType getStripLength( const IndexType strip ) const; -#ifdef HAVE_CUDA - __device__ -#endif + __cuda_callable__ void performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, const IndexType strip ); -#ifdef HAVE_CUDA - __device__ -#endif + __cuda_callable__ void computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, const IndexType numberOfStrips, const IndexType strip ); -#ifdef HAVE_CUDA - __device__ -#endif + __cuda_callable__ IndexType power( const IndexType number, const IndexType exponent ) const; diff --git a/src/TNL/Matrices/BiEllpackSymmetric.h b/src/TNL/Matrices/BiEllpackSymmetric.h index 6de40d0e34..51d672ce13 100644 --- a/src/TNL/Matrices/BiEllpackSymmetric.h +++ b/src/TNL/Matrices/BiEllpackSymmetric.h @@ -56,9 +56,7 @@ public: const IndexType column, const RealType& value ); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ bool setElementFast( const IndexType row, const IndexType column, const RealType& value ); @@ -68,9 +66,7 @@ public: const RealType& value, const RealType& thisElementMultiplicator = 1.0 ); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ bool addElementFast( const IndexType row, const IndexType column, const RealType& value, @@ -90,9 +86,7 @@ public: RealType getElement( const IndexType row, const IndexType column ) const; -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ RealType getElementFast( const IndexType row, const IndexType column ) const; @@ -100,9 +94,7 @@ public: IndexType* columns, RealType* values ) const; -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ IndexType getGroupLength( const IndexType strip, const IndexType group ) const; @@ -118,9 +110,7 @@ public: void setVirtualRows(const IndexType rows); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ IndexType getNumberOfGroups( const IndexType row ) const; bool vectorProductTest() const; @@ -144,36 +134,26 @@ public: template< typename InVector, typename OutVector > -#ifdef HAVE_CUDA - __device__ -#endif + __cuda_callable__ void spmvCuda( const InVector& inVector, OutVector& outVector, /*const IndexType warpStart, const IndexType inWarpIdx*/ int globalIdx ) const; -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ IndexType getStripLength( const IndexType strip ) const; -#ifdef HAVE_CUDA - __device__ -#endif + __cuda_callable__ void performRowBubbleSortCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, const IndexType strip ); -#ifdef HAVE_CUDA - __device__ -#endif + __cuda_callable__ void computeColumnSizesCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, const IndexType numberOfStrips, const IndexType strip ); -#ifdef HAVE_CUDA - __device__ -#endif + __cuda_callable__ IndexType power( const IndexType number, const IndexType exponent ) const; diff --git a/src/TNL/Matrices/BiEllpackSymmetric_impl.h b/src/TNL/Matrices/BiEllpackSymmetric_impl.h index 374ad103d7..9a7f380eeb 100644 --- a/src/TNL/Matrices/BiEllpackSymmetric_impl.h +++ b/src/TNL/Matrices/BiEllpackSymmetric_impl.h @@ -22,9 +22,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -#ifdef HAVE_CUDA -__device__ __host__ -#endif + __cuda_callable__ Index BiEllpackSymmetric< Real, Device, Index, StripSize >::power( const IndexType number, const IndexType exponent ) const { @@ -127,9 +125,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -#ifdef HAVE_CUDA -__device__ __host__ -#endif +__cuda_callable__ Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getStripLength( const IndexType strip ) const { TNL_ASSERT( strip >= 0, @@ -144,9 +140,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -#ifdef HAVE_CUDA -__device__ __host__ -#endif +__cuda_callable__ Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getNumberOfGroups( const IndexType row ) const { TNL_ASSERT( row >=0 && row < this->getRows(), @@ -251,9 +245,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -#ifdef HAVE_CUDA -__device__ __host__ -#endif +__cuda_callable__ bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setElementFast( const IndexType row, const IndexType column, const RealType& value ) @@ -311,9 +303,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -#ifdef HAVE_CUDA -__device__ __host__ -#endif +__cuda_callable__ bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addElementFast( const IndexType row, const IndexType column, const RealType& value, @@ -488,9 +478,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -#ifdef HAVE_CUDA -__device__ __host__ -#endif +__cuda_callable__ Real BiEllpackSymmetric< Real, Device, Index, StripSize >::getElementFast( const IndexType row, const IndexType column ) const { @@ -584,9 +572,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -#ifdef HAVE_CUDA -__device__ __host__ -#endif +__cuda_callable__ Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getGroupLength( const Index strip, const Index group ) const { diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h index d62ad35fd6..80b182db1e 100644 --- a/src/TNL/Matrices/BiEllpack_impl.h +++ b/src/TNL/Matrices/BiEllpack_impl.h @@ -24,9 +24,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -#ifdef HAVE_CUDA -__device__ __host__ -#endif +__cuda_callable__ Index BiEllpack< Real, Device, Index, StripSize >::power( const IndexType number, const IndexType exponent ) const { @@ -131,9 +129,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -#ifdef HAVE_CUDA -__device__ __host__ -#endif +__cuda_callable__ Index BiEllpack< Real, Device, Index, StripSize >::getStripLength( const IndexType strip ) const { TNL_ASSERT( strip >= 0, @@ -148,9 +144,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -#ifdef HAVE_CUDA -__device__ __host__ -#endif +__cuda_callable__ Index BiEllpack< Real, Device, Index, StripSize >::getNumberOfGroups( const IndexType row ) const { TNL_ASSERT( row >=0 && row < this->getRows(), @@ -256,9 +250,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -#ifdef HAVE_CUDA -__device__ __host__ -#endif +__cuda_callable__ bool BiEllpack< Real, Device, Index, StripSize >::setElementFast( const IndexType row, const IndexType column, const RealType& value ) @@ -316,9 +308,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -#ifdef HAVE_CUDA -__device__ __host__ -#endif +__cuda_callable__ bool BiEllpack< Real, Device, Index, StripSize >::addElementFast( const IndexType row, const IndexType column, const RealType& value, @@ -492,9 +482,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -#ifdef HAVE_CUDA -__device__ __host__ -#endif +__cuda_callable__ Real BiEllpack< Real, Device, Index, StripSize >::getElementFast( const IndexType row, const IndexType column ) const { @@ -588,9 +576,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -#ifdef HAVE_CUDA -__device__ __host__ -#endif +__cuda_callable__ Index BiEllpack< Real, Device, Index, StripSize >::getGroupLength( const Index strip, const Index group ) const { @@ -1321,7 +1307,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -__device__ +__cuda_callable__ void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, const IndexType strip ) { @@ -1378,7 +1364,7 @@ template< typename Real, typename Device, typename Index, int StripSize > -__device__ +__cuda_callable__ void BiEllpack< Real, Device, Index, StripSize >::computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, const IndexType numberOfStrips, const IndexType strip ) diff --git a/src/TNL/Matrices/EllpackSymmetric.h b/src/TNL/Matrices/EllpackSymmetric.h index b8d15ed30f..4d76a78175 100644 --- a/src/TNL/Matrices/EllpackSymmetric.h +++ b/src/TNL/Matrices/EllpackSymmetric.h @@ -65,9 +65,7 @@ class EllpackSymmetric : public Sparse< Real, Device, Index > bool copyFrom( const Matrix& matrix, const CompressedRowLengthsVector& rowLengths );*/ -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ bool setElementFast( const IndexType row, const IndexType column, const RealType& value ); @@ -76,9 +74,7 @@ class EllpackSymmetric : public Sparse< Real, Device, Index > const IndexType column, const RealType& value ); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ bool addElementFast( const IndexType row, const IndexType column, const RealType& value, @@ -90,9 +86,7 @@ class EllpackSymmetric : public Sparse< Real, Device, Index > const RealType& thisElementMultiplicator = 1.0 ); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ bool setRowFast( const IndexType row, const IndexType* columnIndexes, const RealType* values, @@ -104,9 +98,7 @@ class EllpackSymmetric : public Sparse< Real, Device, Index > const IndexType elements ); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ bool addRowFast( const IndexType row, const IndexType* columns, const RealType* values, @@ -119,18 +111,14 @@ class EllpackSymmetric : public Sparse< Real, Device, Index > const IndexType numberOfElements, const RealType& thisElementMultiplicator = 1.0 ); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ RealType getElementFast( const IndexType row, const IndexType column ) const; RealType getElement( const IndexType row, const IndexType column ) const; -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ void getRowFast( const IndexType row, IndexType* columns, RealType* values ) const; @@ -139,10 +127,8 @@ class EllpackSymmetric : public Sparse< Real, Device, Index > IndexType* columns, RealType* values ) const; -template< typename Vector > -#ifdef HAVE_CUDA - __device__ __host__ -#endif + template< typename Vector > + __cuda_callable__ typename Vector::RealType rowVectorProduct( const IndexType row, const Vector& vector ) const; @@ -183,9 +169,7 @@ template< typename Vector > template< typename InVector, typename OutVector > -#ifdef HAVE_CUDA - __device__ -#endif + __cuda_callable__ void spmvCuda( const InVector& inVector, OutVector& outVector, int rowIdx ) const; diff --git a/src/TNL/Matrices/EllpackSymmetricGraph.h b/src/TNL/Matrices/EllpackSymmetricGraph.h index 9ca2c93d6b..7b11b6b159 100644 --- a/src/TNL/Matrices/EllpackSymmetricGraph.h +++ b/src/TNL/Matrices/EllpackSymmetricGraph.h @@ -65,9 +65,7 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index > bool copyFrom( const Matrix& matrix, const CompressedRowLengthsVector& rowLengths );*/ -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ bool setElementFast( const IndexType row, const IndexType column, const RealType& value ); @@ -76,9 +74,7 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index > const IndexType column, const RealType& value ); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ bool addElementFast( const IndexType row, const IndexType column, const RealType& value, @@ -90,9 +86,7 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index > const RealType& thisElementMultiplicator = 1.0 ); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ bool setRowFast( const IndexType row, const IndexType* columnIndexes, const RealType* values, @@ -104,9 +98,7 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index > const IndexType elements ); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ bool addRowFast( const IndexType row, const IndexType* columns, const RealType* values, @@ -119,18 +111,14 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index > const IndexType numberOfElements, const RealType& thisElementMultiplicator = 1.0 ); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ RealType getElementFast( const IndexType row, const IndexType column ) const; RealType getElement( const IndexType row, const IndexType column ) const; -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ void getRowFast( const IndexType row, IndexType* columns, RealType* values ) const; @@ -139,10 +127,8 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index > IndexType* columns, RealType* values ) const; -template< typename Vector > -#ifdef HAVE_CUDA - __device__ __host__ -#endif + template< typename Vector > + __cuda_callable__ typename Vector::RealType rowVectorProduct( const IndexType row, const Vector& vector ) const; @@ -159,21 +145,17 @@ template< typename Vector > #ifdef HAVE_CUDA template< typename InVector, typename OutVector > - __device__ + __cuda_callable__ void spmvCuda( const InVector& inVector, OutVector& outVector, const int globalIdx, const int color ) const; #endif -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ void computePermutationArray(); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ bool rearrangeMatrix( bool verbose ); bool save( File& file ) const; @@ -190,39 +172,25 @@ template< typename Vector > void verifyPermutationArray(); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ Index getRowLengthsInt() const; -#ifdef HAVE_CUDA - __device__ __host__ -#endif - Index getAlignedRows() const; + __cuda_callable__ + Index getAlignedRows() const; -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ Index getRowsOfColor( IndexType color ) const; -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ void copyFromHostToCuda( EllpackSymmetricGraph< Real, Devices::Host, Index >& matrix ); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ Containers::Vector< Index, Device, Index > getPermutationArray(); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ Containers::Vector< Index, Device, Index > getInversePermutation(); -#ifdef HAVE_CUDA - __device__ __host__ -#endif + __cuda_callable__ Containers::Vector< Index, Device, Index > getColorPointers(); protected: -- GitLab