From f4144f9cef2d78557d845aa8937279cf778a8e21 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Sun, 2 Sep 2018 15:21:49 +0200
Subject: [PATCH] Replacing __device__ __host__ with __cuda_callable__.

---
 src/TNL/Matrices/BiEllpack.h               | 40 ++++---------
 src/TNL/Matrices/BiEllpackSymmetric.h      | 40 ++++---------
 src/TNL/Matrices/BiEllpackSymmetric_impl.h | 28 +++------
 src/TNL/Matrices/BiEllpack_impl.h          | 32 +++-------
 src/TNL/Matrices/EllpackSymmetric.h        | 34 +++--------
 src/TNL/Matrices/EllpackSymmetricGraph.h   | 70 ++++++----------------
 6 files changed, 64 insertions(+), 180 deletions(-)

diff --git a/src/TNL/Matrices/BiEllpack.h b/src/TNL/Matrices/BiEllpack.h
index 0ee7962fb6..8b0f54979e 100644
--- a/src/TNL/Matrices/BiEllpack.h
+++ b/src/TNL/Matrices/BiEllpack.h
@@ -66,9 +66,7 @@ public:
 					 const IndexType column,
 					 const RealType& value );
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	bool setElementFast( const IndexType row,
 						 const IndexType column,
 						 const RealType& value );
@@ -78,9 +76,7 @@ public:
 					 const RealType& value,
 					 const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	bool addElementFast( const IndexType row,
 						 const IndexType column,
 						 const RealType& value,
@@ -100,9 +96,7 @@ public:
 	RealType getElement( const IndexType row,
 					 	 const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	RealType getElementFast( const IndexType row,
 							 const IndexType column ) const;
 
@@ -111,9 +105,7 @@ public:
 			 	    IndexType* columns,
 			 	    RealType* values ) const;
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	IndexType getGroupLength( const IndexType strip,
 							  const IndexType group ) const;
 
@@ -129,9 +121,7 @@ public:
 
 	void setVirtualRows(const IndexType rows);
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	IndexType getNumberOfGroups( const IndexType row ) const;
 
 	bool vectorProductTest() const;
@@ -155,36 +145,26 @@ public:
 
 	template< typename InVector,
 			  typename OutVector >
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	void spmvCuda( const InVector& inVector,
 				   OutVector& outVector,
 				   /*const IndexType warpStart,
 				   const IndexType inWarpIdx*/
 				   int globalIdx ) const;
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	IndexType getStripLength( const IndexType strip ) const;
 
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	void performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
 										 const IndexType strip );
 
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	void computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
 									   const IndexType numberOfStrips,
 									   const IndexType strip );
 
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	IndexType power( const IndexType number,
 				     const IndexType exponent ) const;
 
diff --git a/src/TNL/Matrices/BiEllpackSymmetric.h b/src/TNL/Matrices/BiEllpackSymmetric.h
index 6de40d0e34..51d672ce13 100644
--- a/src/TNL/Matrices/BiEllpackSymmetric.h
+++ b/src/TNL/Matrices/BiEllpackSymmetric.h
@@ -56,9 +56,7 @@ public:
 					 const IndexType column,
 					 const RealType& value );
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	bool setElementFast( const IndexType row,
 						 const IndexType column,
 						 const RealType& value );
@@ -68,9 +66,7 @@ public:
 					 const RealType& value,
 					 const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	bool addElementFast( const IndexType row,
 						 const IndexType column,
 						 const RealType& value,
@@ -90,9 +86,7 @@ public:
 	RealType getElement( const IndexType row,
 					 	 const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	RealType getElementFast( const IndexType row,
 							 const IndexType column ) const;
 
@@ -100,9 +94,7 @@ public:
 			 	 IndexType* columns,
 			 	 RealType* values ) const;
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	IndexType getGroupLength( const IndexType strip,
 							  const IndexType group ) const;
 
@@ -118,9 +110,7 @@ public:
 
 	void setVirtualRows(const IndexType rows);
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	IndexType getNumberOfGroups( const IndexType row ) const;
 
 	bool vectorProductTest() const;
@@ -144,36 +134,26 @@ public:
 
 	template< typename InVector,
 			  typename OutVector >
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	void spmvCuda( const InVector& inVector,
 				   OutVector& outVector,
 				   /*const IndexType warpStart,
 				   const IndexType inWarpIdx*/
 				   int globalIdx ) const;
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	IndexType getStripLength( const IndexType strip ) const;
 
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	void performRowBubbleSortCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
 										 const IndexType strip );
 
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	void computeColumnSizesCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
 									   const IndexType numberOfStrips,
 									   const IndexType strip );
 
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	IndexType power( const IndexType number,
 				     const IndexType exponent ) const;
 
diff --git a/src/TNL/Matrices/BiEllpackSymmetric_impl.h b/src/TNL/Matrices/BiEllpackSymmetric_impl.h
index 374ad103d7..9a7f380eeb 100644
--- a/src/TNL/Matrices/BiEllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/BiEllpackSymmetric_impl.h
@@ -22,9 +22,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+   __cuda_callable__
 Index BiEllpackSymmetric< Real, Device, Index, StripSize >::power( const IndexType number,
                                                                    const IndexType exponent ) const
 {
@@ -127,9 +125,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getStripLength( const IndexType strip ) const
 {
     TNL_ASSERT( strip >= 0,
@@ -144,9 +140,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getNumberOfGroups( const IndexType row ) const
 {
     TNL_ASSERT( row >=0 && row < this->getRows(),
@@ -251,9 +245,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setElementFast( const IndexType row,
                                                                            const IndexType column,
                                                                            const RealType& value )
@@ -311,9 +303,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addElementFast( const IndexType row,
                                                                            const IndexType column,
                                                                            const RealType& value,
@@ -488,9 +478,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Real BiEllpackSymmetric< Real, Device, Index, StripSize >::getElementFast( const IndexType row,
                                                                            const IndexType column ) const
 {
@@ -584,9 +572,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getGroupLength( const Index strip,
                                                                             const Index group ) const
 {
diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index d62ad35fd6..80b182db1e 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -24,9 +24,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpack< Real, Device, Index, StripSize >::power( const IndexType number,
 							   const IndexType exponent ) const
 {
@@ -131,9 +129,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpack< Real, Device, Index, StripSize >::getStripLength( const IndexType strip ) const
 {
 	TNL_ASSERT( strip >= 0,
@@ -148,9 +144,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpack< Real, Device, Index, StripSize >::getNumberOfGroups( const IndexType row ) const
 {
 	TNL_ASSERT( row >=0 && row < this->getRows(),
@@ -256,9 +250,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 bool BiEllpack< Real, Device, Index, StripSize >::setElementFast( const IndexType row,
 																		   const IndexType column,
 																		   const RealType& value )
@@ -316,9 +308,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 bool BiEllpack< Real, Device, Index, StripSize >::addElementFast( const IndexType row,
 																	   	   const IndexType column,
 																	   	   const RealType& value,
@@ -492,9 +482,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Real BiEllpack< Real, Device, Index, StripSize >::getElementFast( const IndexType row,
 																	   	   const IndexType column ) const
 {
@@ -588,9 +576,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpack< Real, Device, Index, StripSize >::getGroupLength( const Index strip,
 																 	 	    const Index group ) const
 {
@@ -1321,7 +1307,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-__device__
+__cuda_callable__
 void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
 																						   const IndexType strip )
 {
@@ -1378,7 +1364,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-__device__
+__cuda_callable__
 void BiEllpack< Real, Device, Index, StripSize >::computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
 																						 const IndexType numberOfStrips,
 																						 const IndexType strip )
diff --git a/src/TNL/Matrices/EllpackSymmetric.h b/src/TNL/Matrices/EllpackSymmetric.h
index b8d15ed30f..4d76a78175 100644
--- a/src/TNL/Matrices/EllpackSymmetric.h
+++ b/src/TNL/Matrices/EllpackSymmetric.h
@@ -65,9 +65,7 @@ class EllpackSymmetric : public Sparse< Real, Device, Index >
    bool copyFrom( const Matrix& matrix,
                   const CompressedRowLengthsVector& rowLengths );*/
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -76,9 +74,7 @@ class EllpackSymmetric : public Sparse< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -90,9 +86,7 @@ class EllpackSymmetric : public Sparse< Real, Device, Index >
                     const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columnIndexes,
                     const RealType* values,
@@ -104,9 +98,7 @@ class EllpackSymmetric : public Sparse< Real, Device, Index >
                 const IndexType elements );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -119,18 +111,14 @@ class EllpackSymmetric : public Sparse< Real, Device, Index >
                 const IndexType numberOfElements,
                 const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
@@ -139,10 +127,8 @@ class EllpackSymmetric : public Sparse< Real, Device, Index >
                 IndexType* columns,
                 RealType* values ) const;
 
-template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   template< typename Vector >
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -183,9 +169,7 @@ template< typename Vector >
 
    template< typename InVector,
              typename OutVector >
-#ifdef HAVE_CUDA
-   __device__
-#endif
+   __cuda_callable__
    void spmvCuda( const InVector& inVector,
                   OutVector& outVector,
                   int rowIdx ) const;
diff --git a/src/TNL/Matrices/EllpackSymmetricGraph.h b/src/TNL/Matrices/EllpackSymmetricGraph.h
index 9ca2c93d6b..7b11b6b159 100644
--- a/src/TNL/Matrices/EllpackSymmetricGraph.h
+++ b/src/TNL/Matrices/EllpackSymmetricGraph.h
@@ -65,9 +65,7 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
    bool copyFrom( const Matrix& matrix,
                   const CompressedRowLengthsVector& rowLengths );*/
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -76,9 +74,7 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -90,9 +86,7 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
                     const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columnIndexes,
                     const RealType* values,
@@ -104,9 +98,7 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
                 const IndexType elements );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -119,18 +111,14 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
                 const IndexType numberOfElements,
                 const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
@@ -139,10 +127,8 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
                 IndexType* columns,
                 RealType* values ) const;
 
-template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   template< typename Vector >
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -159,21 +145,17 @@ template< typename Vector >
 #ifdef HAVE_CUDA
    template< typename InVector,
              typename OutVector >
-   __device__
+   __cuda_callable__
    void spmvCuda( const InVector& inVector,
                   OutVector& outVector,
                   const int globalIdx,
                   const int color ) const;
 #endif
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void computePermutationArray();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool rearrangeMatrix( bool verbose );
 
    bool save( File& file ) const;
@@ -190,39 +172,25 @@ template< typename Vector >
 
    void verifyPermutationArray();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getRowLengthsInt() const;
 
-#ifdef HAVE_CUDA
-    __device__ __host__
-#endif
-    Index getAlignedRows() const;
+   __cuda_callable__
+   Index getAlignedRows() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getRowsOfColor( IndexType color ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void copyFromHostToCuda( EllpackSymmetricGraph< Real, Devices::Host, Index >& matrix );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Containers::Vector< Index, Device, Index > getPermutationArray();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Containers::Vector< Index, Device, Index > getInversePermutation();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Containers::Vector< Index, Device, Index > getColorPointers();
 
    protected:
-- 
GitLab