diff --git a/src/TNL/Matrices/BiEllpack.h b/src/TNL/Matrices/BiEllpack.h
index 0ee7962fb6818ddb1a629b80cb95f18eb23a677f..8b0f54979e5c9e7c3b4abe270d11218811c5b98f 100644
--- a/src/TNL/Matrices/BiEllpack.h
+++ b/src/TNL/Matrices/BiEllpack.h
@@ -66,9 +66,7 @@ public:
 					 const IndexType column,
 					 const RealType& value );
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	bool setElementFast( const IndexType row,
 						 const IndexType column,
 						 const RealType& value );
@@ -78,9 +76,7 @@ public:
 					 const RealType& value,
 					 const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	bool addElementFast( const IndexType row,
 						 const IndexType column,
 						 const RealType& value,
@@ -100,9 +96,7 @@ public:
 	RealType getElement( const IndexType row,
 					 	 const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	RealType getElementFast( const IndexType row,
 							 const IndexType column ) const;
 
@@ -111,9 +105,7 @@ public:
 			 	    IndexType* columns,
 			 	    RealType* values ) const;
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	IndexType getGroupLength( const IndexType strip,
 							  const IndexType group ) const;
 
@@ -129,9 +121,7 @@ public:
 
 	void setVirtualRows(const IndexType rows);
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	IndexType getNumberOfGroups( const IndexType row ) const;
 
 	bool vectorProductTest() const;
@@ -155,36 +145,26 @@ public:
 
 	template< typename InVector,
 			  typename OutVector >
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	void spmvCuda( const InVector& inVector,
 				   OutVector& outVector,
 				   /*const IndexType warpStart,
 				   const IndexType inWarpIdx*/
 				   int globalIdx ) const;
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	IndexType getStripLength( const IndexType strip ) const;
 
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	void performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
 										 const IndexType strip );
 
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	void computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
 									   const IndexType numberOfStrips,
 									   const IndexType strip );
 
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	IndexType power( const IndexType number,
 				     const IndexType exponent ) const;
 
diff --git a/src/TNL/Matrices/BiEllpackSymmetric.h b/src/TNL/Matrices/BiEllpackSymmetric.h
index 6de40d0e3462cd5c9d3061bb5d4a383ecaf3f37d..51d672ce1377b954c2289a10878a24b56842c558 100644
--- a/src/TNL/Matrices/BiEllpackSymmetric.h
+++ b/src/TNL/Matrices/BiEllpackSymmetric.h
@@ -56,9 +56,7 @@ public:
 					 const IndexType column,
 					 const RealType& value );
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	bool setElementFast( const IndexType row,
 						 const IndexType column,
 						 const RealType& value );
@@ -68,9 +66,7 @@ public:
 					 const RealType& value,
 					 const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	bool addElementFast( const IndexType row,
 						 const IndexType column,
 						 const RealType& value,
@@ -90,9 +86,7 @@ public:
 	RealType getElement( const IndexType row,
 					 	 const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	RealType getElementFast( const IndexType row,
 							 const IndexType column ) const;
 
@@ -100,9 +94,7 @@ public:
 			 	 IndexType* columns,
 			 	 RealType* values ) const;
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	IndexType getGroupLength( const IndexType strip,
 							  const IndexType group ) const;
 
@@ -118,9 +110,7 @@ public:
 
 	void setVirtualRows(const IndexType rows);
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	IndexType getNumberOfGroups( const IndexType row ) const;
 
 	bool vectorProductTest() const;
@@ -144,36 +134,26 @@ public:
 
 	template< typename InVector,
 			  typename OutVector >
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	void spmvCuda( const InVector& inVector,
 				   OutVector& outVector,
 				   /*const IndexType warpStart,
 				   const IndexType inWarpIdx*/
 				   int globalIdx ) const;
 
-#ifdef HAVE_CUDA
-	__device__ __host__
-#endif
+   __cuda_callable__
 	IndexType getStripLength( const IndexType strip ) const;
 
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	void performRowBubbleSortCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
 										 const IndexType strip );
 
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	void computeColumnSizesCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
 									   const IndexType numberOfStrips,
 									   const IndexType strip );
 
-#ifdef HAVE_CUDA
-	__device__
-#endif
+   __cuda_callable__
 	IndexType power( const IndexType number,
 				     const IndexType exponent ) const;
 
diff --git a/src/TNL/Matrices/BiEllpackSymmetric_impl.h b/src/TNL/Matrices/BiEllpackSymmetric_impl.h
index 374ad103d78fff65361659a95b262918db2060a0..9a7f380eebe7c5b54785055060594967275faa65 100644
--- a/src/TNL/Matrices/BiEllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/BiEllpackSymmetric_impl.h
@@ -22,9 +22,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpackSymmetric< Real, Device, Index, StripSize >::power( const IndexType number,
                                                                    const IndexType exponent ) const
 {
@@ -127,9 +125,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getStripLength( const IndexType strip ) const
 {
     TNL_ASSERT( strip >= 0,
@@ -144,9 +140,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getNumberOfGroups( const IndexType row ) const
 {
     TNL_ASSERT( row >=0 && row < this->getRows(),
@@ -251,9 +245,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setElementFast( const IndexType row,
                                                                            const IndexType column,
                                                                            const RealType& value )
@@ -311,9 +303,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addElementFast( const IndexType row,
                                                                            const IndexType column,
                                                                            const RealType& value,
@@ -488,9 +478,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Real BiEllpackSymmetric< Real, Device, Index, StripSize >::getElementFast( const IndexType row,
                                                                            const IndexType column ) const
 {
@@ -584,9 +572,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getGroupLength( const Index strip,
                                                                             const Index group ) const
 {
diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index d62ad35fd6d76acc18ff67108cb487eed8eeb8e9..80b182db1e61cb73d4a82a20d5772935f06436e0 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -24,9 +24,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpack< Real, Device, Index, StripSize >::power( const IndexType number,
 							   const IndexType exponent ) const
 {
@@ -131,9 +129,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpack< Real, Device, Index, StripSize >::getStripLength( const IndexType strip ) const
 {
 	TNL_ASSERT( strip >= 0,
@@ -148,9 +144,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpack< Real, Device, Index, StripSize >::getNumberOfGroups( const IndexType row ) const
 {
 	TNL_ASSERT( row >=0 && row < this->getRows(),
@@ -256,9 +250,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 bool BiEllpack< Real, Device, Index, StripSize >::setElementFast( const IndexType row,
 																		   const IndexType column,
 																		   const RealType& value )
@@ -316,9 +308,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 bool BiEllpack< Real, Device, Index, StripSize >::addElementFast( const IndexType row,
 																	   	   const IndexType column,
 																	   	   const RealType& value,
@@ -492,9 +482,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Real BiEllpack< Real, Device, Index, StripSize >::getElementFast( const IndexType row,
 																	   	   const IndexType column ) const
 {
@@ -588,9 +576,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index BiEllpack< Real, Device, Index, StripSize >::getGroupLength( const Index strip,
 																 	 	    const Index group ) const
 {
@@ -1321,7 +1307,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-__device__
+__cuda_callable__
 void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
 																						   const IndexType strip )
 {
@@ -1378,7 +1364,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-__device__
+__cuda_callable__
 void BiEllpack< Real, Device, Index, StripSize >::computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
 																						 const IndexType numberOfStrips,
 																						 const IndexType strip )
diff --git a/src/TNL/Matrices/EllpackSymmetric.h b/src/TNL/Matrices/EllpackSymmetric.h
index b8d15ed30f12fd7881915397696d4330f9181bf3..4d76a781756ac6abc1e96d4aef1a55bd88d34033 100644
--- a/src/TNL/Matrices/EllpackSymmetric.h
+++ b/src/TNL/Matrices/EllpackSymmetric.h
@@ -65,9 +65,7 @@ class EllpackSymmetric : public Sparse< Real, Device, Index >
    bool copyFrom( const Matrix& matrix,
                   const CompressedRowLengthsVector& rowLengths );*/
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -76,9 +74,7 @@ class EllpackSymmetric : public Sparse< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -90,9 +86,7 @@ class EllpackSymmetric : public Sparse< Real, Device, Index >
                     const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columnIndexes,
                     const RealType* values,
@@ -104,9 +98,7 @@ class EllpackSymmetric : public Sparse< Real, Device, Index >
                 const IndexType elements );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -119,18 +111,14 @@ class EllpackSymmetric : public Sparse< Real, Device, Index >
                 const IndexType numberOfElements,
                 const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
@@ -139,10 +127,8 @@ class EllpackSymmetric : public Sparse< Real, Device, Index >
                 IndexType* columns,
                 RealType* values ) const;
 
-template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   template< typename Vector >
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -183,9 +169,7 @@ template< typename Vector >
 
    template< typename InVector,
              typename OutVector >
-#ifdef HAVE_CUDA
-   __device__
-#endif
+   __cuda_callable__
    void spmvCuda( const InVector& inVector,
                   OutVector& outVector,
                   int rowIdx ) const;
diff --git a/src/TNL/Matrices/EllpackSymmetricGraph.h b/src/TNL/Matrices/EllpackSymmetricGraph.h
index 9ca2c93d6bfd731d43250a060ee7a615d55ea1a3..7b11b6b159adcf658f62f5af76941e7989636578 100644
--- a/src/TNL/Matrices/EllpackSymmetricGraph.h
+++ b/src/TNL/Matrices/EllpackSymmetricGraph.h
@@ -65,9 +65,7 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
    bool copyFrom( const Matrix& matrix,
                   const CompressedRowLengthsVector& rowLengths );*/
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -76,9 +74,7 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -90,9 +86,7 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
                     const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columnIndexes,
                     const RealType* values,
@@ -104,9 +98,7 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
                 const IndexType elements );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -119,18 +111,14 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
                 const IndexType numberOfElements,
                 const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
@@ -139,10 +127,8 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
                 IndexType* columns,
                 RealType* values ) const;
 
-template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   template< typename Vector >
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -159,21 +145,17 @@ template< typename Vector >
 #ifdef HAVE_CUDA
    template< typename InVector,
              typename OutVector >
-   __device__
+   __cuda_callable__
    void spmvCuda( const InVector& inVector,
                   OutVector& outVector,
                   const int globalIdx,
                   const int color ) const;
 #endif
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void computePermutationArray();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool rearrangeMatrix( bool verbose );
 
    bool save( File& file ) const;
@@ -190,39 +172,25 @@ template< typename Vector >
 
    void verifyPermutationArray();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getRowLengthsInt() const;
 
-#ifdef HAVE_CUDA
-    __device__ __host__
-#endif
-    Index getAlignedRows() const;
+   __cuda_callable__
+   Index getAlignedRows() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getRowsOfColor( IndexType color ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void copyFromHostToCuda( EllpackSymmetricGraph< Real, Devices::Host, Index >& matrix );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Containers::Vector< Index, Device, Index > getPermutationArray();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Containers::Vector< Index, Device, Index > getInversePermutation();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Containers::Vector< Index, Device, Index > getColorPointers();
 
    protected: