From b927423bc60a5daa2606549f9068edc1c7b2562a Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Thu, 17 Oct 2019 20:48:13 +0200
Subject: [PATCH] Added basic functionality for cross-device copy assignment.
 Removed StripSize template typename as it was never used anywhere.

---
 src/TNL/Matrices/BiEllpack.h      |  40 +-
 src/TNL/Matrices/BiEllpack_impl.h | 612 ++++++++++++++----------------
 2 files changed, 310 insertions(+), 342 deletions(-)

diff --git a/src/TNL/Matrices/BiEllpack.h b/src/TNL/Matrices/BiEllpack.h
index 050f0c8e07..b6fd8ab5ae 100644
--- a/src/TNL/Matrices/BiEllpack.h
+++ b/src/TNL/Matrices/BiEllpack.h
@@ -28,9 +28,19 @@ namespace TNL {
 template< typename Device >
 class BiEllpackDeviceDependentCode;
 
-template< typename Real, typename Device = Devices::Cuda, typename Index = int, int StripSize = 32 >
+template< typename Real, typename Device /*= Devices::Cuda*/, typename Index /*= int*/ >
 class BiEllpack : public Sparse< Real, Device, Index >
 {
+private:
+    
+    // convenient template alias for controlling the selection of copy-assignment operator
+    template< typename Device2 >
+    using Enabler = std::enable_if< ! std::is_same< Device2, Device >::value >;
+
+    // friend class will be needed for templated assignment operators
+    template< typename Real2, typename Device2, typename Index2 >
+    friend class BiEllpack;
+    
 public:
 	typedef Real RealType;
 	typedef Device DeviceType;
@@ -57,7 +67,15 @@ public:
 	template< typename Real2,
 			  typename Device2,
 			  typename Index2 >
-	void setLike( const BiEllpack< Real2, Device2, Index2, StripSize >& matrix );
+	void setLike( const BiEllpack< Real2, Device2, Index2 >& matrix );
+        
+        void reset();
+        
+        template< typename Real2, typename Device2, typename Index2 >
+        bool operator == ( const BiEllpack< Real2, Device2, Index2 >& matrix ) const;
+
+        template< typename Real2, typename Device2, typename Index2 >
+        bool operator != ( const BiEllpack< Real2, Device2, Index2 >& matrix ) const;
 
 	void getRowLengths( CompressedRowLengthsVector& rowLengths ) const;
 
@@ -124,8 +142,14 @@ public:
 	IndexType getNumberOfGroups( const IndexType row ) const;
 
 	bool vectorProductTest() const;
+        
+        // copy assignment
+        BiEllpack& operator=( const BiEllpack& matrix );
 
-	void reset();
+        // cross-device copy assignment
+        template< typename Real2, typename Device2, typename Index2,
+                 typename = typename Enabler< Device2 >::type >
+        BiEllpack& operator=( const BiEllpack< Real2, Device2, Index2 >& matrix );
 
 	void save( File& file ) const;
 
@@ -136,11 +160,13 @@ public:
 	void load( const String& fileName );
 
 	void print( std::ostream& str ) const;
+        
+        void printValues() const;
 
 	void performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths );
 	void computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths );
 
-//	void verifyRowLengths( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths );
+//	void verifyRowLengths( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths );
 
 	template< typename InVector,
 			  typename OutVector >
@@ -157,11 +183,11 @@ public:
 	IndexType getStripLength( const IndexType strip ) const;
 
    __cuda_callable__
-	void performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
+	void performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths,
 										 const IndexType strip );
 
    __cuda_callable__
-	void computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
+	void computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths,
 									   const IndexType numberOfStrips,
 									   const IndexType strip );
 
@@ -171,6 +197,8 @@ public:
 
 	typedef BiEllpackDeviceDependentCode< DeviceType > DeviceDependentCode;
 	friend class BiEllpackDeviceDependentCode< DeviceType >;
+        friend class BiEllpack< RealType, Devices::Host, IndexType >;
+        friend class BiEllpack< RealType, Devices::Cuda, IndexType >;
 
 private:
 
diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index 95ce47a794..4974c8c7ea 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -22,10 +22,9 @@ namespace TNL {
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index>
 __cuda_callable__
-Index BiEllpack< Real, Device, Index, StripSize >::power( const IndexType number,
+Index BiEllpack< Real, Device, Index >::power( const IndexType number,
 							  const IndexType exponent ) const
 {
     if( exponent >= 0 )
@@ -40,18 +39,16 @@ Index BiEllpack< Real, Device, Index, StripSize >::power( const IndexType number
 
 template< typename Real,
 	  typename Device,
-	  typename Index,
-	  int StripSize >
-BiEllpack< Real, Device, Index, StripSize >::BiEllpack()
+	  typename Index >
+BiEllpack< Real, Device, Index >::BiEllpack()
 : warpSize( 32 ),
   logWarpSize( 5 )
 {}
 
 template< typename Real,
 	  typename Device,
-	  typename Index,
-	  int StripSize >
-String BiEllpack< Real, Device, Index, StripSize >::getType()
+	  typename Index >
+String BiEllpack< Real, Device, Index >::getType()
 {
 	return String( "Matrices::BiEllpack< ") +
 	       String( TNL::getType< Real >() ) +
@@ -64,19 +61,17 @@ String BiEllpack< Real, Device, Index, StripSize >::getType()
 
 template< typename Real,
 	  typename Device,
-	  typename Index,
-	  int StripSize >
-String BiEllpack< Real, Device, Index, StripSize >::getTypeVirtual() const
+	  typename Index >
+String BiEllpack< Real, Device, Index >::getTypeVirtual() const
 {
     return this->getType();
 }
 
 template< typename Real,
 	  typename Device,
-	  typename Index,
-	  int StripSize >
+	  typename Index >
 void
-BiEllpack< Real, Device, Index, StripSize >::
+BiEllpack< Real, Device, Index >::
 setDimensions( const IndexType rows, const IndexType columns )
 {
    TNL_ASSERT( rows >= 0 && columns >= 0, std::cerr << "rows = " << rows << "columns = " << columns << std::endl );
@@ -97,10 +92,9 @@ setDimensions( const IndexType rows, const IndexType columns )
 
 template< typename Real,
 	  typename Device,
-	  typename Index,
-	  int StripSize >
+	  typename Index >
 void
-BiEllpack< Real, Device, Index, StripSize >::
+BiEllpack< Real, Device, Index >::
 setCompressedRowLengths( ConstCompressedRowLengthsVectorView constRowLengths )
 {
     // This method has to have the const argument, bcs its base method
@@ -115,8 +109,7 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView constRowLengths )
     rowLengths.setLike( constRowLengths );
     
     // Copy the elements from the const vector to the non-const
-    for( IndexType i = 0; i < rowLengths.getSize(); i++ )
-        rowLengths.setElement( i, constRowLengths.getElement( i ) );
+    rowLengths = constRowLengths;
     
     if( this->getRows() % this->warpSize != 0 )
             this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) );
@@ -142,10 +135,9 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView constRowLengths )
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-Index BiEllpack< Real, Device, Index, StripSize >::getStripLength( const IndexType strip ) const
+Index BiEllpack< Real, Device, Index >::getStripLength( const IndexType strip ) const
 {
 	TNL_ASSERT( strip >= 0, std::cerr << "strip = " << strip );
 
@@ -155,12 +147,11 @@ Index BiEllpack< Real, Device, Index, StripSize >::getStripLength( const IndexTy
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-Index BiEllpack< Real, Device, Index, StripSize >::getNumberOfGroups( const IndexType row ) const
+Index BiEllpack< Real, Device, Index >::getNumberOfGroups( const IndexType row ) const
 {
-	TNL_ASSERT( row >=0 && row < this->getRows(),
+	TNL_ASSERT( row >= 0 && row < this->getRows(),
 	            std::cerr <<  "row = " << row
                               << " this->getRows() = " << this->getRows() );
 
@@ -184,9 +175,8 @@ Index BiEllpack< Real, Device, Index, StripSize >::getNumberOfGroups( const Inde
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-Index BiEllpack< Real, Device, Index, StripSize >::getRowLength( const IndexType row ) const
+		  typename Index >
+Index BiEllpack< Real, Device, Index >::getRowLength( const IndexType row ) const
 {
 	TNL_ASSERT( row >= 0 && row < this->getRows(), 
                     std::cerr << "row = " << row << " this->getRows() = " << this->getRows() );
@@ -217,12 +207,11 @@ Index BiEllpack< Real, Device, Index, StripSize >::getRowLength( const IndexType
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
+		  typename Index >
 	template< typename Real2,
 			  typename Device2,
 			  typename Index2 >
-void BiEllpack< Real, Device, Index, StripSize >::setLike( const BiEllpack< Real2, Device2, Index2, StripSize >& matrix )
+void BiEllpack< Real, Device, Index >::setLike( const BiEllpack< Real2, Device2, Index2 >& matrix )
 {        
 	Sparse< Real, Device, Index >::setLike( matrix );
 	this->rowPermArray.setLike( matrix.rowPermArray );
@@ -231,9 +220,50 @@ void BiEllpack< Real, Device, Index, StripSize >::setLike( const BiEllpack< Real
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::getRowLengths( CompressedRowLengthsVector& rowLengths) const
+		  typename Index >
+void BiEllpack< Real, Device, Index >::reset()
+{
+	Sparse< Real, Device, Index >::reset();
+	this->rowPermArray.reset();
+	this->groupPointers.reset();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename Real2,
+             typename Device2,
+             typename Index2 >
+bool BiEllpack< Real, Device, Index >::operator == ( const BiEllpack< Real2, Device2, Index2 >& matrix ) const
+{
+   TNL_ASSERT( this->getRows() == matrix.getRows() &&
+               this->getColumns() == matrix.getColumns(),
+               std::cerr << "this->getRows() = " << this->getRows()
+                    << " matrix.getRows() = " << matrix.getRows()
+                    << " this->getColumns() = " << this->getColumns()
+                    << " matrix.getColumns() = " << matrix.getColumns() );
+   
+   TNL_ASSERT_TRUE( false, "operator == is not yet implemented for BiEllpack.");
+   
+   // TODO: implement this
+   return false;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename Real2,
+             typename Device2,
+             typename Index2 >
+bool BiEllpack< Real, Device, Index >::operator != ( const BiEllpack< Real2, Device2, Index2 >& matrix ) const
+{
+   return ! ( ( *this ) == matrix );
+}
+
+template< typename Real,
+		  typename Device,
+		  typename Index >
+void BiEllpack< Real, Device, Index >::getRowLengths( CompressedRowLengthsVector& rowLengths) const
 {
     // WHAT IS THIS??!
     // It's called getRowLengths, but takes an argument that it fill up with this matrix's row lengths???
@@ -243,10 +273,9 @@ void BiEllpack< Real, Device, Index, StripSize >::getRowLengths( CompressedRowLe
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
+		  typename Index >
 bool
-BiEllpack< Real, Device, Index, StripSize >::
+BiEllpack< Real, Device, Index >::
 setElement( const IndexType row,
             const IndexType column,
             const RealType& value )
@@ -262,10 +291,9 @@ setElement( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-bool BiEllpack< Real, Device, Index, StripSize >::setElementFast( const IndexType row,
+bool BiEllpack< Real, Device, Index >::setElementFast( const IndexType row,
 								  const IndexType column,
 								  const RealType& value )
 {
@@ -280,9 +308,8 @@ bool BiEllpack< Real, Device, Index, StripSize >::setElementFast( const IndexTyp
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-bool BiEllpack< Real, Device, Index, StripSize >::addElement( const IndexType row,
+		  typename Index >
+bool BiEllpack< Real, Device, Index >::addElement( const IndexType row,
                                                               const IndexType column,
                                                               const RealType& value,
                                                               const RealType& thisElementMultiplicator )
@@ -319,10 +346,9 @@ bool BiEllpack< Real, Device, Index, StripSize >::addElement( const IndexType ro
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-bool BiEllpack< Real, Device, Index, StripSize >::addElementFast( const IndexType row,
+bool BiEllpack< Real, Device, Index >::addElementFast( const IndexType row,
 								  const IndexType column,
 								  const RealType& value,
 								  const RealType& thisElementMultiplicator )
@@ -374,10 +400,9 @@ bool BiEllpack< Real, Device, Index, StripSize >::addElementFast( const IndexTyp
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
+		  typename Index >
 bool
-BiEllpack< Real, Device, Index, StripSize >::
+BiEllpack< Real, Device, Index >::
 setRow( const IndexType row,
 	const IndexType* columns,
 	const RealType* values,
@@ -413,10 +438,9 @@ setRow( const IndexType row,
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
+		  typename Index >
 bool
-BiEllpack< Real, Device, Index, StripSize >::
+BiEllpack< Real, Device, Index >::
 addRow( const IndexType row,
         const IndexType* columns,
         const RealType* values,
@@ -457,9 +481,8 @@ addRow( const IndexType row,
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-Real BiEllpack< Real, Device, Index, StripSize >::getElement( const IndexType row,
+		  typename Index >
+Real BiEllpack< Real, Device, Index >::getElement( const IndexType row,
                                                               const IndexType column ) const
 {
 	TNL_ASSERT( ( row >= 0 && row < this->getRows() ) ||
@@ -491,10 +514,9 @@ Real BiEllpack< Real, Device, Index, StripSize >::getElement( const IndexType ro
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-Real BiEllpack< Real, Device, Index, StripSize >::getElementFast( const IndexType row,
+Real BiEllpack< Real, Device, Index >::getElementFast( const IndexType row,
 								  const IndexType column ) const
 {
     const IndexType strip = row / this->warpSize;
@@ -535,9 +557,8 @@ Real BiEllpack< Real, Device, Index, StripSize >::getElementFast( const IndexTyp
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::getRow( const IndexType row,
+		  typename Index >
+void BiEllpack< Real, Device, Index >::getRow( const IndexType row,
 							  IndexType* columns,
 							  RealType* values ) const
 {
@@ -575,19 +596,17 @@ void BiEllpack< Real, Device, Index, StripSize >::getRow( const IndexType row,
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::setVirtualRows(const IndexType rows)
+		  typename Index >
+void BiEllpack< Real, Device, Index >::setVirtualRows(const IndexType rows)
 {
     this->virtualRows = rows;
 }
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-Index BiEllpack< Real, Device, Index, StripSize >::getGroupLength( const Index strip,
+Index BiEllpack< Real, Device, Index >::getGroupLength( const Index strip,
                                                                    const Index group ) const
 {
     return this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) + group + 1 )
@@ -596,11 +615,10 @@ Index BiEllpack< Real, Device, Index, StripSize >::getGroupLength( const Index s
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 template< typename InVector,
 	  	  typename OutVector >
-void BiEllpack< Real, Device, Index, StripSize >::vectorProduct( const InVector& inVector,
+void BiEllpack< Real, Device, Index >::vectorProduct( const InVector& inVector,
 								 OutVector& outVector ) const
 {
     DeviceDependentCode::vectorProduct( *this, inVector, outVector );
@@ -608,11 +626,10 @@ void BiEllpack< Real, Device, Index, StripSize >::vectorProduct( const InVector&
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 template< typename InVector,
 		  typename OutVector >
-void BiEllpack< Real, Device, Index, StripSize >::vectorProductHost( const InVector& inVector,
+void BiEllpack< Real, Device, Index >::vectorProductHost( const InVector& inVector,
                                                                      OutVector& outVector ) const
 {
 	const IndexType cudaBlockSize = 256;
@@ -668,22 +685,113 @@ void BiEllpack< Real, Device, Index, StripSize >::vectorProductHost( const InVec
 	}
 }
 
+// copy assignment
 template< typename Real,
-		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::reset()
+          typename Device,
+          typename Index >
+BiEllpack< Real, Device, Index >&
+BiEllpack< Real, Device, Index >::operator=( const BiEllpack& matrix )
 {
-	Sparse< Real, Device, Index >::reset();
-	this->rowPermArray.reset();
-	this->groupPointers.reset();
+   this->setLike( matrix );
+   this->values = matrix.values;
+   this->columnIndexes = matrix.columnIndexes;
+   this->warpSize = matrix.warpSize;
+   this->logWarpSize = matrix.logWarpSize;
+   this->virtualRows = matrix.virtualRows;
+   this->rowPermArray = matrix.rowPermArray;
+   this->groupPointers = matrix.groupPointers;
+   return *this;
+}
+
+// cross-device copy assignment
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename Real2, typename Device2, typename Index2, typename >
+BiEllpack< Real, Device, Index >&
+BiEllpack< Real, Device, Index >::operator=( const BiEllpack< Real2, Device2, Index2 >& matrix )
+{
+   static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value,
+                  "unknown device" );
+   static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value,
+                  "unknown device" );
+   
+   std::cout << "Inside operator=\n\n" << std::endl;
+   for( Index i = 0; i < this->values.getSize(); i++ ) {
+    // Random values are stored with the column index of getColumns(). e.g. a matrix has 4 columns, values are at column indexes 0, 1, 2, 3 and junk data at index 4.
+    if( this->columnIndexes.getElement( i ) != this->getColumns() )
+        std::cout << "values.getElement( " << i << " ) = " << this->values.getElement( i ) 
+         << "\tcolumnIndexes.getElement( " << i << " ) = " << this->columnIndexes.getElement( i ) << std::endl;
+    }
+    
+    for( Index i = 0; i < this->rowPermArray.getSize(); i++ ) {
+        std::cout << "rowPermArray[ " << i << " ] = " << this->rowPermArray.getElement( i ) << std::endl;
+    }
+//   TNL_ASSERT_TRUE( false, "Cross-device copy assignment is not yet implemented for BiEllpack.");
+   
+   this->setLike( matrix );
+   this->warpSize = matrix.warpSize;
+   this->logWarpSize = matrix.logWarpSize;
+   this->virtualRows = matrix.virtualRows;
+   this->rowPermArray = matrix.rowPermArray;
+   this->groupPointers = matrix.groupPointers;
+   
+   // cuda -> host
+   // The order of elmements in values is:
+   //   Groups in a Strip are stored after each other in column-major order.
+   // Have a look in: "static void verifyRowLengths" at line: 1406.
+   //   There is an interesting piece of code that could crack how groupPointers is being used.
+   if( std::is_same< Device, Devices::Host >::value ) {
+       typename ValuesVector::HostType tmpValues;
+       typename ColumnIndexesVector::HostType tmpColumnIndexes;
+       tmpValues.setLike( matrix.values );
+       tmpColumnIndexes.setLike( matrix.columnIndexes );
+       
+       Index numberOfStrips = this->virtualRows / this->warpSize;
+#ifdef HAVE_OPENMP
+#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
+#endif       
+       for( Index stripIdx = 0; stripIdx < numberOfStrips; stripIdx++ ) {
+           
+       }
+   }
+   
+   // Per strip
+   //   per group
+   //       per row
+   //           per element
+   //               copy element
+   
+   
+   // host -> cuda
+   if( std::is_same< Device, Devices::Cuda >::value ) {
+       typename ValuesVector::HostType tmpValues;
+       typename ColumnIndexesVector::HostType tmpColumnIndexes;
+       tmpValues.setLike( matrix.values );
+       tmpColumnIndexes.setLike( matrix.columnIndexes );
+       tmpValues = matrix.values;
+       tmpColumnIndexes = matrix.columnIndexes;
+       
+       Index numberOfStrips = this->virtualRows / this->warpSize;
+#ifdef HAVE_OPENMP
+#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
+#endif       
+       for( Index stripIdx = 0; stripIdx < numberOfStrips; stripIdx++ ) {
+           
+       }
+   }
+   
+   if( std::is_same< Device, Devices::MIC >::value ) {
+      throw std::runtime_error("Not Implemented yet for MIC");
+   }
+   
+   return *this;
 }
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::save( File& file ) const
+		  typename Index >
+bool BiEllpack< Real, Device, Index >::save( File& file ) const
 {
    Sparse< Real, Device, Index >::save( file );
    file << this->groupPointers << this->rowPermArray;
@@ -691,9 +799,8 @@ void BiEllpack< Real, Device, Index, StripSize >::save( File& file ) const
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::load( File& file )
+		  typename Index >
+bool BiEllpack< Real, Device, Index >::load( File& file )
 {
    Sparse< Real, Device, Index >::load( file );
    file >> this->groupPointers >> this->rowPermArray;
@@ -701,31 +808,29 @@ void BiEllpack< Real, Device, Index, StripSize >::load( File& file )
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::save( const String& fileName ) const
+		  typename Index >
+bool BiEllpack< Real, Device, Index >::save( const String& fileName ) const
 {
    Object::save( fileName );
 }
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::load( const String& fileName )
+		  typename Index >
+bool BiEllpack< Real, Device, Index >::load( const String& fileName )
 {
    Object::load( fileName );
 }
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::print( std::ostream& str ) const
+		  typename Index >
+void BiEllpack< Real, Device, Index >::print( std::ostream& str ) const
 {
 	for( IndexType row = 0; row < this->getRows(); row++ )
 	{
 		str <<"Row: " << row << " -> ";
+//                str << row << ": ";
 		bool padding = false;
 		const IndexType strip = row / this->warpSize;
 		const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
@@ -746,6 +851,7 @@ void BiEllpack< Real, Device, Index, StripSize >::print( std::ostream& str ) con
 				RealType value = this->values.getElement( elementPtr );
 				IndexType column = this->columnIndexes.getElement( elementPtr );
 				str << " Col:" << column << "->" << value << "\t";
+//                                str << value << " ";
 				elementPtr += step;
 			}
 			step /= 2;
@@ -755,11 +861,31 @@ void BiEllpack< Real, Device, Index, StripSize >::print( std::ostream& str ) con
 	}
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+void BiEllpack< Real, Device, Index >::printValues() const
+{
+    for( Index i = 0; i < this->values.getSize(); i++ ) {
+    // Random values are stored with the column index of getColumns(). e.g. a matrix has 4 columns, values are at column indexes 0, 1, 2, 3 and junk data at index 4.
+    if( this->columnIndexes.getElement( i ) != this->getColumns() )
+        std::cout << "values.getElement( " << i << " ) = " << this->values.getElement( i ) 
+         << "\tcolumnIndexes.getElement( " << i << " ) = " << this->columnIndexes.getElement( i ) << std::endl;
+    }
+    
+    for( Index i = 0; i < this->rowPermArray.getSize(); i++ ) {
+        std::cout << "rowPermArray[ " << i << " ] = " << this->rowPermArray.getElement( i ) << std::endl;
+    }
+    
+    for( Index i = 0; i < this->groupPointers.getSize(); i++ ) {
+        std::cout << "groupPointers[ " << i << " ] = " << this->groupPointers.getElement( i ) << std::endl;
+    }
+}
+
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths )
+		  typename Index >
+void BiEllpack< Real, Device, Index >::performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths )
 {
     Index strips = this->virtualRows / this->warpSize;
     for( Index i = 0; i < strips; i++ )
@@ -816,9 +942,8 @@ void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSort( Containe
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths )
+		  typename Index >
+void BiEllpack< Real, Device, Index >::computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths )
 {
     Index numberOfStrips = this->virtualRows / this->warpSize;
     for( Index strip = 0; strip < numberOfStrips; strip++ )
@@ -862,10 +987,9 @@ public:
 	typedef Devices::Host Device;
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
-	static void verifyRowLengths( const BiEllpack< Real, Device, Index, StripSize >& matrix,
-                                      const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+			  typename Index >
+	static void verifyRowLengths( const BiEllpack< Real, Device, Index >& matrix,
+                                      const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
 	{
 		bool ok = true;
 		for( Index row = 0; row < matrix.getRows(); row++ )
@@ -900,10 +1024,9 @@ public:
 	}
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
-	static void verifyRowPerm( const BiEllpack< Real, Device, Index, StripSize >& matrix,
-                                   const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+			  typename Index >
+	static void verifyRowPerm( const BiEllpack< Real, Device, Index >& matrix,
+                                   const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
 	{
 		bool ok = true;
 		Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
@@ -947,10 +1070,9 @@ public:
 
 	template< typename Real,
 			  typename Index,
-			  int StripSize,
 			  typename InVector,
 			  typename OutVector >
-	static void vectorProduct( const BiEllpack< Real, Device, Index, StripSize >& matrix,
+	static void vectorProduct( const BiEllpack< Real, Device, Index >& matrix,
                                    const InVector& inVector,
                                    OutVector& outVector )
 	{
@@ -958,10 +1080,9 @@ public:
 	}
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
-	static void computeColumnSizes( BiEllpack< Real, Device, Index, StripSize >& matrix,
-			 	 	const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+			  typename Index >
+	static void computeColumnSizes( BiEllpack< Real, Device, Index >& matrix,
+			 	 	const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
 	{
 		Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
 		for( Index strip = 0; strip < numberOfStrips; strip++ )
@@ -1004,10 +1125,9 @@ public:
 	}
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
-	static void performRowBubbleSort( BiEllpack< Real, Device, Index, StripSize >& matrix,
-					  const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths
+			  typename Index >
+	static void performRowBubbleSort( BiEllpack< Real, Device, Index >& matrix,
+					  const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths
 					/*Containers::Vector< Index, Device, Index >& tempRowLengths*/ )
 	{
 		Index strips = matrix.virtualRows / matrix.warpSize;
@@ -1065,12 +1185,11 @@ public:
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 template< typename InVector,
           typename OutVector >
 __device__
-void BiEllpack< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector,
+void BiEllpack< Real, Device, Index >::spmvCuda( const InVector& inVector,
 					  	  	    OutVector& outVector,
                                                             int globalIdx ) const
 {
@@ -1123,183 +1242,13 @@ void BiEllpack< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVe
 }
 #endif
 
-/*#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-template< typename InVector,
-          typename OutVector >
-__device__
-void BiEllpack< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector,
-						                     OutVector& outVector,
-								     int globalIdx ) const
-{
-    // Loop unrolling test
-    const IndexType strip = globalIdx >> this->logWarpSize;
-    const IndexType warpStart = strip << this->logWarpSize;
-    const IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
-
-    if( warpStart >= this->getRows() )
-        return;
-
-    const IndexType cudaBlockSize = 256;
-
-    volatile Real* temp = getSharedMemory< Real >();
-    __shared__ Real results[ cudaBlockSize ];
-    results[ threadIdx.x ] = 0.0;
-    IndexType elementPtr = ( this->groupPointers[ strip * ( this->logWarpSize + 1 ) ] << this->logWarpSize ) + inWarpIdx;
-
-    //Loop Unroll #1
-    IndexType group = 0;
-    IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            results[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-    }
-
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                          - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #2
-        if( inWarpIdx < 16 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 16 )
-            results[ threadIdx.x ] += temp[ threadIdx.x ];
-        }
-
-
-    //group == 2;
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #3
-        if( inWarpIdx < 16 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 8 )
-            results[ threadIdx.x ] += temp[ threadIdx.x ];
-        }
-
-    //group == 3;
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #4
-        if( inWarpIdx < 16 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 4 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ];
-        if( inWarpIdx < 4 )
-        results[ threadIdx.x ] += temp[ threadIdx.x ];
-        }
-
-    //group == 4;
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #5
-        if( inWarpIdx < 16 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 4 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ];
-        if( inWarpIdx < 2 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 2 ];
-        if( inWarpIdx < 2 )
-        results[ threadIdx.x ] += temp[ threadIdx.x ];
-    }
-
-    //group == 5
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #6
-        if( inWarpIdx < 16 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 4 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ];
-        if( inWarpIdx < 2 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 2 ];
-        if( inWarpIdx < 1 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 1 ];
-        if( inWarpIdx < 1 )
-        results[ threadIdx.x ] += temp[ threadIdx.x ];
-    }
-
-    if( warpStart + inWarpIdx >= this->getRows() )
-        return;
-    outVector[ warpStart + inWarpIdx ] = results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( cudaBlockSize - 1 ) ];
-}
-#endif*/
-
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Index,
-          int StripSize,
           typename InVector,
           typename OutVector >
 __global__
-void BiEllpackVectorProductCuda( const BiEllpack< Real, Devices::Cuda, Index, StripSize >* matrix,
+void BiEllpackVectorProductCuda( const BiEllpack< Real, Devices::Cuda, Index >* matrix,
 				 const InVector* inVector,
 				 OutVector* outVector,
 				 int gridIdx,
@@ -1313,10 +1262,9 @@ void BiEllpackVectorProductCuda( const BiEllpack< Real, Devices::Cuda, Index, St
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
+void BiEllpack< Real, Device, Index >::performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths,
 										  const IndexType strip )
 {
     IndexType begin = strip * this->warpSize;
@@ -1370,10 +1318,9 @@ void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSortCudaKernel
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-void BiEllpack< Real, Device, Index, StripSize >::computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
+void BiEllpack< Real, Device, Index >::computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths,
 										const IndexType numberOfStrips,
 										const IndexType strip )
 {
@@ -1418,11 +1365,10 @@ void BiEllpack< Real, Device, Index, StripSize >::computeColumnSizesCudaKernel(
 
 #ifdef HAVE_CUDA
 template< typename Real,
-          typename Index,
-          int StripSize >
+          typename Index >
 __global__
-void performRowBubbleSortCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize >* matrix,
-                               const typename BiEllpack< Real, Devices::Cuda, Index, StripSize >::CompressedRowLengthsVector* rowLengths,
+void performRowBubbleSortCuda( BiEllpack< Real, Devices::Cuda, Index >* matrix,
+                               const typename BiEllpack< Real, Devices::Cuda, Index >::CompressedRowLengthsVector* rowLengths,
                                int gridIdx )
 {
 	const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
@@ -1432,11 +1378,10 @@ void performRowBubbleSortCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize
 
 #ifdef HAVE_CUDA
 template< typename Real,
-          typename Index,
-          int StripSize >
+          typename Index >
 __global__
-void computeColumnSizesCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize >* matrix,
-                             const typename BiEllpack< Real, Devices::Cuda, Index, StripSize >::CompressedRowLengthsVector* rowLengths,
+void computeColumnSizesCuda( BiEllpack< Real, Devices::Cuda, Index >* matrix,
+                             const typename BiEllpack< Real, Devices::Cuda, Index >::CompressedRowLengthsVector* rowLengths,
                              const Index numberOfStrips,
                              int gridIdx )
 {
@@ -1453,10 +1398,9 @@ public:
 	typedef Devices::Cuda Device;
 
 	template< typename Real,
-		  typename Index,
-		  int StripSize >
-	static void verifyRowLengths( const BiEllpack< Real, Device, Index, StripSize >& matrix,
-                                      const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+		  typename Index >
+	static void verifyRowLengths( const BiEllpack< Real, Device, Index >& matrix,
+                                      const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
 	{
 		bool ok = true;
 		for( Index row = 0; row < matrix.getRows(); row++ )
@@ -1492,10 +1436,9 @@ public:
 	}
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
-	static void verifyRowPerm( const BiEllpack< Real, Device, Index, StripSize >& matrix,
-                                   const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+			  typename Index >
+	static void verifyRowPerm( const BiEllpack< Real, Device, Index >& matrix,
+                                   const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
 	{
 		bool ok = true;
 		Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
@@ -1538,14 +1481,13 @@ public:
 	}
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
-	static void performRowBubbleSort( BiEllpack< Real, Device, Index, StripSize >& matrix,
-                                          const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+			  typename Index >
+	static void performRowBubbleSort( BiEllpack< Real, Device, Index >& matrix,
+                                          const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
 	{
 #ifdef HAVE_CUDA
-		Index numberOfStrips = matrix.virtualRows / StripSize;
-		typedef BiEllpack< Real, Devices::Cuda, Index, StripSize > Matrix;
+		Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
+		typedef BiEllpack< Real, Devices::Cuda, Index > Matrix;
 		typedef typename Matrix::CompressedRowLengthsVector CompressedRowLengthsVector;
 		Matrix* kernel_this = Cuda::passToDevice( matrix );
 		CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths );
@@ -1555,8 +1497,8 @@ public:
 		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 		{
 		     if( gridIdx == cudaGrids - 1 )
-		         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-		     performRowBubbleSortCuda< Real, Index, StripSize >
+		         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		     performRowBubbleSortCuda< Real, Index >
 		     	 	 	 	 	 	 <<< cudaGridSize, cudaBlockSize >>>
 		                             ( kernel_this,
 		                               kernel_rowLengths,
@@ -1569,14 +1511,13 @@ public:
 	}
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
-	static void computeColumnSizes( BiEllpack< Real, Device, Index, StripSize >& matrix,
-			 	 	const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+			  typename Index >
+	static void computeColumnSizes( BiEllpack< Real, Device, Index >& matrix,
+			 	 	const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
 	{
 #ifdef HAVE_CUDA
-		const Index numberOfStrips = matrix.virtualRows / StripSize;
-		typedef BiEllpack< Real, Devices::Cuda, Index, StripSize > Matrix;
+		const Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
+		typedef BiEllpack< Real, Devices::Cuda, Index > Matrix;
 		typedef typename Matrix::CompressedRowLengthsVector CompressedRowLengthsVector;
 		Matrix* kernel_this = Cuda::passToDevice( matrix );
 		CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths );
@@ -1586,8 +1527,8 @@ public:
 		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 		{
 		     if( gridIdx == cudaGrids - 1 )
-		         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-		     computeColumnSizesCuda< Real, Index, StripSize >
+		         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		     computeColumnSizesCuda< Real, Index >
 		     	 	 	 	 	   <<< cudaGridSize, cudaBlockSize >>>
 		                           ( kernel_this,
 		                             kernel_rowLengths,
@@ -1603,10 +1544,9 @@ public:
 
 	template< typename Real,
 			  typename Index,
-			  int StripSize,
 			  typename InVector,
 			  typename OutVector >
-	static void vectorProduct( const BiEllpack< Real, Device, Index, StripSize >& matrix,
+	static void vectorProduct( const BiEllpack< Real, Device, Index >& matrix,
 			   	   	   	   	   const InVector& inVector,
 			   	   	   	   	   OutVector& outVector )
 	{
@@ -1624,7 +1564,7 @@ public:
 			if( gridIdx == cudaGrids - 1 )
 				cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 			const int sharedMemory = cudaBlockSize.x * sizeof( Real );
-			BiEllpackVectorProductCuda< Real, Index, StripSize, InVector, OutVector >
+			BiEllpackVectorProductCuda< Real, Index, InVector, OutVector >
 			                                   <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
 			                                   ( kernel_this,
 			                                     kernel_inVector,
-- 
GitLab