From 4e1309ce0ab8b17502dc836da61a75a0d6d46ae0 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 24 Jun 2019 23:28:19 +0200
Subject: [PATCH] Implemented fix for the case when roundToMultiple would give a
 number smaller than the number of rows, thus causing an index to be out of
 bounds on CUDA

---
 src/TNL/Matrices/EllpackSymmetric_impl.h | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Matrices/EllpackSymmetric_impl.h b/src/TNL/Matrices/EllpackSymmetric_impl.h
index fa8952b4ff..f64cef4c5b 100644
--- a/src/TNL/Matrices/EllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/EllpackSymmetric_impl.h
@@ -57,13 +57,27 @@ void EllpackSymmetric< Real, Device, Index >::setDimensions( const IndexType row
    TNL_ASSERT( rows > 0 && columns > 0,
              std::cerr << "rows = " << rows
                    << " columns = " << columns <<std::endl );
+      
    this->rows = rows;
-   this->columns = columns;   
+   this->columns = columns;
+   
    if( std::is_same< DeviceType, Devices::Cuda >::value )
-      this->alignedRows = roundToMultiple( columns, Cuda::getWarpSize() );
+   {
+       this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() );
+       
+       if( this->rows - this->alignedRows > 0 )
+       {
+           IndexType missingRows = this->rows - this->alignedRows;
+           missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() );
+           this->alignedRows +=  missingRows;
+           
+//           this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() );
+       }
+   }
    else this->alignedRows = rows;
+   
    if( this->rowLengths != 0 )
-      allocateElements();
+       allocateElements();
 }
 
 template< typename Real,
-- 
GitLab