From f4bab3c9bec2d594dc9ae583977c97fd16596b5d Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 24 Jun 2019 23:28:39 +0200
Subject: [PATCH] Implemented fix for case when roundToMultiple would give a
 number smaller than the number of rows, thus causing index to be out of
 bounds on CUDA

---
 src/TNL/Matrices/EllpackSymmetricGraph_impl.h | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
index 9b12ab1f70..1aa9b51a6c 100644
--- a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
+++ b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
@@ -73,13 +73,27 @@ void EllpackSymmetricGraph< Real, Device, Index >::setDimensions( const IndexTyp
    TNL_ASSERT( rows > 0 && columns > 0,
               std::cerr << "rows = " << rows
                    << " columns = " << columns << std::endl );
+   
    this->rows = rows;
-   this->columns = columns;   
+   this->columns = columns;
+   
    if( std::is_same< DeviceType, Devices::Cuda >::value )
-      this->alignedRows = roundToMultiple( columns, Cuda::getWarpSize() );
+   {
+       this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() );
+       
+       if( this->rows - this->alignedRows > 0 )
+       {
+           IndexType missingRows = this->rows - this->alignedRows;
+           missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() );
+           this->alignedRows +=  missingRows;
+           
+//           this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() );
+       }
+   }
    else this->alignedRows = rows;
+   
    if( this->rowLengths != 0 )
-   allocateElements();
+       allocateElements();
 }
 
 template< typename Real,
-- 
GitLab