From 18aeeef8988d03f3c3ae24c83f664dce17110226 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Thu, 24 Dec 2020 18:40:01 +0100 Subject: [PATCH 1/4] CMakeLists.txt: suppress another useless nvcc warning from CUDA 11.2 --- CMakeLists.txt | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 874d39f6a..2a59a4117 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,6 +215,10 @@ if( ${WITH_CUDA} ) # reference for the -Xcudafe --diag_suppress and --display_error_number flags: https://stackoverflow.com/a/54142937 # incomplete list of tokens: http://www.ssl.berkeley.edu/~jimm/grizzly_docs/SSL/opt/intel/cc/9.0/lib/locale/en_US/mcpcom.msg set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -Xcudafe --diag_suppress=code_is_unreachable -Xcudafe --diag_suppress=loop_not_reachable -Xcudafe --diag_suppress=implicit_return_from_non_void_function -Xcudafe --diag_suppress=unsigned_compare_with_zero -Xcudafe --diag_suppress=2906 -Xcudafe --diag_suppress=2913 -Xcudafe --diag_suppress=2886 -Xcudafe --diag_suppress=2929 -Xcudafe --diag_suppress=2977 -Xcudafe --diag_suppress=3057 -Xcudafe --diag_suppress=3124 -Xcudafe --display_error_number) + if(CUDA_VERSION_STRING VERSION_GREATER_EQUAL "11.2") + # this diag number would cause an error on older nvcc + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Xcudafe --diag_suppress=20012) + endif() # temporarily disable host-compler warnings about VLAs, which are caused by nvcc's modifications to the source code set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Xcompiler -Wno-vla) # Select GPU architecture @@ -263,7 +267,7 @@ if( JPEG_FOUND ) endif() #### -# Test for GMP +# Test for GMP # if( ${WITH_GMP} ) if (GMP_INCLUDES AND GMP_LIBRARIES) @@ -301,7 +305,7 @@ endif() # DOC "PETSC headers." # ) # if( ${PETSC_INCLUDE_DIR} STREQUAL "PETSC_INCLUDE_DIR-NOTFOUND" ) -# message( "PETSC not found." 
) +# message( "PETSC not found." ) # else() # message( "PETSC headers found -- ${PETSC_INCLUDE_DIR}" ) # FIND_LIBRARY(PETSC_LIBRARY petsc @@ -313,7 +317,7 @@ endif() # #set( PETSC_LIBRARY "${MPI_LIBRARIES} ${PETSC_LIBRARY}") # message( "PETSC library found -- ${PETSC_LIBRARY}") # list( GET MPI_CXX_INCLUDE_PATH 0 MPI_CXX_PATH ) -# set(PETSC_CXX_FLAGS "-DHAVE_PETSC -I${PETSC_INCLUDE_DIR} -DHAVE_MPI -I${MPI_CXX_PATH}") +# set(PETSC_CXX_FLAGS "-DHAVE_PETSC -I${PETSC_INCLUDE_DIR} -DHAVE_MPI -I${MPI_CXX_PATH}") # endif() # endif() #endif() -- GitLab From 64e6732390041e01f00a583385436ef9acb61d1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 25 Dec 2020 01:28:54 +0100 Subject: [PATCH 2/4] Removed useless assert from DenseMatrixView The condition is always satisfied, since getAllocatedElementsCount returns the size of the values vector view, which was bound just prior to the assert. --- src/TNL/Matrices/DenseMatrixView.hpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp index c76e0c1f6..d7a781e20 100644 --- a/src/TNL/Matrices/DenseMatrixView.hpp +++ b/src/TNL/Matrices/DenseMatrixView.hpp @@ -40,13 +40,6 @@ DenseMatrixView( const IndexType rows, const ValuesViewType& values ) : MatrixView< Real, Device, Index >( rows, columns, values ) { -#ifdef __CUDA_ARCH__ - TNL_ASSERT_EQ( values.getSize(), this->getAllocatedElementsCount(), "Number of matrix elements does not agree with matrix dimensions." ); -#else - if( values.getSize() != this->getAllocatedElementsCount() ) - throw( std::logic_error( "Number of matrix elements does not agree with matrix dimensions." 
) ); -#endif - SegmentsType a( rows, columns ); segments = a.getView(); } -- GitLab From 0a5550bb29e16af1daf2bad0b8f7fcd7462b7118 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 26 Dec 2020 19:11:17 +0100 Subject: [PATCH 3/4] CMakeLists.txt: cleaned up nvcc flags --- CMakeLists.txt | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a59a4117..05a0fd0b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -210,15 +210,13 @@ if( ${WITH_CUDA} ) set( CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} ) endif() endif() - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ;-DHAVE_CUDA) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA --expt-relaxed-constexpr --expt-extended-lambda) # disable false compiler warnings # reference for the -Xcudafe --diag_suppress and --display_error_number flags: https://stackoverflow.com/a/54142937 # incomplete list of tokens: http://www.ssl.berkeley.edu/~jimm/grizzly_docs/SSL/opt/intel/cc/9.0/lib/locale/en_US/mcpcom.msg - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -Xcudafe --diag_suppress=code_is_unreachable -Xcudafe --diag_suppress=loop_not_reachable -Xcudafe --diag_suppress=implicit_return_from_non_void_function -Xcudafe --diag_suppress=unsigned_compare_with_zero -Xcudafe --diag_suppress=2906 -Xcudafe --diag_suppress=2913 -Xcudafe --diag_suppress=2886 -Xcudafe --diag_suppress=2929 -Xcudafe --diag_suppress=2977 -Xcudafe --diag_suppress=3057 -Xcudafe --diag_suppress=3124 -Xcudafe --display_error_number) - if(CUDA_VERSION_STRING VERSION_GREATER_EQUAL "11.2") - # this diag number would cause an error on older nvcc - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Xcudafe --diag_suppress=20012) - endif() + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Wno-deprecated-gpu-targets -Xcudafe --diag_suppress=code_is_unreachable -Xcudafe --diag_suppress=loop_not_reachable -Xcudafe 
--diag_suppress=implicit_return_from_non_void_function -Xcudafe --diag_suppress=unsigned_compare_with_zero -Xcudafe --display_error_number) + # This diagnostic is just plain wrong in CUDA 9 and later, see https://github.com/kokkos/kokkos/issues/1470 + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored) # temporarily disable host-compler warnings about VLAs, which are caused by nvcc's modifications to the source code set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Xcompiler -Wno-vla) # Select GPU architecture -- GitLab From 6bf315c3c4d91eb80ecfa8ca525b0833a89c3522 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 26 Dec 2020 12:04:55 +0100 Subject: [PATCH 4/4] Improved test of vectorProduct in the SparseMatrixTest --- src/UnitTests/Matrices/SparseMatrixTest.hpp | 92 +++++++++++---------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp index 41d5025a0..7c0d831a8 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp @@ -1310,51 +1310,55 @@ void test_VectorProduct() ///// // Large test - if( ( std::is_same< IndexType, int >::value || std::is_same< IndexType, long int >::value ) && - std::is_same< RealType, double >::value ) + const IndexType size( 35 ); + //for( int size = 1; size < 1000; size++ ) { - const IndexType size( 35 ); - //for( int size = 1; size < 1000; size++ ) - { - //std::cerr << " size = " << size << std::endl; - // Test with large diagonal matrix - Matrix m1( size, size ); - TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowCapacities( size ); - rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return 1; } ); - m1.setRowCapacities( rowCapacities ); - auto f1 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) { - if( localIdx == 0 ) - { - 
value = row + 1; - column = row; - } - }; - m1.forAllRows( f1 ); - TNL::Containers::Vector< double, DeviceType, IndexType > in( size, 1.0 ), out( size, 0.0 ); - m1.vectorProduct( in, out ); - //std::cerr << out << std::endl; - for( IndexType i = 0; i < size; i++ ) - EXPECT_EQ( out.getElement( i ), i + 1 ); - - // Test with large triangular matrix - Matrix m2( size, size ); - rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return i + 1; } ); - m2.setRowCapacities( rowCapacities ); - auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) { - if( localIdx <= row ) - { - value = row -localIdx + 1; - column = localIdx; - } - }; - m2.forAllRows( f2 ); - out = 0.0; - m2.vectorProduct( in, out ); - //std::cerr << out << std::endl; - for( IndexType i = 0; i < size; i++ ) - EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 ); - - } + //std::cerr << " size = " << size << std::endl; + // Test with large diagonal matrix + Matrix m1( size, size ); + TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowCapacities( size ); + rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return 1; } ); + m1.setRowCapacities( rowCapacities ); + auto f1 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) { + if( localIdx == 0 ) + { + value = row + 1; + column = row; + } + }; + m1.forAllRows( f1 ); + // check that the matrix was initialized + m1.getCompressedRowLengths( rowCapacities ); + EXPECT_EQ( rowCapacities, 1 ); + + TNL::Containers::Vector< double, DeviceType, IndexType > in( size, 1.0 ), out( size, 0.0 ); + m1.vectorProduct( in, out ); + //std::cerr << out << std::endl; + for( IndexType i = 0; i < size; i++ ) + EXPECT_EQ( out.getElement( i ), i + 1 ); + + // Test with large triangular matrix + Matrix m2( size, size ); + rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return i + 1; } ); + 
m2.setRowCapacities( rowCapacities ); + auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) { + if( localIdx <= row ) + { + value = row -localIdx + 1; + column = localIdx; + } + }; + m2.forAllRows( f2 ); + // check that the matrix was initialized + TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( size ); + m2.getCompressedRowLengths( rowLengths ); + EXPECT_EQ( rowLengths, rowCapacities ); + + out = 0.0; + m2.vectorProduct( in, out ); + //std::cerr << out << std::endl; + for( IndexType i = 0; i < size; i++ ) + EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 ); } } -- GitLab