Compare revisions

Jakub Klinkovský · Vít Hanousek · Vít Hanousek · Vít Hanousek · Jakub Klinkovský · Jakub Klinkovský
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 /Debug
 /Release
 /Testing
+/Documentation
 /CMakeLists.txt.user
 /doc/_build
 /Build

--- a/Doxyfile
+++ b/Doxyfile
--- a/build
+++ b/build
@@ -223,6 +223,11 @@ else
   make_target="all"
 fi

+# make expects VERBOSE=1, ninja expects -v
+if [[ "$make" != "make" ]] && [[ "$VERBOSE" ]]; then
+   VERBOSE="-v"
+fi
+
 $make ${VERBOSE} $make_target

 if [[ ${WITH_TESTS} == "yes" ]]; then

--- a/generate-doc
+++ b/generate-doc
+#!/bin/bash
+
+PREFIX=${HOME}/.local
+INSTALL="yes"
+
+
+for option in "$@"
+do
+   case $option in
+      --prefix=*                       ) PREFIX="${option#*=}" ;;
+      --install=*                      ) INSTALL="${option#*=}" ;;
+   esac
+done
+
+doxygen
+
+if [[ "$INSTALL" == "yes" ]]; then
+   cp -r Documentation/* ${PREFIX}/share/doc
+fi
+
--- a/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
+++ b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
@@ -514,7 +514,7 @@ getExplicitUpdate( const RealType& time,
                    cell.getBasis(),
                    gridXIdx,
                    gridYIdx );
-         cudaThreadSynchronize();
+         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         
         //std::cerr << "Computing the heat equation ..." << std::endl;
@@ -534,7 +534,7 @@ getExplicitUpdate( const RealType& time,
                    cell.getBasis(),
                    gridXIdx,
                    gridYIdx );
-         cudaThreadSynchronize();         
+         cudaDeviceSynchronize();         
         TNL_CHECK_CUDA_DEVICE;
      }
      if( this->cudaKernelType == "templated" )

--- a/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h
+++ b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h
@@ -47,7 +47,7 @@ int main( int argc, char* argv[] )
   while( iteration < 10000 )
   {
      testKernel< GridEntity ><<< cudaGridSize, cudaBlockSize >>>();
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
      iteration++;
   }
   auto t_stop = std::chrono::high_resolution_clock::now();   

--- a/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h
+++ b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h
@@ -379,7 +379,7 @@ bool solveHeatEquationCuda( const Config::ParameterContainer& parameters,
         return false;
      }            
      
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
      cudaMemcpy( max_du, cuda_max_du, cudaUpdateBlocks.x * sizeof( Real ), cudaMemcpyDeviceToHost );
      if( ( cudaErr = cudaGetLastError() ) != cudaSuccess )
      {

--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -250,7 +250,7 @@ double benchmarkMatrix( const Matrix& matrix,
      matrix.vectorProduct( x, b );
 #ifdef HAVE_CUDA
      if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value )
-         cudaThreadSynchronize();
+         cudaDeviceSynchronize();
 #endif
      time = timer.getRealTime();
      iterations++;

--- a/src/Python/pytnl/tnl/CMakeLists.txt
+++ b/src/Python/pytnl/tnl/CMakeLists.txt
+# enable C++14 for pytnl (due to py::overload_cast)
+set(PYBIND11_CPP_STANDARD -std=c++14)
+
 set( sources
      Grid1D.cpp
      Grid2D.cpp

--- a/src/TNL/Containers/Algorithms/Multireduction_impl.h
+++ b/src/TNL/Containers/Algorithms/Multireduction_impl.h
@@ -179,7 +179,7 @@ reduce( Operation& operation,
   typedef typename Operation::DataType2 DataType2;
   typedef typename Operation::ResultType ResultType;

-   const int block_size = 128;
+   constexpr int block_size = 128;
   const int blocks = size / block_size;

 #ifdef HAVE_OPENMP
@@ -194,17 +194,23 @@ reduce( Operation& operation,
      }

      // initialize array for thread-local results
-      ResultType r[ n ];
-      for( int k = 0; k < n; k++ )
+      // (it is accessed as a row-major matrix with n rows and 4 columns)
+      ResultType r[ n * 4 ];
+      for( int k = 0; k < n * 4; k++ )
         r[ k ] = operation.initialValue();

      #pragma omp for nowait
      for( int b = 0; b < blocks; b++ ) {
-         const int offset = b * block_size;
+         const IndexType offset = b * block_size;
         for( int k = 0; k < n; k++ ) {
            const DataType1* _input1 = input1 + k * ldInput1;
-            for( IndexType i = 0; i < block_size; i++ )
-               operation.firstReduction( r[ k ], offset + i, _input1, input2 );
+            ResultType* _r = r + 4 * k;
+            for( int i = 0; i < block_size; i += 4 ) {
+               operation.firstReduction( _r[ 0 ], offset + i,     _input1, input2 );
+               operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 );
+               operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 );
+               operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 );
+            }
         }
      }

@@ -213,36 +219,88 @@ reduce( Operation& operation,
      {
         for( int k = 0; k < n; k++ ) {
            const DataType1* _input1 = input1 + k * ldInput1;
+            ResultType* _r = r + 4 * k;
            for( IndexType i = blocks * block_size; i < size; i++ )
-               operation.firstReduction( r[ k ], i, _input1, input2 );
+               operation.firstReduction( _r[ 0 ], i, _input1, input2 );
         }
      }

+      // local reduction of unrolled results
+      for( int k = 0; k < n; k++ ) {
+         ResultType* _r = r + 4 * k;
+         operation.commonReduction( _r[ 0 ], _r[ 1 ] );
+         operation.commonReduction( _r[ 0 ], _r[ 2 ] );
+         operation.commonReduction( _r[ 0 ], _r[ 3 ] );
+      }
+
      // inter-thread reduction of local results
      #pragma omp critical
      {
         for( int k = 0; k < n; k++ )
-            operation.commonReduction( result[ k ], r[ k ] );
+            operation.commonReduction( result[ k ], r[ 4 * k ] );
      }
   }
   else {
 #endif
-      for( int k = 0; k < n; k++ )
-         result[ k ] = operation.initialValue();
+      if( blocks > 1 ) {
+         // initialize array for unrolled results
+         // (it is accessed as a row-major matrix with n rows and 4 columns)
+         ResultType r[ n * 4 ];
+         for( int k = 0; k < n * 4; k++ )
+            r[ k ] = operation.initialValue();
+
+         // main reduction (explicitly unrolled loop)
+         for( int b = 0; b < blocks; b++ ) {
+            const IndexType offset = b * block_size;
+            for( int k = 0; k < n; k++ ) {
+               const DataType1* _input1 = input1 + k * ldInput1;
+               ResultType* _r = r + 4 * k;
+               for( int i = 0; i < block_size; i += 4 ) {
+                  operation.firstReduction( _r[ 0 ], offset + i,     _input1, input2 );
+                  operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 );
+                  operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 );
+                  operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 );
+               }
+            }
+         }

-      for( int b = 0; b < blocks; b++ ) {
-         const int offset = b * block_size;
+         // reduction of the last, incomplete block (not unrolled)
         for( int k = 0; k < n; k++ ) {
            const DataType1* _input1 = input1 + k * ldInput1;
-            for( IndexType i = 0; i < block_size; i++ )
-               operation.firstReduction( result[ k ], offset + i, _input1, input2 );
+            ResultType* _r = r + 4 * k;
+            for( IndexType i = blocks * block_size; i < size; i++ )
+               operation.firstReduction( _r[ 0 ], i, _input1, input2 );
+         }
+
+         // reduction of unrolled results
+         for( int k = 0; k < n; k++ ) {
+            ResultType* _r = r + 4 * k;
+            operation.commonReduction( _r[ 0 ], _r[ 1 ] );
+            operation.commonReduction( _r[ 0 ], _r[ 2 ] );
+            operation.commonReduction( _r[ 0 ], _r[ 3 ] );
+
+            // copy the result into the output parameter
+            result[ k ] = _r[ 0 ];
         }
      }
+      else {
+         for( int k = 0; k < n; k++ )
+            result[ k ] = operation.initialValue();

-      for( int k = 0; k < n; k++ ) {
-         const DataType1* _input1 = input1 + k * ldInput1;
-         for( IndexType i = blocks * block_size; i < size; i++ )
-            operation.firstReduction( result[ k ], i, _input1, input2 );
+         for( int b = 0; b < blocks; b++ ) {
+            const IndexType offset = b * block_size;
+            for( int k = 0; k < n; k++ ) {
+               const DataType1* _input1 = input1 + k * ldInput1;
+               for( int i = 0; i < block_size; i++ )
+                  operation.firstReduction( result[ k ], offset + i, _input1, input2 );
+            }
+         }
+
+         for( int k = 0; k < n; k++ ) {
+            const DataType1* _input1 = input1 + k * ldInput1;
+            for( IndexType i = blocks * block_size; i < size; i++ )
+               operation.firstReduction( result[ k ], i, _input1, input2 );
+         }
      }
 #ifdef HAVE_OPENMP
   }

--- a/src/TNL/Containers/Algorithms/Reduction_impl.h
+++ b/src/TNL/Containers/Algorithms/Reduction_impl.h
@@ -179,22 +179,22 @@ reduce( Operation& operation,
   typedef typename Operation::DataType2 DataType2;
   typedef typename Operation::ResultType ResultType;

-#ifdef HAVE_OPENMP
   constexpr int block_size = 128;
+   const int blocks = size / block_size;
+
+#ifdef HAVE_OPENMP
   if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) {
      // global result variable
      ResultType result = operation.initialValue();
 #pragma omp parallel
      {
-         const int blocks = size / block_size;
-
         // initialize array for thread-local results
         ResultType r[ 4 ] = { operation.initialValue() };

         #pragma omp for nowait
         for( int b = 0; b < blocks; b++ ) {
-            const int offset = b * block_size;
-            for( IndexType i = 0; i < block_size; i += 4 ) {
+            const IndexType offset = b * block_size;
+            for( int i = 0; i < block_size; i += 4 ) {
               operation.firstReduction( r[ 0 ], offset + i,     input1, input2 );
               operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 );
               operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 );
@@ -209,7 +209,7 @@ reduce( Operation& operation,
               operation.firstReduction( r[ 0 ], i, input1, input2 );
         }

-         // reduction of local results
+         // local reduction of unrolled results
         operation.commonReduction( r[ 0 ], r[ 1 ] );
         operation.commonReduction( r[ 0 ], r[ 2 ] );
         operation.commonReduction( r[ 0 ], r[ 3 ] );
@@ -224,10 +224,38 @@ reduce( Operation& operation,
   }
   else {
 #endif
-      ResultType result = operation.initialValue();
-      for( IndexType i = 0; i < size; i++ )
-         operation.firstReduction( result, i, input1, input2 );
-      return result;
+      if( blocks > 1 ) {
+         // every unrolled accumulator must start from the initial value (a single-element initializer would zero-fill r[ 1..3 ])
+         ResultType r[ 4 ] = { operation.initialValue(), operation.initialValue(), operation.initialValue(), operation.initialValue() };
+
+         // main reduction (explicitly unrolled loop)
+         for( int b = 0; b < blocks; b++ ) {
+            const IndexType offset = b * block_size;
+            for( int i = 0; i < block_size; i += 4 ) {
+               operation.firstReduction( r[ 0 ], offset + i,     input1, input2 );
+               operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 );
+               operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 );
+               operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 );
+            }
+         }
+
+         // reduction of the last, incomplete block (not unrolled)
+         for( IndexType i = blocks * block_size; i < size; i++ )
+            operation.firstReduction( r[ 0 ], i, input1, input2 );
+
+         // reduction of unrolled results
+         operation.commonReduction( r[ 0 ], r[ 1 ] );
+         operation.commonReduction( r[ 0 ], r[ 2 ] );
+         operation.commonReduction( r[ 0 ], r[ 3 ] );
+
+         return r[ 0 ];
+      }
+      else {
+         ResultType result = operation.initialValue();
+         for( IndexType i = 0; i < size; i++ )
+            operation.firstReduction( result, i, input1, input2 );
+         return result;
+      }
 #ifdef HAVE_OPENMP
   }
 #endif

--- a/src/TNL/Matrices/Ellpack_impl.h
+++ b/src/TNL/Matrices/Ellpack_impl.h
@@ -933,7 +933,7 @@ class EllpackDeviceDependentCode< Devices::Cuda >
            //Devices::Cuda::freeFromDevice( kernel_inVector );
            //Devices::Cuda::freeFromDevice( kernel_outVector );
            TNL_CHECK_CUDA_DEVICE;
-            cudaThreadSynchronize();
+            cudaDeviceSynchronize();
         #endif
 
      }

--- a/src/TNL/Matrices/SlicedEllpack_impl.h
+++ b/src/TNL/Matrices/SlicedEllpack_impl.h
@@ -1051,7 +1051,7 @@ class SlicedEllpackDeviceDependentCode< Devices::Cuda >
            //Devices::Cuda::freeFromDevice( kernel_inVector );
            //Devices::Cuda::freeFromDevice( kernel_outVector );
            TNL_CHECK_CUDA_DEVICE;
-            cudaThreadSynchronize();
+            cudaDeviceSynchronize();
         #endif
      }


--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
@@ -57,7 +57,8 @@ class DistributedGridIO<Functions::MeshFunction<MeshType>,LocalCopy,Device>
        newMesh->setOrigin(origin+TNL::Containers::Scale(spaceSteps,localBegin));
        
        File meshFile;
-        meshFile.open( fileName+String("-mesh-")+distrGrid->printProcessCoords()+String(".tnl"),IOMode::write);
+        bool ok=meshFile.open( fileName+String("-mesh-")+distrGrid->printProcessCoords()+String(".tnl"),IOMode::write);
+        TNL_ASSERT_TRUE(ok,"Not able to open mesh file to write");
        newMesh->save( meshFile );
        meshFile.close();

@@ -72,7 +73,8 @@ class DistributedGridIO<Functions::MeshFunction<MeshType>,LocalCopy,Device>
        CopyEntitiesHelper<MeshFunctionType>::Copy(meshFunction,newMeshFunction,localBegin,zeroCoord,localSize);

        File file;
-        file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), IOMode::write );
+        ok=file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), IOMode::write );
+        TNL_ASSERT_TRUE(ok,"Not able to open file to write");
        bool ret=newMeshFunction.save(file);
        file.close();

@@ -110,7 +112,8 @@ class DistributedGridIO<Functions::MeshFunction<MeshType>,LocalCopy,Device>
        zeroCoord.setValue(0);        

        File file;
-        file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), IOMode::read );
+        bool ok=file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), IOMode::read );
+        TNL_ASSERT_TRUE(ok,"Not able to open file to read");
        bool result=newMeshFunction.boundLoad(file);
        file.close();
        CopyEntitiesHelper<MeshFunctionType>::Copy(newMeshFunction,meshFunction,zeroCoord,localBegin,localSize);
@@ -151,12 +154,13 @@ class DistributedGridIO_MPIIOBase
       MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));

 	   MPI_File file;
-       MPI_File_open( group,
+      int ok=MPI_File_open( group,
                      const_cast< char* >( fileName.getString() ),
                      MPI_MODE_CREATE | MPI_MODE_WRONLY,
                      MPI_INFO_NULL,
                      &file);
-
+      TNL_ASSERT_EQ(ok,0,"Open file falied");
+      
 		int written=save(file,meshFunction, data,0);

        MPI_File_close(&file);
@@ -322,13 +326,14 @@ class DistributedGridIO_MPIIOBase
        MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));

        MPI_File file;
-        MPI_File_open( group,
+        int ok=MPI_File_open( group,
                      const_cast< char* >( fileName.getString() ),
                      MPI_MODE_RDONLY,
                      MPI_INFO_NULL,
                      &file );
+        TNL_ASSERT_EQ(ok,0,"Open file falied");

-		bool ret= load(file, meshFunction, data,0)>0;
+		  bool ret= load(file, meshFunction, data,0)>0;

        MPI_File_close(&file);


--- a/src/TNL/Solvers/ODE/Euler_impl.h
+++ b/src/TNL/Solvers/ODE/Euler_impl.h
@@ -204,7 +204,7 @@ void Euler< Problem > :: computeNewTimeLevel( DofVectorPointer& u,
                                                                      &_u[ gridOffset ],
                                                                      this->cudaBlockResidue.getData() );
         localResidue += this->cudaBlockResidue.sum();
-         cudaThreadSynchronize();
+         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
      }
 #endif

--- a/src/TNL/Solvers/ODE/Merson_impl.h
+++ b/src/TNL/Solvers/ODE/Merson_impl.h
@@ -305,7 +305,7 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
      const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;

      this->problem->getExplicitUpdate( time, tau, u, k1 );
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();

      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
      {
@@ -313,10 +313,10 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
         const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
         computeK2Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_kAux[ gridOffset ] );
      }
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
      this->problem->applyBoundaryConditions( time + tau_3, kAux );
      this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k2 );
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();

      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
      {
@@ -324,10 +324,10 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
         const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
         computeK3Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k2[ gridOffset ], &_kAux[ gridOffset ] );
      }
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
      this->problem->applyBoundaryConditions( time + tau_3, kAux );
      this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k3 );
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();

      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
      {
@@ -335,10 +335,10 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
         const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
         computeK4Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_kAux[ gridOffset ] );
      }
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
      this->problem->applyBoundaryConditions( time + 0.5 * tau, kAux );
      this->problem->getExplicitUpdate( time + 0.5 * tau, tau, kAux, k4 );
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();

      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
      {
@@ -346,10 +346,10 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
         const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
         computeK5Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_k4[ gridOffset ], &_kAux[ gridOffset ] );
      }
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
      this->problem->applyBoundaryConditions( time + tau, kAux );
      this->problem->getExplicitUpdate( time + tau, tau, kAux, k5 );
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
 #endif
   }
 }
@@ -409,7 +409,7 @@ typename Problem :: RealType Merson< Problem > :: computeError( const RealType t
                                                              &_k4[ gridOffset ],
                                                              &_k5[ gridOffset ],
                                                              &_kAux[ gridOffset ] );
-         cudaThreadSynchronize();
+         cudaDeviceSynchronize();
         eps = std::max( eps, kAux->max() );
      }
 #endif
@@ -468,7 +468,7 @@ void Merson< Problem >::computeNewTimeLevel( const RealType time,
                                                                       &_u[ gridOffset ],
                                                                       this->cudaBlockResidue.getData() );
         localResidue += this->cudaBlockResidue.sum();
-         cudaThreadSynchronize();
+         cudaDeviceSynchronize();
      }
      this->problem->applyBoundaryConditions( time, u );


--- a/src/UnitTests/Containers/CMakeLists.txt
+++ b/src/UnitTests/Containers/CMakeLists.txt
@@ -66,6 +66,20 @@ ELSE(  BUILD_CUDA )
                              tnl )
 ENDIF( BUILD_CUDA )

+IF( BUILD_CUDA )   # CUDA build: MultireductionTest is compiled from the .cu source
+   CUDA_ADD_EXECUTABLE( MultireductionTest MultireductionTest.cu
+                        OPTIONS ${CXX_TESTS_FLAGS} )
+   TARGET_LINK_LIBRARIES( MultireductionTest
+                              ${GTEST_BOTH_LIBRARIES}
+                              tnl )
+ELSE(  BUILD_CUDA )   # build without CUDA: the same test is compiled from the .cpp source
+   ADD_EXECUTABLE( MultireductionTest MultireductionTest.cpp )
+   TARGET_COMPILE_OPTIONS( MultireductionTest PRIVATE ${CXX_TESTS_FLAGS} )
+   TARGET_LINK_LIBRARIES( MultireductionTest
+                              ${GTEST_BOTH_LIBRARIES}
+                              tnl )
+ENDIF( BUILD_CUDA )   # both variants link against the gtest libraries and tnl
+
 ADD_EXECUTABLE( StaticVectorTest StaticVectorTest.cpp )
 TARGET_COMPILE_OPTIONS( StaticVectorTest PRIVATE ${CXX_TESTS_FLAGS} )
 TARGET_LINK_LIBRARIES( StaticVectorTest
@@ -93,6 +107,7 @@ ADD_TEST( ArrayTest ${EXECUTABLE_OUTPUT_PATH}/ArrayTest${CMAKE_EXECUTABLE_SUFFIX
 ADD_TEST( ArrayViewTest ${EXECUTABLE_OUTPUT_PATH}/ArrayViewTest${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( StaticArrayTest ${EXECUTABLE_OUTPUT_PATH}/StaticArrayTest${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( VectorTest ${EXECUTABLE_OUTPUT_PATH}/VectorTest${CMAKE_EXECUTABLE_SUFFIX} )
+ADD_TEST( MultireductionTest ${EXECUTABLE_OUTPUT_PATH}/MultireductionTest${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( StaticVectorTest ${EXECUTABLE_OUTPUT_PATH}/StaticVectorTest${CMAKE_EXECUTABLE_SUFFIX} )
 #ADD_TEST( MultiArrayTest ${EXECUTABLE_OUTPUT_PATH}/MultiArrayTest${CMAKE_EXECUTABLE_SUFFIX} )


--- a/src/UnitTests/Containers/MultireductionTest.cpp
+++ b/src/UnitTests/Containers/MultireductionTest.cpp
+#include "MultireductionTest.h"
--- a/src/UnitTests/Containers/MultireductionTest.cu
+++ b/src/UnitTests/Containers/MultireductionTest.cu
+#include "MultireductionTest.h"
--- a/src/UnitTests/Containers/MultireductionTest.h
+++ b/src/UnitTests/Containers/MultireductionTest.h
+/***************************************************************************
+                          MultireductionTest.h  -  description
+                             -------------------
+    begin                : Oct 1, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#ifdef HAVE_GTEST
+#include "gtest/gtest.h"
+
+#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Containers/Algorithms/Multireduction.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Containers::Algorithms;
+
+/**
+ * Fills the given vector (or view) with the sequence 0, 1, 2, ...
+ *
+ * The values are prepared in a temporary host vector which is set up like
+ * deviceVector and then assigned to deviceVector as a whole.
+ */
+template< typename View >
+void setLinearSequence( View& deviceVector )
+{
+   using RealType = typename View::RealType;
+   using IndexType = typename View::IndexType;
+   using HostBuffer = Containers::Vector< RealType, Devices::Host, IndexType >;
+
+   HostBuffer staging;
+   staging.setLike( deviceVector );
+
+   int position = 0;
+   while( position < staging.getSize() ) {
+      staging[ position ] = position;
+      position++;
+   }
+
+   deviceVector = staging;
+}
+
+/**
+ * Fills the given vector (or view) with the sequence 0, -1, -2, ...
+ *
+ * The values are prepared in a temporary host vector which is set up like
+ * deviceVector and then assigned to deviceVector as a whole.
+ */
+template< typename View >
+void setNegativeLinearSequence( View& deviceVector )
+{
+   using RealType = typename View::RealType;
+   using IndexType = typename View::IndexType;
+   using HostBuffer = Containers::Vector< RealType, Devices::Host, IndexType >;
+
+   HostBuffer staging;
+   staging.setLike( deviceVector );
+
+   int position = 0;
+   while( position < staging.getSize() ) {
+      staging[ position ] = -position;
+      position++;
+   }
+
+   deviceVector = staging;
+}
+
+// Fixture for the typed tests: V stores n sequences of length size one after
+// another (0, 1, 2, ... for even i and 0, -1, -2, ... for odd i), y is a vector
+// of ones and result is a host vector with one entry per sequence.
+template< typename Vector >
+class MultireductionTest : public ::testing::Test
+{
+protected:
+   using DeviceVector = Vector;
+   using DeviceView = VectorView< typename Vector::RealType, typename Vector::DeviceType, typename Vector::IndexType >;
+   using HostVector = typename DeviceVector::HostType;
+   using HostView = typename DeviceView::HostType;
+
+   // should be small enough to have fast tests, but larger than minGPUReductionDataSize
+   // and large enough to require multiple CUDA blocks for reduction
+   static constexpr int size = 5000;
+
+   // number of vectors which are reduced together
+   static constexpr int n = 4;
+
+   DeviceVector V;
+   DeviceVector y;
+   HostVector result;
+
+   MultireductionTest()
+   {
+      // allocate the storage first
+      V.setSize( size * n );
+      y.setSize( size );
+      result.setSize( n );
+
+      // fill the i-th segment of V through a view bound to its first element
+      for( int segment = 0; segment < n; segment++ ) {
+         DeviceView v( &V[ segment * size ], size );
+         const bool oddSegment = ( segment % 2 != 0 );
+         if( oddSegment ) {
+            setNegativeLinearSequence( v );
+         }
+         else {
+            setLinearSequence( v );
+         }
+      }
+
+      y.setValue( 1 );
+   }
+};
+
+// types for which MultireductionTest is instantiated:
+// int and float vectors on the host, plus the same element types on the
+// CUDA device when HAVE_CUDA is defined
+using VectorTypes = ::testing::Types<
+   Vector< int,   Devices::Host >,
+   Vector< float, Devices::Host >
+#ifdef HAVE_CUDA
+   ,
+   Vector< int,   Devices::Cuda >,
+   Vector< float, Devices::Cuda >
+#endif
+>;
+
+// bind the fixture to the list of types above
+TYPED_TEST_CASE( MultireductionTest, VectorTypes );
+
+// Runs Multireduction::reduce with the scalar product operation on the data
+// prepared by the fixture and compares the i-th result with
+// 0.5 * size * ( size - 1 ) for even i and with its negative for odd i.
+TYPED_TEST( MultireductionTest, scalarProduct )
+{
+   using Device = typename TestFixture::DeviceVector::DeviceType;
+   using Real = typename TestFixture::DeviceVector::RealType;
+   using Operation = ParallelReductionScalarProduct< Real, Real >;
+
+   Operation scalarProduct;
+   Multireduction< Device >::reduce( scalarProduct,
+                                     this->n,
+                                     this->size,
+                                     this->V.getData(),
+                                     this->size,
+                                     this->y.getData(),
+                                     this->result.getData() );
+
+   for( int i = 0; i < this->n; i++ ) {
+      const bool oddIndex = ( i % 2 != 0 );
+      if( oddIndex ) {
+         EXPECT_EQ( this->result[ i ], - 0.5 * this->size * ( this->size - 1 ) );
+      }
+      else {
+         EXPECT_EQ( this->result[ i ], 0.5 * this->size * ( this->size - 1 ) );
+      }
+   }
+}
+
+#endif // HAVE_GTEST
+
+
+#include "../GtestMissingError.h"
+// Entry point of the test executable: runs all registered Google Test cases,
+// or throws GtestMissingError when HAVE_GTEST is not defined.
+int main( int argc, char* argv[] )
+{
+#ifndef HAVE_GTEST
+   throw GtestMissingError();
+#else
+   ::testing::InitGoogleTest( &argc, argv );
+   const int status = RUN_ALL_TESTS();
+   return status;
+#endif
+}
No results found