Commit f5010d55 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Added matrix setup benchmarks for dense and multidiagonal matrix.

parent 010045a1
Loading
Loading
Loading
Loading
+29 −8
Original line number Diff line number Diff line
@@ -94,15 +94,36 @@ IF( BUILD_CUDA )
                       ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixViewExample_setElement.out
                       OUTPUT SparseMatrixViewExample_setElement.out )

   CUDA_ADD_EXECUTABLE( MatrixSetup_Benchmark_cuda MatrixSetup_Benchmark.cu )
   ADD_CUSTOM_COMMAND( COMMAND MatrixSetup_Benchmark_cuda >
                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MatrixSetup_Benchmark.out
                        OUTPUT MatrixSetup_Benchmark.out )
   CUDA_ADD_EXECUTABLE( DenseMatrixSetup_Benchmark_cuda DenseMatrixSetup_Benchmark.cu )
   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixSetup_Benchmark_cuda >
                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixSetup_Benchmark.out
                        OUTPUT DenseMatrixSetup_Benchmark.out )

   CUDA_ADD_EXECUTABLE( SparseMatrixSetup_Benchmark_cuda SparseMatrixSetup_Benchmark.cu )
   ADD_CUSTOM_COMMAND( COMMAND SparseMatrixSetup_Benchmark_cuda >
                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixSetup_Benchmark.out
                        OUTPUT SparseMatrixSetup_Benchmark.out )

   CUDA_ADD_EXECUTABLE( MultidiagonalMatrixSetup_Benchmark_cuda MultidiagonalMatrixSetup_Benchmark.cu )
   ADD_CUSTOM_COMMAND( COMMAND MultidiagonalMatrixSetup_Benchmark_cuda >
                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MultidiagonalMatrixSetup_Benchmark.out
                        OUTPUT MultidiagonalMatrixSetup_Benchmark.out )

ELSE()
   ADD_EXECUTABLE( MatrixSetup_Benchmark MatrixSetup_Benchmark_cuda.cpp )
   ADD_CUSTOM_COMMAND( COMMAND MatrixSetup_Benchmark >
                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MatrixSetup_Benchmark.out
                        OUTPUT MatrixSetup_Benchmark.out )
   ADD_EXECUTABLE( DenseMatrixSetup_Benchmark DenseMatrixSetup_Benchmark_cuda.cpp )
   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixSetup_Benchmark >
                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixSetup_Benchmark.out
                        OUTPUT DenseMatrixSetup_Benchmark.out )

   ADD_EXECUTABLE( SparseMatrixSetup_Benchmark SparseMatrixSetup_Benchmark_cuda.cpp )
   ADD_CUSTOM_COMMAND( COMMAND SparseMatrixSetup_Benchmark >
                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixSetup_Benchmark.out
                        OUTPUT SparseMatrixSetup_Benchmark.out )

   ADD_EXECUTABLE( MultidiagonalMatrixSetup_Benchmark MultidiagonalMatrixSetup_Benchmark_cuda.cpp )
   ADD_CUSTOM_COMMAND( COMMAND MultidiagonalMatrixSetup_Benchmark >
                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MultidiagonalMatrixSetup_Benchmark.out
                        OUTPUT MultidiagonalMatrixSetup_Benchmark.out )
ENDIF()

IF( BUILD_CUDA )
+123 −0
Original line number Diff line number Diff line
#include <iostream>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Matrices/SparseMatrix.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Timer.h>

// Number of repetitions of every measured setup; the printed times are divided by it.
constexpr int testsCount = 5;

template< typename Matrix >
void setElement_on_host( const int matrixSize, Matrix& matrix )
{
   /***
    * Resize the matrix to matrixSize x matrixSize and fill it one element at
    * a time with setElement called directly on the matrix. The element at
    * ( row, column ) receives the value row + column.
    */
   matrix.setDimensions( matrixSize, matrixSize );

   // The column index is the outer loop, the row index the inner one.
   for( int column = 0; column < matrixSize; ++column )
      for( int row = 0; row < matrixSize; ++row )
         matrix.setElement( row, column, row + column );
}

template< typename Matrix >
void setElement_on_device( const int matrixSize, Matrix& matrix )
{
   matrix.setDimensions( matrixSize, matrixSize );

   auto matrixView = matrix.getView();
   auto f = [=] __cuda_callable__ ( int i, int j ) mutable {
         matrixView.setElement( i, j,  i + j );
   };
   TNL::Algorithms::ParallelFor2D< typename Matrix::DeviceType >::exec( 0, 0, matrixSize, matrixSize, f );
}

template< typename Matrix >
void getRow( const int matrixSize, Matrix& matrix )
{
   matrix.setDimensions( matrixSize, matrixSize );

   auto matrixView = matrix.getView();
   auto f = [=] __cuda_callable__ ( int rowIdx ) mutable {
      auto row = matrixView.getRow( rowIdx );
      for( int i = 0; i < matrixSize; i++ )
         row.setElement( i, rowIdx + i );
   };
   TNL::Algorithms::ParallelFor< typename Matrix::DeviceType >::exec( 0, matrixSize, f );
}

template< typename Matrix >
void forRows( const int matrixSize, Matrix& matrix )
{
   /***
    * Resize the matrix to matrixSize x matrixSize and fill it using the
    * forRows method of the matrix, which is handed a lambda that writes
    * the value rowIdx + columnIdx through the `value` reference.
    */
   matrix.setDimensions( matrixSize, matrixSize );

   // localIdx and compute are part of the callback signature but are not used here.
   auto assignValue = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, float& value, bool& compute ) mutable {
      value = rowIdx + columnIdx;
   };
   matrix.forRows( 0, matrixSize, assignValue );
}

template< typename Device >
void setupDenseMatrix()
{
   std::cout << " Dense matrix test:" << std::endl;
   for( int matrixSize = 16; matrixSize <= 8192; matrixSize *= 2 )
   {
      std::cout << "  Matrix size = " << matrixSize << std::endl;
      TNL::Timer timer;

      std::cout << "   setElement on host: ";
      timer.reset();
      timer.start();
      for( int i = 0; i < testsCount; i++ )
      {
         TNL::Matrices::DenseMatrix< float, Device, int > matrix;
         setElement_on_host( matrixSize, matrix );
      }
      timer.stop();
      std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl;

      std::cout << "   setElement on device: ";
      timer.reset();
      timer.start();
      for( int i = 0; i < testsCount; i++ )
      {
         TNL::Matrices::DenseMatrix< float, Device, int > matrix;
         setElement_on_device( matrixSize, matrix );
      }
      timer.stop();
      std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl;

      std::cout << "   getRow: ";
      timer.reset();
      timer.start();
      for( int i = 0; i < testsCount; i++ )
      {
         TNL::Matrices::DenseMatrix< float, Device, int > matrix;
         getRow( matrixSize, matrix );
      }
      timer.stop();
      std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl;

      std::cout << "   forRows: ";
      timer.reset();
      timer.start();
      for( int i = 0; i < testsCount; i++ )
      {
         TNL::Matrices::DenseMatrix< float, Device, int > matrix;
         forRows( matrixSize, matrix );
      }
      timer.stop();
      std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl;
   }
}


int main( int argc, char* argv[] )
{
   // The host variant of the benchmark is always executed.
   std::cout << "Creating dense matrix on CPU ... " << std::endl;
   setupDenseMatrix< TNL::Devices::Host >();

   // The CUDA variant is compiled in only when HAVE_CUDA is defined.
#if defined( HAVE_CUDA )
   std::cout << "Creating dense matrix on CUDA GPU ... " << std::endl;
   setupDenseMatrix< TNL::Devices::Cuda >();
#endif

   return 0;
}
+1 −0
Original line number Diff line number Diff line
DenseMatrixSetup_Benchmark.cpp
 No newline at end of file
+0 −1
Original line number Diff line number Diff line
MatrixSetup_Benchmark.cpp
 No newline at end of file
+221 −0
Original line number Diff line number Diff line
#include <iostream>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Matrices/MultidiagonalMatrix.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Timer.h>

// Number of repetitions of every measured setup; the printed times are divided by it.
constexpr int testsCount = 5;

template< typename Device >
TNL::Containers::Vector< int, Device > getOffsets( const int gridSize )
{
   /***
    * Build the vector of the five diagonal offsets used by the benchmark:
    * -gridSize, -1, 0, 1 and gridSize (in this order).
    */
   const int diagonalsCount = 5;
   const int diagonalOffsets[ diagonalsCount ] = { -gridSize, -1, 0, 1, gridSize };

   TNL::Containers::Vector< int, Device > offsets( diagonalsCount );
   for( int k = 0; k < diagonalsCount; ++k )
      offsets.setElement( k, diagonalOffsets[ k ] );
   return offsets;
}

template< typename Matrix >
void setElement_on_host( const int gridSize, Matrix& matrix )
{
   /***
    * Set  matrix representing approximation of the Laplace operator on regular
    * grid using the finite difference method by means setElement method called
    * from the host system.
    */
   const int matrixSize = gridSize * gridSize;
   matrix.setDimensions( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) );

   for( int j = 0; j < gridSize; j++ )
      for( int i = 0; i < gridSize; i++ )
      {
         const int rowIdx = j * gridSize + i;
         if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 )
            matrix.setElement( rowIdx, rowIdx,  1.0 );
         else
         {
            matrix.setElement( rowIdx, rowIdx - gridSize,  1.0 );
            matrix.setElement( rowIdx, rowIdx - 1,  1.0 );
            matrix.setElement( rowIdx, rowIdx,  -4.0 );
            matrix.setElement( rowIdx, rowIdx + 1,  1.0 );
            matrix.setElement( rowIdx, rowIdx + gridSize,  1.0 );
         }
      }
}

template< typename Matrix >
void setElement_on_device( const int gridSize, Matrix& matrix )
{
   /***
    * Set  matrix representing approximation of the Laplace operator on regular
    * grid using the finite difference method by means of setElement method called
    * from the native device.
    */
   const int matrixSize = gridSize * gridSize;
   matrix.setDimensions( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) );

   auto matrixView = matrix.getView();
   auto f = [=] __cuda_callable__ ( int i, int j ) mutable {
      const int rowIdx = j * gridSize + i;
      if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 )
         matrixView.setElement( rowIdx, rowIdx,  1.0 );
      else
      {
         matrixView.setElement( rowIdx, rowIdx - gridSize,  1.0 );
         matrixView.setElement( rowIdx, rowIdx - 1,  1.0 );
         matrixView.setElement( rowIdx, rowIdx,  -4.0 );
         matrixView.setElement( rowIdx, rowIdx + 1,  1.0 );
         matrixView.setElement( rowIdx, rowIdx + gridSize,  1.0 );
      }
   };
   TNL::Algorithms::ParallelFor2D< typename Matrix::DeviceType >::exec( 0, 0, gridSize, gridSize, f );
}

template< typename Matrix >
void getRow( const int gridSize, Matrix& matrix )
{
   /***
    * Set  matrix representing approximation of the Laplace operator on regular
    * grid using the finite difference method by means of getRow method.
    */
   const int matrixSize = gridSize * gridSize;
   matrix.setDimensions( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) );

   auto matrixView = matrix.getView();
   auto f = [=] __cuda_callable__ ( int rowIdx ) mutable {
      const int i = rowIdx % gridSize;
      const int j = rowIdx / gridSize;
      auto row = matrixView.getRow( rowIdx );
      if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 )
         row.setElement( 2, 1.0 );
      else
      {
         row.setElement( 0, 1.0 );
         row.setElement( 1, 1.0 );
         row.setElement( 2, -4.0 );
         row.setElement( 3, 1.0 );
         row.setElement( 4, 1.0 );
      }
   };
   TNL::Algorithms::ParallelFor< typename Matrix::DeviceType >::exec( 0, matrixSize, f );
}

template< typename Matrix >
void forRows( const int gridSize, Matrix& matrix )
{
   /***
    * Set the matrix representing an approximation of the Laplace operator on
    * a regular grid using the finite difference method by means of the
    * forRows method.
    *
    * The result is meant to match the other setup functions in this file:
    * rows of boundary grid nodes get only 1.0 on the main diagonal, rows of
    * interior nodes get -4.0 on the main diagonal and 1.0 on the remaining
    * four diagonals.
    */

   const int matrixSize = gridSize * gridSize;
   matrix.setDimensions( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) );

   // localIdx is the index of the diagonal as ordered by getOffsets, so the
   // main diagonal (offset 0) is local index 2. columnIdx is received by
   // value and therefore cannot be changed from here; columnIdx and compute
   // are not needed.
   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, float& value, bool& compute ) mutable {
      const int i = rowIdx % gridSize;
      const int j = rowIdx / gridSize;
      const bool onBoundary = ( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 );
      const bool onMainDiagonal = ( localIdx == 2 );

      if( onBoundary )
      {
         // Boundary rows: touch the diagonal entry only, exactly as getRow
         // and the setElement variants do.
         if( onMainDiagonal )
            value = 1.0;
      }
      else
      {
         // Interior rows: five-point stencil.
         value = onMainDiagonal ? -4.0 : 1.0;
      }
   };
   matrix.forRows( 0, matrixSize, f );
}

template< typename Device >
void laplaceOperatorMultidiagonalMatrix()
{
   std::cout << " Sparse matrix test:" << std::endl;
   for( int gridSize = 16; gridSize <= 8192; gridSize *= 2 )
   {
      std::cout << "  Grid size = " << gridSize << std::endl;
      TNL::Timer timer;

      std::cout << "   setElement on host: ";
      timer.reset();
      timer.start();
      for( int i = 0; i < testsCount; i++ )
      {
         TNL::Matrices::MultidiagonalMatrix< float, Device, int > matrix;
         setElement_on_host( gridSize, matrix );
      }
      timer.stop();
      std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl;

      std::cout << "   setElement on device: ";
      timer.reset();
      timer.start();
      for( int i = 0; i < testsCount; i++ )
      {
         TNL::Matrices::MultidiagonalMatrix< float, Device, int > matrix;
         setElement_on_device( gridSize, matrix );
      }
      timer.stop();
      std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl;

      std::cout << "   getRow: ";
      timer.reset();
      timer.start();
      for( int i = 0; i < testsCount; i++ )
      {
         TNL::Matrices::MultidiagonalMatrix< float, Device, int > matrix;
         getRow( gridSize, matrix );
      }
      timer.stop();
      std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl;

      std::cout << "   forRows: ";
      timer.reset();
      timer.start();
      for( int i = 0; i < testsCount; i++ )
      {
         TNL::Matrices::MultidiagonalMatrix< float, Device, int > matrix;
         forRows( gridSize, matrix );
      }
      timer.stop();
      std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl;

   }
}

int main( int argc, char* argv[] )
{
   // The host variant of the benchmark is always executed.
   std::cout << "Creating Laplace operator matrix on CPU ... " << std::endl;
   laplaceOperatorMultidiagonalMatrix< TNL::Devices::Host >();

   // The CUDA variant is compiled in only when HAVE_CUDA is defined.
#if defined( HAVE_CUDA )
   std::cout << "Creating Laplace operator matrix on CUDA GPU ... " << std::endl;
   laplaceOperatorMultidiagonalMatrix< TNL::Devices::Cuda >();
#endif

   return 0;
}
Loading