Loading src/Benchmarks/GEM/Makefile.inc +2 −2 Original line number Diff line number Diff line Loading @@ -6,8 +6,8 @@ TNL_HEADERS = ${HOME}/.local/include INSTALL_DIR = ${HOME}/.local WITH_CUDA = yes WITH_MPI = yes WITH_OPENMP = yes WITH_DEBUG = yes WITH_OPENMP = no WITH_DEBUG = no MPI_FLAGT = MPI_FLAGS = #MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi Loading src/Benchmarks/GEM/Matrix/Matrix.h +8 −0 Original line number Diff line number Diff line Loading @@ -20,6 +20,7 @@ class Matrix typedef Real RealType; typedef TNL::Containers::Vector< Index, Device, Index > CompressedRowLengthsVector; typedef TNL::Containers::Vector< Real, Device, Index > Vector; typedef TNL::Containers::Vector< Real, TNL::Devices::Host, Index > VectorHost; typedef TNL::Containers::VectorView< Real, Device, Index > VectorView; typedef TNL::Containers::VectorView< Real, TNL::Devices::Host, Index > HostVectorView; typedef TNL::Containers::VectorView< Real, TNL::Devices::Cuda, Index > DeviceVectorView; Loading @@ -28,6 +29,13 @@ class Matrix Matrix( const Index rows, const Index columns ); Matrix( Matrix< Real, Device, Index>& matrix ); /** * Reset of matrix to 0 rows, 0 columns and 0 data memory. * * @return void */ void reset(); /** * Sets dimension for matrix of rows x columns elements and allocates memory * for data array of this matrix. Loading src/Benchmarks/GEM/Matrix/Matrix_impl.h +37 −7 Original line number Diff line number Diff line Loading @@ -25,6 +25,18 @@ Matrix< Real, Device, Index >::Matrix( Matrix< Real, Device, Index>& matrix ) matrix.getData( this->data ); } template < typename Real, typename Device, typename Index > void Matrix< Real, Device, Index >::reset( ) { this->rows = 1; this->columns = 1; this->data.reset(); this->data.setSize( 1 ); } template < typename Real, typename Device, Loading @@ -40,8 +52,8 @@ void Matrix< Real, Device, Index >::setDimensions( const Index rows, const Index #if DEBUG printf("We are making host array!\n"); #endif data.setSize( rows*columns ); data.setValue( 0 ); this->data.setSize( rows*columns ); this->data.setValue( 0 ); } #ifdef HAVE_CUDA if( std::is_same< Device, TNL::Devices::Cuda >::value ) Loading Loading @@ -321,27 +333,45 @@ Matrix<Real, Device, Index >& Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix ) { this->setDimensions( matrix.getNumRows(), matrix.getNumColumns() ); Vector pom; matrix.getData( pom ); if( std::is_same< Device, Device2 >::value ) { Vector pom; matrix.getData( pom ); this->data = pom; } #ifdef HAVE_CUDA else if( std::is_same< Device, TNL::Devices::Host >::value ) { VectorHost pom; matrix.getData( pom ); VectorHost pom1; pom1.setSize( this->data.getSize() ); #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() ) #endif for( int i = 0; i < this->rows; i++ ) for( int j = 0; j < this->columns; j++ ) this->setElement(i,j, pom[ i*TNL::roundToMultiple( this->rows, TNL::Cuda::getWarpSize() ) + j ] ); pom1[ i * this->columns + j ] = pom[ i*TNL::roundToMultiple( this->rows, TNL::Cuda::getWarpSize() ) + j ]; this->data = pom1; } else if( std::is_same< Device, TNL::Devices::Cuda >::value ) { VectorHost pom; matrix.getData( pom ); VectorHost pom1; pom1.setSize( this->data.getSize() ); #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() ) #endif for( int i = 0; i < this->getNumRows(); i++ ) for( int j = 0; j < this->getNumColumns(); j++ ) { this->data.setElement( i*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + j, pom.getElement( i * this->columns + j ) ); pom1[ i*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + j ] = pom[ i * this->columns + j]; } this->data = pom1; } #endif // HAVE_CUDA return *this; Loading src/Benchmarks/GEM/gem.h +50 −46 Original line number Diff line number Diff line Loading @@ -3,18 +3,18 @@ #include "Matrix/Matrix.h" #include "gem/GEM.h" #include "TNL/tnl-dev/src/TNL/Math.h" #include <TNL/Math.h> #include <typeinfo> // type printf #include <TNL/Matrices/MatrixReader.h> #include "TNL/tnl-dev/src/TNL/Cuda/CheckDevice.h" #include "TNL/tnl-dev/src/TNL/Devices/Cuda.h" #include "TNL/tnl-dev/src/TNL/Containers/Vector.h" #include "TNL/tnl-dev/src/TNL/Cuda/MemoryHelpers.h" #include "TNL/tnl-dev/src/TNL/Devices/Host.h" #include <TNL/Cuda/CheckDevice.h> #include <TNL/Devices/Cuda.h> #include <TNL/Containers/Vector.h> #include <TNL/Cuda/MemoryHelpers.h> #include <TNL/Devices/Host.h> #ifdef HAVE_MPI #include "TNL/tnl-dev/src/TNL/Communicators/MpiCommunicator.h" #include <TNL/Communicators/MpiCommunicator.h> #include <mpi.h> #endif Loading @@ -40,9 +40,9 @@ template < typename Real, typename Index > void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName ); template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, template < typename Real, typename Index > void readMatrixVector( Matrix< Real, Devices::Host, Index>& matrix, Vector< Real, Devices::Host, Index >& vector, const String& matrixName, const String& vectorName, Index &rows, Index &nonzeros, const int verbose ); Loading @@ -55,17 +55,22 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve { typedef Matrix< Real, Device, Index > MatrixType; typedef Vector< Real, Device, Index > VectorType; typedef Matrix< Real, Devices::Host, Index > MatrixTypeHost; typedef Vector< Real, Devices::Host, Index > VectorTypeHost; int processID = 0; // MPI processID, without mpi == 0 #ifdef HAVE_MPI MPI_Comm_rank( MPI_COMM_WORLD, &processID ); Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif MatrixTypeHost matrixHost; VectorTypeHost vectorHost; MatrixType matrix; VectorType vector; VectorType vectorResult; Index rows, nonzeros; readMatrixVector( matrix, vector, matrixName, vectorName, rows, nonzeros, verbose ); VectorType vectorResult( rows ); readMatrixVector( matrixHost, vectorHost, matrixName, vectorName, rows, nonzeros, verbose ); // Computation double* time; Loading @@ -80,10 +85,14 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif MatrixType matrixComp = matrix; VectorType vectorComp( vector ); //readMatrixVector( matrixHost, vectorHost, matrixName, vectorName, rows, nonzeros, verbose ); matrix.reset(); vector.reset(); matrix = matrixHost; vector = vectorHost; vectorResult.setSize( rows ); vectorResult.setValue( 0 ); GEM< Real, Device, Index > gem( matrixComp, vectorComp ); GEM< Real, Device, Index > gem( matrix, vector ); #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); Loading Loading @@ -125,15 +134,16 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve timeMean /= loops; if( processID == 0 ) printf("%20s %15d %15d %20s %15s %15s %10d %15.5f %15.5f\n", matrixName.c_str(), rows, nonzeros, vectorName == "none" ? "-":vectorName.c_str(), device.c_str(), typeid(Real).name() == (string)"f" ? "float":"double", loops, timeMean, error); printf("%20s %15s %15s %10d %20s & %15d & %15.3f & %15.3f\n", vectorName == "none" ? "-":vectorName.c_str(), device.c_str(), typeid(Real).name() == (string)"f" ? "float":"double", loops, matrixName.c_str(), rows, timeMean, error); delete []time; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif matrix.reset(); if( processID == 0 ){ //printf("%d: returning\n", processID ); return vectorResult; Loading @@ -146,47 +156,39 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve } } template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, template < typename Real, typename Index > void readMatrixVector( Matrix< Real, Devices::Host, Index>& matrix, Vector< Real, Devices::Host, Index >& vector, const String& matrixName, const String& vectorName, Index &rows, Index &nonzeros, const int verbose ) { typedef Matrix< Real, Devices::Host, Index> MatrixHost; MatrixHost matrixHost; if( verbose > 1 ) cout << "reading matrix " << matrixName << endl; // Get matrix Matrices::MatrixReader< MatrixHost > m; m.readMtxFile( "./test-matrices/" + matrixName, matrixHost, verbose ); rows = matrixHost.getNumRows(); nonzeros = matrixHost.getNumNonzeros(); Matrices::MatrixReader< Matrix< Real, Devices::Host, Index> > m; m.readMtxFile( "./test-matrices/" + matrixName, matrix, verbose ); rows = matrix.getNumRows(); nonzeros = matrix.getNumNonzeros(); vector.setSize( rows ); // Get vector Vector< Real, Devices::Host, Index > vectorHost( matrixHost.getNumRows() ); if( vectorName == "none" ) calculHostVecOne( matrixHost, vectorHost, vectorName ); else if( vectorName == "none" ){ calculHostVecOne( matrix, vector, vectorName ); }else { readVector( vectorHost, vectorName ); readVector( vector, vectorName ); if( verbose > 1 ) cout << "reading vector " << vectorName << endl; } #ifdef HAVE_MPI cutMatrixVectorMPI( matrixHost, vectorHost ); cutMatrixVectorMPI( matrix, vector ); #endif if( verbose > 2 ) matrixHost.showMatrix(); matrix.showMatrix(); if( verbose > 2 ) cout << vectorHost << endl; // Copy from CPU into matrix dependent on template Device vector = vectorHost; matrix = matrixHost; cout << vector << endl; } template < typename Real, typename Index > Loading @@ -200,7 +202,7 @@ void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix, { pom += matrix.getElement(i,j); } vector[i] = pom; vector.setElement( i, pom ); } /*ofstream outdata; // outdata is like cin Loading Loading @@ -275,6 +277,8 @@ void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix, matrixTemp.setElement(i,j,0); } } //if( j % 1000 == 0 ) // printf("%d: cutting col %d\n", processID, j ); } for( int i = 0; i < numRowsCUT; i++ ){ if( i + numRowsCUT * processID < matrix.getNumRows() ){ Loading @@ -283,7 +287,7 @@ void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix, vectorTemp.setElement( i, 0 ); } } matrix.reset(); vector.reset(); matrix = matrixTemp; vector = vectorTemp; } Loading src/Benchmarks/GEM/gem/GEMdevice.h +1 −1 Original line number Diff line number Diff line #include "TNL/Cuda/MemoryHelpers.h" #include <TNL/Cuda/MemoryHelpers.h> #define DEBUG 0 Loading Loading
src/Benchmarks/GEM/Makefile.inc +2 −2 Original line number Diff line number Diff line Loading @@ -6,8 +6,8 @@ TNL_HEADERS = ${HOME}/.local/include INSTALL_DIR = ${HOME}/.local WITH_CUDA = yes WITH_MPI = yes WITH_OPENMP = yes WITH_DEBUG = yes WITH_OPENMP = no WITH_DEBUG = no MPI_FLAGT = MPI_FLAGS = #MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi Loading
src/Benchmarks/GEM/Matrix/Matrix.h +8 −0 Original line number Diff line number Diff line Loading @@ -20,6 +20,7 @@ class Matrix typedef Real RealType; typedef TNL::Containers::Vector< Index, Device, Index > CompressedRowLengthsVector; typedef TNL::Containers::Vector< Real, Device, Index > Vector; typedef TNL::Containers::Vector< Real, TNL::Devices::Host, Index > VectorHost; typedef TNL::Containers::VectorView< Real, Device, Index > VectorView; typedef TNL::Containers::VectorView< Real, TNL::Devices::Host, Index > HostVectorView; typedef TNL::Containers::VectorView< Real, TNL::Devices::Cuda, Index > DeviceVectorView; Loading @@ -28,6 +29,13 @@ class Matrix Matrix( const Index rows, const Index columns ); Matrix( Matrix< Real, Device, Index>& matrix ); /** * Reset of matrix to 0 rows, 0 columns and 0 data memory. * * @return void */ void reset(); /** * Sets dimension for matrix of rows x columns elements and allocates memory * for data array of this matrix. Loading
src/Benchmarks/GEM/Matrix/Matrix_impl.h +37 −7 Original line number Diff line number Diff line Loading @@ -25,6 +25,18 @@ Matrix< Real, Device, Index >::Matrix( Matrix< Real, Device, Index>& matrix ) matrix.getData( this->data ); } template < typename Real, typename Device, typename Index > void Matrix< Real, Device, Index >::reset( ) { this->rows = 1; this->columns = 1; this->data.reset(); this->data.setSize( 1 ); } template < typename Real, typename Device, Loading @@ -40,8 +52,8 @@ void Matrix< Real, Device, Index >::setDimensions( const Index rows, const Index #if DEBUG printf("We are making host array!\n"); #endif data.setSize( rows*columns ); data.setValue( 0 ); this->data.setSize( rows*columns ); this->data.setValue( 0 ); } #ifdef HAVE_CUDA if( std::is_same< Device, TNL::Devices::Cuda >::value ) Loading Loading @@ -321,27 +333,45 @@ Matrix<Real, Device, Index >& Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix ) { this->setDimensions( matrix.getNumRows(), matrix.getNumColumns() ); Vector pom; matrix.getData( pom ); if( std::is_same< Device, Device2 >::value ) { Vector pom; matrix.getData( pom ); this->data = pom; } #ifdef HAVE_CUDA else if( std::is_same< Device, TNL::Devices::Host >::value ) { VectorHost pom; matrix.getData( pom ); VectorHost pom1; pom1.setSize( this->data.getSize() ); #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() ) #endif for( int i = 0; i < this->rows; i++ ) for( int j = 0; j < this->columns; j++ ) this->setElement(i,j, pom[ i*TNL::roundToMultiple( this->rows, TNL::Cuda::getWarpSize() ) + j ] ); pom1[ i * this->columns + j ] = pom[ i*TNL::roundToMultiple( this->rows, TNL::Cuda::getWarpSize() ) + j ]; this->data = pom1; } else if( std::is_same< Device, TNL::Devices::Cuda >::value ) { VectorHost pom; matrix.getData( pom ); VectorHost pom1; pom1.setSize( this->data.getSize() ); #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() ) #endif for( int i = 0; i < this->getNumRows(); i++ ) for( int j = 0; j < this->getNumColumns(); j++ ) { this->data.setElement( i*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + j, pom.getElement( i * this->columns + j ) ); pom1[ i*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + j ] = pom[ i * this->columns + j]; } this->data = pom1; } #endif // HAVE_CUDA return *this; Loading
src/Benchmarks/GEM/gem.h +50 −46 Original line number Diff line number Diff line Loading @@ -3,18 +3,18 @@ #include "Matrix/Matrix.h" #include "gem/GEM.h" #include "TNL/tnl-dev/src/TNL/Math.h" #include <TNL/Math.h> #include <typeinfo> // type printf #include <TNL/Matrices/MatrixReader.h> #include "TNL/tnl-dev/src/TNL/Cuda/CheckDevice.h" #include "TNL/tnl-dev/src/TNL/Devices/Cuda.h" #include "TNL/tnl-dev/src/TNL/Containers/Vector.h" #include "TNL/tnl-dev/src/TNL/Cuda/MemoryHelpers.h" #include "TNL/tnl-dev/src/TNL/Devices/Host.h" #include <TNL/Cuda/CheckDevice.h> #include <TNL/Devices/Cuda.h> #include <TNL/Containers/Vector.h> #include <TNL/Cuda/MemoryHelpers.h> #include <TNL/Devices/Host.h> #ifdef HAVE_MPI #include "TNL/tnl-dev/src/TNL/Communicators/MpiCommunicator.h" #include <TNL/Communicators/MpiCommunicator.h> #include <mpi.h> #endif Loading @@ -40,9 +40,9 @@ template < typename Real, typename Index > void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName ); template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, template < typename Real, typename Index > void readMatrixVector( Matrix< Real, Devices::Host, Index>& matrix, Vector< Real, Devices::Host, Index >& vector, const String& matrixName, const String& vectorName, Index &rows, Index &nonzeros, const int verbose ); Loading @@ -55,17 +55,22 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve { typedef Matrix< Real, Device, Index > MatrixType; typedef Vector< Real, Device, Index > VectorType; typedef Matrix< Real, Devices::Host, Index > MatrixTypeHost; typedef Vector< Real, Devices::Host, Index > VectorTypeHost; int processID = 0; // MPI processID, without mpi == 0 #ifdef HAVE_MPI MPI_Comm_rank( MPI_COMM_WORLD, &processID ); Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif MatrixTypeHost matrixHost; VectorTypeHost vectorHost; MatrixType matrix; VectorType vector; VectorType vectorResult; Index rows, nonzeros; readMatrixVector( matrix, vector, matrixName, vectorName, rows, nonzeros, verbose ); VectorType vectorResult( rows ); readMatrixVector( matrixHost, vectorHost, matrixName, vectorName, rows, nonzeros, verbose ); // Computation double* time; Loading @@ -80,10 +85,14 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif MatrixType matrixComp = matrix; VectorType vectorComp( vector ); //readMatrixVector( matrixHost, vectorHost, matrixName, vectorName, rows, nonzeros, verbose ); matrix.reset(); vector.reset(); matrix = matrixHost; vector = vectorHost; vectorResult.setSize( rows ); vectorResult.setValue( 0 ); GEM< Real, Device, Index > gem( matrixComp, vectorComp ); GEM< Real, Device, Index > gem( matrix, vector ); #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); Loading Loading @@ -125,15 +134,16 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve timeMean /= loops; if( processID == 0 ) printf("%20s %15d %15d %20s %15s %15s %10d %15.5f %15.5f\n", matrixName.c_str(), rows, nonzeros, vectorName == "none" ? "-":vectorName.c_str(), device.c_str(), typeid(Real).name() == (string)"f" ? "float":"double", loops, timeMean, error); printf("%20s %15s %15s %10d %20s & %15d & %15.3f & %15.3f\n", vectorName == "none" ? "-":vectorName.c_str(), device.c_str(), typeid(Real).name() == (string)"f" ? "float":"double", loops, matrixName.c_str(), rows, timeMean, error); delete []time; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif matrix.reset(); if( processID == 0 ){ //printf("%d: returning\n", processID ); return vectorResult; Loading @@ -146,47 +156,39 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve } } template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, template < typename Real, typename Index > void readMatrixVector( Matrix< Real, Devices::Host, Index>& matrix, Vector< Real, Devices::Host, Index >& vector, const String& matrixName, const String& vectorName, Index &rows, Index &nonzeros, const int verbose ) { typedef Matrix< Real, Devices::Host, Index> MatrixHost; MatrixHost matrixHost; if( verbose > 1 ) cout << "reading matrix " << matrixName << endl; // Get matrix Matrices::MatrixReader< MatrixHost > m; m.readMtxFile( "./test-matrices/" + matrixName, matrixHost, verbose ); rows = matrixHost.getNumRows(); nonzeros = matrixHost.getNumNonzeros(); Matrices::MatrixReader< Matrix< Real, Devices::Host, Index> > m; m.readMtxFile( "./test-matrices/" + matrixName, matrix, verbose ); rows = matrix.getNumRows(); nonzeros = matrix.getNumNonzeros(); vector.setSize( rows ); // Get vector Vector< Real, Devices::Host, Index > vectorHost( matrixHost.getNumRows() ); if( vectorName == "none" ) calculHostVecOne( matrixHost, vectorHost, vectorName ); else if( vectorName == "none" ){ calculHostVecOne( matrix, vector, vectorName ); }else { readVector( vectorHost, vectorName ); readVector( vector, vectorName ); if( verbose > 1 ) cout << "reading vector " << vectorName << endl; } #ifdef HAVE_MPI cutMatrixVectorMPI( matrixHost, vectorHost ); cutMatrixVectorMPI( matrix, vector ); #endif if( verbose > 2 ) matrixHost.showMatrix(); matrix.showMatrix(); if( verbose > 2 ) cout << vectorHost << endl; // Copy from CPU into matrix dependent on template Device vector = vectorHost; matrix = matrixHost; cout << vector << endl; } template < typename Real, typename Index > Loading @@ -200,7 +202,7 @@ void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix, { pom += matrix.getElement(i,j); } vector[i] = pom; vector.setElement( i, pom ); } /*ofstream outdata; // outdata is like cin Loading Loading @@ -275,6 +277,8 @@ void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix, matrixTemp.setElement(i,j,0); } } //if( j % 1000 == 0 ) // printf("%d: cutting col %d\n", processID, j ); } for( int i = 0; i < numRowsCUT; i++ ){ if( i + numRowsCUT * processID < matrix.getNumRows() ){ Loading @@ -283,7 +287,7 @@ void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix, vectorTemp.setElement( i, 0 ); } } matrix.reset(); vector.reset(); matrix = matrixTemp; vector = vectorTemp; } Loading
src/Benchmarks/GEM/gem/GEMdevice.h +1 −1 Original line number Diff line number Diff line #include "TNL/Cuda/MemoryHelpers.h" #include <TNL/Cuda/MemoryHelpers.h> #define DEBUG 0 Loading