Loading src/Benchmarks/GEM/Matrix/Matrix_impl.h +1 −1 Original line number Diff line number Diff line Loading @@ -161,7 +161,7 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix matrix.getData( pom ); if( std::is_same< Device, Device2 >::value ) { printf("copy host to host or device to device\n"); //printf("copy host to host or device to device\n"); this->data = pom; } #ifdef HAVE_CUDA Loading src/Benchmarks/GEM/gem.h 0 → 100644 +212 −0 Original line number Diff line number Diff line #include "Matrix/Matrix.h" #include "gem/GEM.h" #include "TNL/tnl-dev/src/TNL/Math.h" //#include "gem/GEMdevice.h" #include <TNL/Matrices/MatrixReader.h> #include "TNL/tnl-dev/src/TNL/Cuda/CheckDevice.h" #include "TNL/tnl-dev/src/TNL/Devices/Cuda.h" #include "TNL/tnl-dev/src/TNL/Containers/Vector.h" #include "TNL/tnl-dev/src/TNL/Cuda/MemoryHelpers.h" #include "TNL/tnl-dev/src/TNL/Devices/Host.h" #define COMPARE_RESULTS true; // Prihodit GEMdeice do GEM a dělat dělení na device až tam -> přidat parametr pivoting using namespace TNL; using namespace TNL::Containers; using namespace std; template < typename Real, typename Index > void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix, Vector< Real, Devices::Host, Index >& vector, const String& vectorName ); template < typename Real, typename Index > void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName ); template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, const String& matrixName, const String& vectorName ); template < typename Real, typename Index, typename Device > Vector< Real, Device, Index > runGEM( const String& matrixName, const String& vectorName, const int loops, const int verbose, const String& device, const String& pivoting ) { typedef Matrix< Real, Device, Index > MatrixType; typedef Vector< Real, Device, Index > VectorType; MatrixType matrix; VectorType vector; readMatrixVector( matrix, vector, matrixName, vectorName ); VectorType vectorResult( matrix.getNumRows() ); // Computation double* time; time = new double[ loops ]; std::cout << "Starting computation on " << device << endl; for( int i = 0; i < loops; i++ ) { MatrixType matrixComp = matrix; VectorType vectorComp( vector ); vectorComp.setValue( 0 ); vectorResult.setValue( 0 ); GEM< Real, Device, Index > gem( matrixComp, vectorComp ); std::clock_t start; double duration; start = std::clock(); cout << "starting computation number " << i+1 << endl; gem.solve( vectorResult, pivoting, 0 ); duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; time[i] = duration; double l2norm = 0; for( int j = 0; j < matrix.getNumRows(); j++ ) l2norm += (vectorResult.getElement( i ) - 1)*(vectorResult.getElement( i ) - 1); l2norm = std::sqrt(l2norm); printf( " %.4f ", l2norm); } printf("\n ... done!\n"); delete []time; return vectorResult; /* //#ifdef HAVE_CUDA #ifdef COMPARE_RESULTS double error = 0.0; //std::cout << "Results:\nrow:"<< setw(20) << "Host" << setw(20) << "Device" << setw(20) << "Error" << std::endl; for( int i = 0; i < matrix.getNumRows(); i++ ) { double errorPom = ( result_vector.getElement(i) - result_vector_dev.getElement(i) ) * ( result_vector.getElement(i) - result_vector_dev.getElement(i) ); //std::cout << i << ": " << setw(20) << result_vector.getElement(i) << setw(20) << result_vector_dev.getElement(i) << setw(20) << errorPom << std::endl; error += errorPom; } error = std::sqrt(error); printf("Difference in L2 norm from Device and Host is %.8f\n", error ); double CPUmean(0), GPUmean(0); cout << "Timers: " << endl << "CPU: [ "; for( int i = 0; i < loops; i++ ) { CPUmean += timeCPU[i]; GPUmean += timeGPU[i]; cout << timeCPU[i] << " "; } CPUmean = CPUmean/loops; GPUmean = GPUmean/loops; cout << "]" << endl; cout << "GPU: [ "; for( int i = 0; i < loops; i++ ) cout << timeGPU[i] << " "; cout << "]" << endl; cout << "CPU mean time: " << CPUmean << endl; cout << "GPU mean time: " << GPUmean << endl; #endif #endif delete []timeCPU; #ifdef HAVE_CUDA delete []timeGPU; #endif return;*/ } template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, const String& matrixName, const String& vectorName ) { typedef Matrix< Real, Devices::Host, Index> MatrixHost; MatrixHost matrixHost; // Get matrix Matrices::MatrixReader< MatrixHost > m; m.readMtxFile( "./test-matrices/" + matrixName, matrixHost ); cout << "reading matrix " << matrixName << endl; //matrixHost.showMatrix(); matrix = matrixHost; // Get vector Vector< Real, Devices::Host, Index > vectorHost( matrixHost.getNumRows() ); if( vectorName == "none" ) calculHostVecOne( matrixHost, vectorHost, vectorName ); else readVector( vectorHost, vectorName ); cout << "reading vector " << vectorName << endl; vector = vectorHost; } template < typename Real, typename Index > void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix, Vector< Real, Devices::Host, Index >& vector, const String& vectorName ) { for( int i = 0; i < matrix.getNumRows(); i++ ) { Real pom = 0; for( int j = 0; j < matrix.getNumColumns(); j++ ) { pom += matrix.getElement(i,j); } vector[i] = pom; } cout << endl; ofstream outdata; // outdata is like cin outdata.open("./test-matrices/" + vectorName ); // opens the file if( !outdata ) { // file couldn't be opened cerr << "Error: file could not be opened" << endl; exit(1); } for( int i = 0; i < vector.getSize(); i++ ) { outdata << vector[i] << endl; } outdata.close(); cout << endl; } template < typename Real, typename Index > void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName ) { ifstream inFile; Real x; inFile.open("./test-matrices/" + vectorName ); if (!inFile) { cout << "Unable to open file" << endl; return; } int i = 0; while ( inFile >> x ) { vector_host[i] = x; i++; } inFile.close(); } No newline at end of file src/Benchmarks/GEM/gem-CPU/GEM.h→src/Benchmarks/GEM/gem/GEM.h +52 −0 Original line number Diff line number Diff line Loading @@ -9,26 +9,33 @@ #include <ostream> template < typename Real = double, typename Device = TNL::Devices::Host, typename Index = int > template < typename Device > class GEMDeviceDependentCode; template < typename Real, typename Device, typename Index > class GEM { public: typedef Device DeviceType; typedef Matrix< Real, Device, Index > MatrixGEM; typedef TNL::Containers::Array< Real, Device, Index > Array; typedef TNL::Containers::Vector< Real, Device, Index > Array; GEM( MatrixGEM& A, Array& b ); bool solve( Array& x, int verbose = 0 ); bool solve( Array& x, const TNL::String& pivoting, int verbose = 0 ); bool solveWithoutPivoting( Array& x, int verbose = 0 ); bool solveWithPivoting( Array& x, int verbose = 0 ); bool computeLUDecomposition( int verbose = 0 ); bool GEMdevice( Array& x, const TNL::String& pivoting, int verbose ); protected: void print( std::ostream& str = std::cout ) const; Loading @@ -36,6 +43,9 @@ class GEM MatrixGEM A; Array b; typedef GEMDeviceDependentCode< DeviceType > DeviceDependentCode; friend class GEMDeviceDependentCode< DeviceType >; }; #include "GEM_impl.h" Loading src/Benchmarks/GEM/gem-CPU/GEM_impl.h→src/Benchmarks/GEM/gem/GEM_impl.h +44 −29 Original line number Diff line number Diff line Loading @@ -13,6 +13,8 @@ #pragma once #include <TNL/Assert.h> #include "GEMdevice.h" #include "GEM.h" #include <fstream> template< typename Real, Loading @@ -22,11 +24,20 @@ GEM< Real, Device, Index >::GEM( MatrixGEM& A, Array& b ) : A(A), b(b) {} template< typename Real, typename Device, typename Index > bool GEM< Real, Device, Index >::solve( Array& x, const TNL::String& pivoting, int verbose ) { printf("in solve\n"); return DeviceDependentCode::SolveGEM( *this, x, pivoting, verbose ); } template< typename Real, typename Device, typename Index > bool GEM< Real, Device, Index >::solve( Array& x, int verbose ) bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose ) { assert( b.getSize() == x.getSize() ); Loading @@ -40,6 +51,7 @@ bool GEM< Real, Device, Index >::solve( Array& x, int verbose ) /**** * Divide the k-th row by pivot */ if( verbose > 1 ) if( k % 10 == 0 ) std::cout << "Elimination: " << k << "/" << n << std::endl; const Real& pivot = this->A.getElement( k, k ); Loading Loading @@ -106,7 +118,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose ) for( int k = 0; k < n; k++ ) { if( verbose > 1 ) std::cout << "Step " << k << "/" << n << ".... \r"; std::cout << "Step " << k << "/" << n << "....\n"; //"\r"; /**** * Find the pivot - the largest in k-th row */ Loading Loading @@ -255,35 +267,38 @@ void GEM< Real, Device, Index >::print( std::ostream& str ) const str << " | " << std::setprecision( precision ) << std::setw( precision + 6 ) << b[ row ] << " |" << std::endl; } } #ifdef HAVE_CUDA /*template < typename Real, typename Index >*/ __global__ void GEMKernel( /*Matrix< Real, Devices::Cuda, Index >& matrix, Containers::Array< Real, Devices::Cuda, Index >& b, Containers::Array< Real, Devices::Cuda, Index >& x, int verbose*/ ) template <> class GEMDeviceDependentCode< TNL::Devices::Host > { public: typedef TNL::Devices::Host DeviceType; template < typename Real, typename Index> static bool SolveGEM( GEM< Real, DeviceType, Index>& gem, TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose ) { unsigned int i = threadIdx.x;// + blockDim.x * blockIdx.x; printf( "%d\n" , i ); printf("starting computation on CPU\n"); return pivoting == "yes" ? gem.solveWithPivoting( x, verbose ) : gem.solveWithoutPivoting(x, verbose ); } }; #endif template <> class GEMDeviceDependentCode< TNL::Devices::Cuda > { public: typedef TNL::Devices::Cuda DeviceType; /*template< typename Real, typename Device, typename Index > bool Matrix< Real, Device, Index >::solveCuda( Array& b, Array& x, int verbose ) template < typename Real, typename Index> static bool SolveGEM( GEM< Real, DeviceType, Index>& gem, TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose ) { #ifdef HAVE_CUDA Index numRows = this->A.getNumRows(); Index numColumns = this->A.getNumColumns(); Index numThreads = 32; Index numBlocks = roundUpDivision( numRows, numThreads ); // TODO: num of blocks needed higher than num of blocks able to calculate Devices::Cuda::synchronizeDevice(); GEMKernel<<< 1, 1 >>>( this, b, x, verbose ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("starting computation on GPU\n"); gem.GEMdevice( x, pivoting, verbose ); #endif }*/ return true; } }; src/Benchmarks/GEM/gem-GPU/GEMdevice.h→src/Benchmarks/GEM/gem/GEMdevice.h +44 −53 Original line number Diff line number Diff line #include "GEMkernels.h" #include "TNL/Cuda/MemoryHelpers.h" #define DEBUG 0 #ifdef HAVE_CUDA #include "GEMkernels.h" template < typename Real, typename Index > void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev, void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrix, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& result_vector_dev ) TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& x ) { Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrixDev); Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrix); int blockSize = matrixDev.getNumRows() > 1024 ? 1024 : matrixDev.getNumColumns(); int numBlocksOnColumn = TNL::roundUpDivision( matrixDev.getNumRows(), 1024 ); int numOfBlocks = matrixDev.getNumRows() * numBlocksOnColumn; int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumColumns(); int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 1024 ); int numOfBlocks = matrix.getNumRows() * numBlocksOnColumn; GEMDiagToResult<<< numOfBlocks, blockSize >>>( devMat,device_vector.getView(),result_vector_dev.getView() ); GEMDiagToResult<<< numOfBlocks, blockSize >>>( devMat,device_vector.getView(), x.getView() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } Loading @@ -27,28 +27,27 @@ void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev template < typename Real, typename Device, typename Index > void GEMdevice( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& result_vector_dev ) bool GEM<Real, Device, Index >::GEMdevice( /*Matrix< Real, TNL::Devices::Cuda, Index >& this->A, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, */ Array& x, const TNL::String& pivoting, int verbose ) { Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrixDev); Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( this->A ); TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector( this->b ); // FOR PIVOTING SET VARIABLES ON DEVICE size_t size = sizeof(int); int* pivot; cudaMalloc(&pivot, size); for( int colPointer = 0; colPointer < matrixDev.getNumColumns(); colPointer++ ) for( int colPointer = 0; colPointer < this->A.getNumColumns(); colPointer++ ) { int reduceBlockSize = (matrixDev.getNumColumns()-colPointer) > 512 ? 512 : TNL::roundToMultiple( matrixDev.getNumColumns()-colPointer, 32 ); int reduceGridSize = TNL::roundUpDivision( matrixDev.getNumColumns()-colPointer, reduceBlockSize ); // PIVOTING int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 512 ? 512 : TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 ); int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize ); int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 ); TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSizeRound); Loading @@ -68,63 +67,55 @@ void GEMdevice( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev, //printf("%d: position of pivot: %d\n", colPointer, *pom); int blockSize = (matrixDev.getNumColumns()-colPointer) > 1024 ? 1024 : matrixDev.getNumColumns(); int numBlocksOnRow = TNL::roundUpDivision( (matrixDev.getNumColumns()-colPointer), 1024 ); int numOfBlocks = matrixDev.getNumRows() * numBlocksOnRow; int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer; int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 ); int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; //printf( "%d number of threads, %d number of blocks\n", blockSize, numOfBlocks); if( *pom != -1 && *pom != colPointer ) { swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot ); } /*showMatrix<<< 1, 1 >>>( matrixDev ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE;*/ TNL_CHECK_CUDA_DEVICE; } /*int* pom = (int*)malloc(size); *pom = 0; cudaMemcpy(pivot, pom, size, cudaMemcpyHostToDevice); findPivot<<< numBlocksOnRow, 1024 >>>( devMat, colPointer, numBlocksOnRow ); /*printf("\n"); showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n");*/ findRowPivot<<< numBlocksOnRow, blockSize >>>( devMat, colPointer, numBlocksOnRow, d_max, pivot ); GEMColumnUnderDiag<<< numOfBlocks, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaMemcpy( pom, pivot, size, cudaMemcpyDeviceToHost); //printf("%d: position of pivot: %d\n", colPointer, *pom); if( *pom != -1 && *pom != colPointer ) { swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot ); }*/ /*cudaMemcpy( pom, d_max, size, cudaMemcpyDeviceToHost); printf("%d\n", *pom );*/ GEMColumnUnderDiag<<< numOfBlocks, blockSize >>>( devMat, /*int blockSize1 = this->A.getNumRows() > 1024 ? 1024 : this->A.getNumRows(); int gridSize1 = TNL::roundUpDivision( this->A.getNumRows(), blockSize1 ); GEMZeroingMainColumn<<< gridSize1, blockSize1 >>>(devMat, device_vector.getView(), colPointer, numBlocksOnRow ); colPointer ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; /*printf("\n"); showMatrix<<< 1, 1 >>>( matrixDev ); printf("\n"); showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n");*/ printf("\n"); std::cout << device_vector << std::endl;*/ } cudaFree(pivot); //std::cout << device_vector << std::endl; calculateResultSeqCPU( matrixDev, device_vector, result_vector_dev ); calculateResultSeqCPU( this->A, device_vector, x ); return true; } #endif // HAVE_CUDA No newline at end of file Loading
src/Benchmarks/GEM/Matrix/Matrix_impl.h +1 −1 Original line number Diff line number Diff line Loading @@ -161,7 +161,7 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix matrix.getData( pom ); if( std::is_same< Device, Device2 >::value ) { printf("copy host to host or device to device\n"); //printf("copy host to host or device to device\n"); this->data = pom; } #ifdef HAVE_CUDA Loading
src/Benchmarks/GEM/gem.h 0 → 100644 +212 −0 Original line number Diff line number Diff line #include "Matrix/Matrix.h" #include "gem/GEM.h" #include "TNL/tnl-dev/src/TNL/Math.h" //#include "gem/GEMdevice.h" #include <TNL/Matrices/MatrixReader.h> #include "TNL/tnl-dev/src/TNL/Cuda/CheckDevice.h" #include "TNL/tnl-dev/src/TNL/Devices/Cuda.h" #include "TNL/tnl-dev/src/TNL/Containers/Vector.h" #include "TNL/tnl-dev/src/TNL/Cuda/MemoryHelpers.h" #include "TNL/tnl-dev/src/TNL/Devices/Host.h" #define COMPARE_RESULTS true; // Prihodit GEMdeice do GEM a dělat dělení na device až tam -> přidat parametr pivoting using namespace TNL; using namespace TNL::Containers; using namespace std; template < typename Real, typename Index > void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix, Vector< Real, Devices::Host, Index >& vector, const String& vectorName ); template < typename Real, typename Index > void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName ); template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, const String& matrixName, const String& vectorName ); template < typename Real, typename Index, typename Device > Vector< Real, Device, Index > runGEM( const String& matrixName, const String& vectorName, const int loops, const int verbose, const String& device, const String& pivoting ) { typedef Matrix< Real, Device, Index > MatrixType; typedef Vector< Real, Device, Index > VectorType; MatrixType matrix; VectorType vector; readMatrixVector( matrix, vector, matrixName, vectorName ); VectorType vectorResult( matrix.getNumRows() ); // Computation double* time; time = new double[ loops ]; std::cout << "Starting computation on " << device << endl; for( int i = 0; i < loops; i++ ) { MatrixType matrixComp = matrix; VectorType vectorComp( vector ); vectorComp.setValue( 0 ); vectorResult.setValue( 0 ); GEM< Real, Device, Index > gem( matrixComp, vectorComp ); std::clock_t start; double duration; start = std::clock(); cout << "starting computation number " << i+1 << endl; gem.solve( vectorResult, pivoting, 0 ); duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; time[i] = duration; double l2norm = 0; for( int j = 0; j < matrix.getNumRows(); j++ ) l2norm += (vectorResult.getElement( i ) - 1)*(vectorResult.getElement( i ) - 1); l2norm = std::sqrt(l2norm); printf( " %.4f ", l2norm); } printf("\n ... done!\n"); delete []time; return vectorResult; /* //#ifdef HAVE_CUDA #ifdef COMPARE_RESULTS double error = 0.0; //std::cout << "Results:\nrow:"<< setw(20) << "Host" << setw(20) << "Device" << setw(20) << "Error" << std::endl; for( int i = 0; i < matrix.getNumRows(); i++ ) { double errorPom = ( result_vector.getElement(i) - result_vector_dev.getElement(i) ) * ( result_vector.getElement(i) - result_vector_dev.getElement(i) ); //std::cout << i << ": " << setw(20) << result_vector.getElement(i) << setw(20) << result_vector_dev.getElement(i) << setw(20) << errorPom << std::endl; error += errorPom; } error = std::sqrt(error); printf("Difference in L2 norm from Device and Host is %.8f\n", error ); double CPUmean(0), GPUmean(0); cout << "Timers: " << endl << "CPU: [ "; for( int i = 0; i < loops; i++ ) { CPUmean += timeCPU[i]; GPUmean += timeGPU[i]; cout << timeCPU[i] << " "; } CPUmean = CPUmean/loops; GPUmean = GPUmean/loops; cout << "]" << endl; cout << "GPU: [ "; for( int i = 0; i < loops; i++ ) cout << timeGPU[i] << " "; cout << "]" << endl; cout << "CPU mean time: " << CPUmean << endl; cout << "GPU mean time: " << GPUmean << endl; #endif #endif delete []timeCPU; #ifdef HAVE_CUDA delete []timeGPU; #endif return;*/ } template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, const String& matrixName, const String& vectorName ) { typedef Matrix< Real, Devices::Host, Index> MatrixHost; MatrixHost matrixHost; // Get matrix Matrices::MatrixReader< MatrixHost > m; m.readMtxFile( "./test-matrices/" + matrixName, matrixHost ); cout << "reading matrix " << matrixName << endl; //matrixHost.showMatrix(); matrix = matrixHost; // Get vector Vector< Real, Devices::Host, Index > vectorHost( matrixHost.getNumRows() ); if( vectorName == "none" ) calculHostVecOne( matrixHost, vectorHost, vectorName ); else readVector( vectorHost, vectorName ); cout << "reading vector " << vectorName << endl; vector = vectorHost; } template < typename Real, typename Index > void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix, Vector< Real, Devices::Host, Index >& vector, const String& vectorName ) { for( int i = 0; i < matrix.getNumRows(); i++ ) { Real pom = 0; for( int j = 0; j < matrix.getNumColumns(); j++ ) { pom += matrix.getElement(i,j); } vector[i] = pom; } cout << endl; ofstream outdata; // outdata is like cin outdata.open("./test-matrices/" + vectorName ); // opens the file if( !outdata ) { // file couldn't be opened cerr << "Error: file could not be opened" << endl; exit(1); } for( int i = 0; i < vector.getSize(); i++ ) { outdata << vector[i] << endl; } outdata.close(); cout << endl; } template < typename Real, typename Index > void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName ) { ifstream inFile; Real x; inFile.open("./test-matrices/" + vectorName ); if (!inFile) { cout << "Unable to open file" << endl; return; } int i = 0; while ( inFile >> x ) { vector_host[i] = x; i++; } inFile.close(); } No newline at end of file
src/Benchmarks/GEM/gem-CPU/GEM.h→src/Benchmarks/GEM/gem/GEM.h +52 −0 Original line number Diff line number Diff line Loading @@ -9,26 +9,33 @@ #include <ostream> template < typename Real = double, typename Device = TNL::Devices::Host, typename Index = int > template < typename Device > class GEMDeviceDependentCode; template < typename Real, typename Device, typename Index > class GEM { public: typedef Device DeviceType; typedef Matrix< Real, Device, Index > MatrixGEM; typedef TNL::Containers::Array< Real, Device, Index > Array; typedef TNL::Containers::Vector< Real, Device, Index > Array; GEM( MatrixGEM& A, Array& b ); bool solve( Array& x, int verbose = 0 ); bool solve( Array& x, const TNL::String& pivoting, int verbose = 0 ); bool solveWithoutPivoting( Array& x, int verbose = 0 ); bool solveWithPivoting( Array& x, int verbose = 0 ); bool computeLUDecomposition( int verbose = 0 ); bool GEMdevice( Array& x, const TNL::String& pivoting, int verbose ); protected: void print( std::ostream& str = std::cout ) const; Loading @@ -36,6 +43,9 @@ class GEM MatrixGEM A; Array b; typedef GEMDeviceDependentCode< DeviceType > DeviceDependentCode; friend class GEMDeviceDependentCode< DeviceType >; }; #include "GEM_impl.h" Loading
src/Benchmarks/GEM/gem-CPU/GEM_impl.h→src/Benchmarks/GEM/gem/GEM_impl.h +44 −29 Original line number Diff line number Diff line Loading @@ -13,6 +13,8 @@ #pragma once #include <TNL/Assert.h> #include "GEMdevice.h" #include "GEM.h" #include <fstream> template< typename Real, Loading @@ -22,11 +24,20 @@ GEM< Real, Device, Index >::GEM( MatrixGEM& A, Array& b ) : A(A), b(b) {} template< typename Real, typename Device, typename Index > bool GEM< Real, Device, Index >::solve( Array& x, const TNL::String& pivoting, int verbose ) { printf("in solve\n"); return DeviceDependentCode::SolveGEM( *this, x, pivoting, verbose ); } template< typename Real, typename Device, typename Index > bool GEM< Real, Device, Index >::solve( Array& x, int verbose ) bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose ) { assert( b.getSize() == x.getSize() ); Loading @@ -40,6 +51,7 @@ bool GEM< Real, Device, Index >::solve( Array& x, int verbose ) /**** * Divide the k-th row by pivot */ if( verbose > 1 ) if( k % 10 == 0 ) std::cout << "Elimination: " << k << "/" << n << std::endl; const Real& pivot = this->A.getElement( k, k ); Loading Loading @@ -106,7 +118,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose ) for( int k = 0; k < n; k++ ) { if( verbose > 1 ) std::cout << "Step " << k << "/" << n << ".... \r"; std::cout << "Step " << k << "/" << n << "....\n"; //"\r"; /**** * Find the pivot - the largest in k-th row */ Loading Loading @@ -255,35 +267,38 @@ void GEM< Real, Device, Index >::print( std::ostream& str ) const str << " | " << std::setprecision( precision ) << std::setw( precision + 6 ) << b[ row ] << " |" << std::endl; } } #ifdef HAVE_CUDA /*template < typename Real, typename Index >*/ __global__ void GEMKernel( /*Matrix< Real, Devices::Cuda, Index >& matrix, Containers::Array< Real, Devices::Cuda, Index >& b, Containers::Array< Real, Devices::Cuda, Index >& x, int verbose*/ ) template <> class GEMDeviceDependentCode< TNL::Devices::Host > { public: typedef TNL::Devices::Host DeviceType; template < typename Real, typename Index> static bool SolveGEM( GEM< Real, DeviceType, Index>& gem, TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose ) { unsigned int i = threadIdx.x;// + blockDim.x * blockIdx.x; printf( "%d\n" , i ); printf("starting computation on CPU\n"); return pivoting == "yes" ? gem.solveWithPivoting( x, verbose ) : gem.solveWithoutPivoting(x, verbose ); } }; #endif template <> class GEMDeviceDependentCode< TNL::Devices::Cuda > { public: typedef TNL::Devices::Cuda DeviceType; /*template< typename Real, typename Device, typename Index > bool Matrix< Real, Device, Index >::solveCuda( Array& b, Array& x, int verbose ) template < typename Real, typename Index> static bool SolveGEM( GEM< Real, DeviceType, Index>& gem, TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose ) { #ifdef HAVE_CUDA Index numRows = this->A.getNumRows(); Index numColumns = this->A.getNumColumns(); Index numThreads = 32; Index numBlocks = roundUpDivision( numRows, numThreads ); // TODO: num of blocks needed higher than num of blocks able to calculate Devices::Cuda::synchronizeDevice(); GEMKernel<<< 1, 1 >>>( this, b, x, verbose ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("starting computation on GPU\n"); gem.GEMdevice( x, pivoting, verbose ); #endif }*/ return true; } };
src/Benchmarks/GEM/gem-GPU/GEMdevice.h→src/Benchmarks/GEM/gem/GEMdevice.h +44 −53 Original line number Diff line number Diff line #include "GEMkernels.h" #include "TNL/Cuda/MemoryHelpers.h" #define DEBUG 0 #ifdef HAVE_CUDA #include "GEMkernels.h" template < typename Real, typename Index > void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev, void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrix, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& result_vector_dev ) TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& x ) { Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrixDev); Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrix); int blockSize = matrixDev.getNumRows() > 1024 ? 1024 : matrixDev.getNumColumns(); int numBlocksOnColumn = TNL::roundUpDivision( matrixDev.getNumRows(), 1024 ); int numOfBlocks = matrixDev.getNumRows() * numBlocksOnColumn; int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumColumns(); int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 1024 ); int numOfBlocks = matrix.getNumRows() * numBlocksOnColumn; GEMDiagToResult<<< numOfBlocks, blockSize >>>( devMat,device_vector.getView(),result_vector_dev.getView() ); GEMDiagToResult<<< numOfBlocks, blockSize >>>( devMat,device_vector.getView(), x.getView() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } Loading @@ -27,28 +27,27 @@ void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev template < typename Real, typename Device, typename Index > void GEMdevice( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& result_vector_dev ) bool GEM<Real, Device, Index >::GEMdevice( /*Matrix< Real, TNL::Devices::Cuda, Index >& this->A, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, */ Array& x, const TNL::String& pivoting, int verbose ) { Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrixDev); Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( this->A ); TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector( this->b ); // FOR PIVOTING SET VARIABLES ON DEVICE size_t size = sizeof(int); int* pivot; cudaMalloc(&pivot, size); for( int colPointer = 0; colPointer < matrixDev.getNumColumns(); colPointer++ ) for( int colPointer = 0; colPointer < this->A.getNumColumns(); colPointer++ ) { int reduceBlockSize = (matrixDev.getNumColumns()-colPointer) > 512 ? 512 : TNL::roundToMultiple( matrixDev.getNumColumns()-colPointer, 32 ); int reduceGridSize = TNL::roundUpDivision( matrixDev.getNumColumns()-colPointer, reduceBlockSize ); // PIVOTING int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 512 ? 512 : TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 ); int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize ); int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 ); TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSizeRound); Loading @@ -68,63 +67,55 @@ void GEMdevice( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev, //printf("%d: position of pivot: %d\n", colPointer, *pom); int blockSize = (matrixDev.getNumColumns()-colPointer) > 1024 ? 1024 : matrixDev.getNumColumns(); int numBlocksOnRow = TNL::roundUpDivision( (matrixDev.getNumColumns()-colPointer), 1024 ); int numOfBlocks = matrixDev.getNumRows() * numBlocksOnRow; int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer; int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 ); int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; //printf( "%d number of threads, %d number of blocks\n", blockSize, numOfBlocks); if( *pom != -1 && *pom != colPointer ) { swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot ); } /*showMatrix<<< 1, 1 >>>( matrixDev ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE;*/ TNL_CHECK_CUDA_DEVICE; } /*int* pom = (int*)malloc(size); *pom = 0; cudaMemcpy(pivot, pom, size, cudaMemcpyHostToDevice); findPivot<<< numBlocksOnRow, 1024 >>>( devMat, colPointer, numBlocksOnRow ); /*printf("\n"); showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n");*/ findRowPivot<<< numBlocksOnRow, blockSize >>>( devMat, colPointer, numBlocksOnRow, d_max, pivot ); GEMColumnUnderDiag<<< numOfBlocks, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaMemcpy( pom, pivot, size, cudaMemcpyDeviceToHost); //printf("%d: position of pivot: %d\n", colPointer, *pom); if( *pom != -1 && *pom != colPointer ) { swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot ); }*/ /*cudaMemcpy( pom, d_max, size, cudaMemcpyDeviceToHost); printf("%d\n", *pom );*/ GEMColumnUnderDiag<<< numOfBlocks, blockSize >>>( devMat, /*int blockSize1 = this->A.getNumRows() > 1024 ? 1024 : this->A.getNumRows(); int gridSize1 = TNL::roundUpDivision( this->A.getNumRows(), blockSize1 ); GEMZeroingMainColumn<<< gridSize1, blockSize1 >>>(devMat, device_vector.getView(), colPointer, numBlocksOnRow ); colPointer ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; /*printf("\n"); showMatrix<<< 1, 1 >>>( matrixDev ); printf("\n"); showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n");*/ printf("\n"); std::cout << device_vector << std::endl;*/ } cudaFree(pivot); //std::cout << device_vector << std::endl; calculateResultSeqCPU( matrixDev, device_vector, result_vector_dev ); calculateResultSeqCPU( this->A, device_vector, x ); return true; } #endif // HAVE_CUDA No newline at end of file