Loading src/Benchmarks/GEM/Matrix/Matrix.h +12 −0 Original line number Diff line number Diff line Loading @@ -37,6 +37,7 @@ class Matrix void setDimensions( const Index rows, const Index columns ); void setCompressedRowLengths( CompressedRowLengthsVector& rowLengths ){ rowLengths.setValue(this->getNumberOfMatrixElements());} IndexType getNumberOfMatrixElements(){return this->getNumRows() * this->getNumColumns();} /** * Classic setElement method. Loading Loading @@ -85,6 +86,17 @@ class Matrix __cuda_callable__ Index getNumColumns() const{return this->columns;} Index getNumNonzeros() { Index nonzeros = 0; Matrix< Real, TNL::Devices::Host, Index> m; m = *this; for( int i = 0; i < m.getNumRows(); i++ ) for( int j = 0; j < m.getNumColumns(); j++ ) if( m.getElement(i,j) != 0 ) nonzeros++; return nonzeros; } /** * Function for switching 2 rows in matrix from column further. E.g. in matrix 5 x 5 * switch rows row1 = 1 and row2 = 2 from column = 1 to columns = 5. Loading src/Benchmarks/GEM/gem.h +44 −68 Original line number Diff line number Diff line Loading @@ -2,7 +2,7 @@ #include "Matrix/Matrix.h" #include "gem/GEM.h" #include "TNL/tnl-dev/src/TNL/Math.h" //#include "gem/GEMdevice.h" #include <typeinfo> // type printf #include <TNL/Matrices/MatrixReader.h> #include "TNL/tnl-dev/src/TNL/Cuda/CheckDevice.h" Loading Loading @@ -31,7 +31,7 @@ void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, const String& matrixName, const String& vectorName ); const String& matrixName, const String& vectorName, const int verbose ); template < typename Real, Loading @@ -46,19 +46,20 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve MatrixType matrix; VectorType vector; readMatrixVector( matrix, vector, matrixName, vectorName ); readMatrixVector( matrix, vector, matrixName, vectorName, verbose ); VectorType vectorResult( matrix.getNumRows() ); // Computation double* time; time = new double[ loops ]; double error = -1; std::cout << "Starting computation on " << device << endl; if( verbose > 1 ) cout << "Starting computation on " << device << endl; for( int i = 0; i < loops; i++ ) { MatrixType matrixComp = matrix; VectorType vectorComp( vector ); vectorComp.setValue( 0 ); vectorResult.setValue( 0 ); GEM< Real, Device, Index > gem( matrixComp, vectorComp ); Loading @@ -68,85 +69,56 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve start = std::clock(); if( verbose > 1 ) cout << "starting computation number " << i+1 << endl; gem.solve( vectorResult, pivoting, 0 ); gem.solve( vectorResult, pivoting, verbose ); duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; time[i] = duration; if( vectorName == "none" ) { double l2norm = 0; for( int j = 0; j < matrix.getNumRows(); j++ ) l2norm += (vectorResult.getElement( i ) - 1)*(vectorResult.getElement( i ) - 1); for( int j = 0; j < vectorResult.getSize(); j++ ) l2norm += (vectorResult.getElement( j ) - 1)*(vectorResult.getElement( j ) - 1); l2norm = std::sqrt(l2norm); printf( " %.4f ", l2norm); error = l2norm; if( verbose > 1 ) printf( "Norm in %d calculation is %.4f \n", i+1, l2norm); } } if( verbose > 1 ) printf("\n ... done!\n"); delete []time; return vectorResult; /* //#ifdef HAVE_CUDA #ifdef COMPARE_RESULTS double error = 0.0; //std::cout << "Results:\nrow:"<< setw(20) << "Host" << setw(20) << "Device" << setw(20) << "Error" << std::endl; for( int i = 0; i < matrix.getNumRows(); i++ ) { double errorPom = ( result_vector.getElement(i) - result_vector_dev.getElement(i) ) * ( result_vector.getElement(i) - result_vector_dev.getElement(i) ); //std::cout << i << ": " << setw(20) << result_vector.getElement(i) << setw(20) << result_vector_dev.getElement(i) << setw(20) << errorPom << std::endl; error += errorPom; } double timeMean = 0; for( int i = 0; i < loops; i++ ) timeMean += time[i]; timeMean /= loops; error = std::sqrt(error); printf("Difference in L2 norm from Device and Host is %.8f\n", error ); printf("%20s %15d %15d %20s %15s %15s %10d %15.5f %15.5f\n", matrixName.c_str(), matrix.getNumRows(), matrix.getNumNonzeros(), vectorName == "none" ? "-":vectorName.c_str(), device.c_str(), typeid(Real).name() == (string)"f" ? "float":"double", loops, timeMean, error); double CPUmean(0), GPUmean(0); cout << "Timers: " << endl << "CPU: [ "; for( int i = 0; i < loops; i++ ) { CPUmean += timeCPU[i]; GPUmean += timeGPU[i]; cout << timeCPU[i] << " "; } CPUmean = CPUmean/loops; GPUmean = GPUmean/loops; delete []time; cout << "]" << endl; cout << "GPU: [ "; for( int i = 0; i < loops; i++ ) cout << timeGPU[i] << " "; cout << "]" << endl; cout << "CPU mean time: " << CPUmean << endl; cout << "GPU mean time: " << GPUmean << endl; #endif #endif delete []timeCPU; #ifdef HAVE_CUDA delete []timeGPU; #endif return;*/ return vectorResult; } template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, const String& matrixName, const String& vectorName ) const String& matrixName, const String& vectorName, const int verbose ) { typedef Matrix< Real, Devices::Host, Index> MatrixHost; MatrixHost matrixHost; // Get matrix Matrices::MatrixReader< MatrixHost > m; m.readMtxFile( "./test-matrices/" + matrixName, matrixHost ); if( verbose > 1 ) cout << "reading matrix " << matrixName << endl; //matrixHost.showMatrix(); if( verbose > 2 ) matrixHost.showMatrix(); matrix = matrixHost; // Get vector Loading @@ -155,8 +127,13 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix, if( vectorName == "none" ) calculHostVecOne( matrixHost, vectorHost, vectorName ); else { readVector( vectorHost, vectorName ); if( verbose > 1 ) cout << "reading vector " << vectorName << endl; } if( verbose > 2 ) cout << vectorHost << endl; vector = vectorHost; } Loading @@ -174,9 +151,8 @@ void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix, } vector[i] = pom; } cout << endl; ofstream outdata; // outdata is like cin /*ofstream outdata; // outdata is like cin outdata.open("./test-matrices/" + vectorName ); // opens the file if( !outdata ) { // file couldn't be opened Loading @@ -189,7 +165,7 @@ void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix, outdata << vector[i] << endl; } outdata.close(); cout << endl; cout << endl;*/ } template < typename Real, typename Index > Loading src/Benchmarks/GEM/gem/GEM_impl.h +13 −12 Original line number Diff line number Diff line Loading @@ -29,7 +29,6 @@ template< typename Real, typename Index > bool GEM< Real, Device, Index >::solve( Array& x, const TNL::String& pivoting, int verbose ) { printf("in solve\n"); return DeviceDependentCode::SolveGEM( *this, x, pivoting, verbose ); } Loading @@ -43,7 +42,7 @@ bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose ) const int n = this->A.getNumRows(); if( verbose > 1 ) if( verbose > 2 ) this->print(); for( int k = 0; k < n; k++ ) Loading @@ -65,7 +64,7 @@ bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose ) this->A.setElement( k, j, this->A.getElement( k, j )/pivot ); this->A.setElement( k, k, 1.0 ); if( verbose > 1 ) if( verbose > 2 ) { std::cout << "Dividing by the pivot ... " << std::endl; this->print(); Loading @@ -82,7 +81,7 @@ bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose ) this->A.setElement( i, k, 0.0 ); } if( verbose > 1 ) if( verbose > 2 ) { std::cout << "Subtracting the " << k << "-th row from the rows bellow ... " << std::endl; this->print(); Loading Loading @@ -112,13 +111,13 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose ) const int n = this->A.getNumRows(); if( verbose > 1 ) if( verbose > 2 ) this->print(); for( int k = 0; k < n; k++ ) { if( verbose > 1 ) std::cout << "Step " << k << "/" << n << "....\n"; //"\r"; std::cout << "Step " << k << "/" << n << "....\n"; /**** * Find the pivot - the largest in k-th row */ Loading @@ -142,11 +141,11 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose ) b[ pivotPosition ] = pom; } if( verbose > 0 ) if( verbose > 1 ) { std::cout << std::endl; std::cout << "Choosing element at " << pivotPosition << "-th row as pivot..." << std::endl; std::cout << "Swaping " << k << "-th and " << pivotPosition << "-th rows ... " << std::endl; std::cout << "Swapping " << k << "-th and " << pivotPosition << "-th rows ... " << std::endl; } /**** Loading @@ -163,7 +162,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose ) this->A.setElement( k, j, this->A.getElement( k, j)/pivot ); this->A.setElement( k, k, 1.0 ); if( verbose > 1 ) if( verbose > 2 ) { std::cout << "Dividing by the pivot ... " << std::endl; this->print(); Loading @@ -180,7 +179,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose ) this->A.setElement( i, k , 0.0 ); } if( verbose > 1 ) if( verbose > 2 ) { std::cout << "Subtracting the " << k << "-th row from the rows bellow ... " << std::endl; this->print(); Loading Loading @@ -278,7 +277,8 @@ class GEMDeviceDependentCode< TNL::Devices::Host > static bool SolveGEM( GEM< Real, DeviceType, Index>& gem, TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose ) { printf("starting computation on CPU\n"); if( verbose > 1 ) printf("Starting the computation SolveGEM.\n"); return pivoting == "yes" ? gem.solveWithPivoting( x, verbose ) : gem.solveWithoutPivoting(x, verbose ); } }; Loading @@ -295,7 +295,8 @@ class GEMDeviceDependentCode< TNL::Devices::Cuda > TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose ) { #ifdef HAVE_CUDA printf("starting computation on GPU\n"); if( verbose > 1 ) printf("Starting the computation SolveGEM.\n"); gem.GEMdevice( x, pivoting, verbose ); #endif return true; Loading src/Benchmarks/GEM/gem/GEMdevice.h +50 −56 Original line number Diff line number Diff line Loading @@ -8,7 +8,7 @@ template < typename Real, typename Index > void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrix, void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& x ) { Loading @@ -30,20 +30,32 @@ void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrix, template < typename Real, typename Device, typename Index > bool GEM<Real, Device, Index >::GEMdevice( /*Matrix< Real, TNL::Devices::Cuda, Index >& this->A, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, */ Array& x, const TNL::String& pivoting, int verbose ) bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting, int verbose ) { Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( this->A ); TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector( this->b ); // FOR PIVOTING SET VARIABLES ON DEVICE size_t size = sizeof(int); int* pivot; cudaMalloc(&pivot, size); int* pom = (int*)malloc(size); *pom = -1; if( verbose > 2 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n"); } for( int colPointer = 0; colPointer < this->A.getNumColumns(); colPointer++ ) { if( verbose > 1 ) printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() ); if( pivoting == "yes" ) { // PIVOTING int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 512 ? 512 : TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 ); Loading @@ -61,59 +73,41 @@ bool GEM<Real, Device, Index >::GEMdevice( /*Matrix< Real, TNL::Devices::Cuda, I findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; int* pom = (int*)malloc(size); *pom = 0; *pom = 0; cudaMemcpy( pom, pivot, size, cudaMemcpyDeviceToHost); //printf("%d: position of pivot: %d\n", colPointer, *pom); } int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer; int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 ); int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; //printf( "%d number of threads, %d number of blocks\n", blockSize, numOfBlocks); if( *pom != -1 && *pom != colPointer ) if( pivoting == "yes" && *pom != -1 && *pom != colPointer ) { if( verbose > 1 ) { std::cout << std::endl; std::cout << "Choosing element at " << *pom << "-th row as pivot..." << std::endl; std::cout << "Swapping " << colPointer << "-th and " << *pom << "-th rows ... " << std::endl; } swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } /*printf("\n"); showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n");*/ GEMColumnUnderDiag<<< numOfBlocks, blockSize >>>( devMat, GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; /*int blockSize1 = this->A.getNumRows() > 1024 ? 1024 : this->A.getNumRows(); int gridSize1 = TNL::roundUpDivision( this->A.getNumRows(), blockSize1 ); GEMZeroingMainColumn<<< gridSize1, blockSize1 >>>(devMat, device_vector.getView(), colPointer ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n"); showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n"); std::cout << device_vector << std::endl;*/ } cudaFree(pivot); //std::cout << device_vector << std::endl; free(pom); calculateResultSeqCPU( this->A, device_vector, x ); calculResultVector( this->A, device_vector, x ); return true; } Loading src/Benchmarks/GEM/gem/GEMkernels.h +1 −23 Original line number Diff line number Diff line Loading @@ -6,22 +6,18 @@ template <typename Real > __inline__ __device__ void warpReduceArgMax(Real& val, int& index) { //printf( "index = %d\n", index ); __syncthreads(); for (int offset = 32/2; offset > 0; offset /= 2) { Real val1 = __shfl_down_sync( 0xffffffff, val, offset, 32); int index1 = __shfl_down_sync( 0xffffffff, index, offset, 32); __syncthreads(); //printf("%d: firstElementInRow = %.4f, %d: val1 = %.4f\n", index, val, index1, val1 ); if( TNL::abs( val1 ) - TNL::abs( val ) > 0 ) { val = val1; index = index1; //printf("%d: %.4f\n", index, firstElementInRow ); } __syncthreads(); //printf("%d: firstElementInRow = %.4f\n", index, val ); } } Loading Loading @@ -150,32 +146,15 @@ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, } } /*template <typename Real > __global__ void GEMZeroingMainColumn( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, int colPointerMain ) { //int rowPointer = threadIdx.x + blockDim.x * blockIdx.x; //if( rowPointer < A->getNumRows() && rowPointer != colPointerMain ) // A->setElement( rowPointer, colPointerMain, 0.0 ); A->setElement( colPointerMain, colPointer, A->getElement( colPointerMain, colPointer ) / A->getElement( colPointerMain, colPointerMain ) ); if( colPointer == colPointerMain+1 ) b[ colPointerMain ] /= A->getElement( colPointerMain, colPointerMain ); }*/ template <typename Real > __global__ void GEMColumnUnderDiag( Matrix< Real, TNL::Devices::Cuda, int >* A, void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, int colPointerMain, int numBlocksOnRow ) { int rowPointer = blockIdx.x / numBlocksOnRow; int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; /*if( rowPointer == colPointerMain && colPointer == colPointerMain ) A->setElement( colPointerMain, colPointerMain, 1 );*/ if( colPointer > colPointerMain && colPointer < A->getNumColumns() && rowPointer != colPointerMain && rowPointer < A->getNumRows() ) { if( A->getElement( colPointerMain, colPointerMain ) != 0 ) Loading @@ -192,7 +171,6 @@ void GEMColumnUnderDiag( Matrix< Real, TNL::Devices::Cuda, int >* A, if( rowPointer != colPointerMain && threadIdx.x == 0 && blockIdx.x % numBlocksOnRow == 0 && A->getElement( colPointerMain, colPointerMain ) != 0 && A->getElement( rowPointer, colPointerMain ) != 0 ) { b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * b[ colPointerMain ] / A->getElement( colPointerMain, colPointerMain ); //A->setElement( rowPointer, colPointerMain, 0.0 ); } } Loading Loading
src/Benchmarks/GEM/Matrix/Matrix.h +12 −0 Original line number Diff line number Diff line Loading @@ -37,6 +37,7 @@ class Matrix void setDimensions( const Index rows, const Index columns ); void setCompressedRowLengths( CompressedRowLengthsVector& rowLengths ){ rowLengths.setValue(this->getNumberOfMatrixElements());} IndexType getNumberOfMatrixElements(){return this->getNumRows() * this->getNumColumns();} /** * Classic setElement method. Loading Loading @@ -85,6 +86,17 @@ class Matrix __cuda_callable__ Index getNumColumns() const{return this->columns;} Index getNumNonzeros() { Index nonzeros = 0; Matrix< Real, TNL::Devices::Host, Index> m; m = *this; for( int i = 0; i < m.getNumRows(); i++ ) for( int j = 0; j < m.getNumColumns(); j++ ) if( m.getElement(i,j) != 0 ) nonzeros++; return nonzeros; } /** * Function for switching 2 rows in matrix from column further. E.g. in matrix 5 x 5 * switch rows row1 = 1 and row2 = 2 from column = 1 to columns = 5. Loading
src/Benchmarks/GEM/gem.h +44 −68 Original line number Diff line number Diff line Loading @@ -2,7 +2,7 @@ #include "Matrix/Matrix.h" #include "gem/GEM.h" #include "TNL/tnl-dev/src/TNL/Math.h" //#include "gem/GEMdevice.h" #include <typeinfo> // type printf #include <TNL/Matrices/MatrixReader.h> #include "TNL/tnl-dev/src/TNL/Cuda/CheckDevice.h" Loading Loading @@ -31,7 +31,7 @@ void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, const String& matrixName, const String& vectorName ); const String& matrixName, const String& vectorName, const int verbose ); template < typename Real, Loading @@ -46,19 +46,20 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve MatrixType matrix; VectorType vector; readMatrixVector( matrix, vector, matrixName, vectorName ); readMatrixVector( matrix, vector, matrixName, vectorName, verbose ); VectorType vectorResult( matrix.getNumRows() ); // Computation double* time; time = new double[ loops ]; double error = -1; std::cout << "Starting computation on " << device << endl; if( verbose > 1 ) cout << "Starting computation on " << device << endl; for( int i = 0; i < loops; i++ ) { MatrixType matrixComp = matrix; VectorType vectorComp( vector ); vectorComp.setValue( 0 ); vectorResult.setValue( 0 ); GEM< Real, Device, Index > gem( matrixComp, vectorComp ); Loading @@ -68,85 +69,56 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve start = std::clock(); if( verbose > 1 ) cout << "starting computation number " << i+1 << endl; gem.solve( vectorResult, pivoting, 0 ); gem.solve( vectorResult, pivoting, verbose ); duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; time[i] = duration; if( vectorName == "none" ) { double l2norm = 0; for( int j = 0; j < matrix.getNumRows(); j++ ) l2norm += (vectorResult.getElement( i ) - 1)*(vectorResult.getElement( i ) - 1); for( int j = 0; j < vectorResult.getSize(); j++ ) l2norm += (vectorResult.getElement( j ) - 1)*(vectorResult.getElement( j ) - 1); l2norm = std::sqrt(l2norm); printf( " %.4f ", l2norm); error = l2norm; if( verbose > 1 ) printf( "Norm in %d calculation is %.4f \n", i+1, l2norm); } } if( verbose > 1 ) printf("\n ... done!\n"); delete []time; return vectorResult; /* //#ifdef HAVE_CUDA #ifdef COMPARE_RESULTS double error = 0.0; //std::cout << "Results:\nrow:"<< setw(20) << "Host" << setw(20) << "Device" << setw(20) << "Error" << std::endl; for( int i = 0; i < matrix.getNumRows(); i++ ) { double errorPom = ( result_vector.getElement(i) - result_vector_dev.getElement(i) ) * ( result_vector.getElement(i) - result_vector_dev.getElement(i) ); //std::cout << i << ": " << setw(20) << result_vector.getElement(i) << setw(20) << result_vector_dev.getElement(i) << setw(20) << errorPom << std::endl; error += errorPom; } double timeMean = 0; for( int i = 0; i < loops; i++ ) timeMean += time[i]; timeMean /= loops; error = std::sqrt(error); printf("Difference in L2 norm from Device and Host is %.8f\n", error ); printf("%20s %15d %15d %20s %15s %15s %10d %15.5f %15.5f\n", matrixName.c_str(), matrix.getNumRows(), matrix.getNumNonzeros(), vectorName == "none" ? "-":vectorName.c_str(), device.c_str(), typeid(Real).name() == (string)"f" ? "float":"double", loops, timeMean, error); double CPUmean(0), GPUmean(0); cout << "Timers: " << endl << "CPU: [ "; for( int i = 0; i < loops; i++ ) { CPUmean += timeCPU[i]; GPUmean += timeGPU[i]; cout << timeCPU[i] << " "; } CPUmean = CPUmean/loops; GPUmean = GPUmean/loops; delete []time; cout << "]" << endl; cout << "GPU: [ "; for( int i = 0; i < loops; i++ ) cout << timeGPU[i] << " "; cout << "]" << endl; cout << "CPU mean time: " << CPUmean << endl; cout << "GPU mean time: " << GPUmean << endl; #endif #endif delete []timeCPU; #ifdef HAVE_CUDA delete []timeGPU; #endif return;*/ return vectorResult; } template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, const String& matrixName, const String& vectorName ) const String& matrixName, const String& vectorName, const int verbose ) { typedef Matrix< Real, Devices::Host, Index> MatrixHost; MatrixHost matrixHost; // Get matrix Matrices::MatrixReader< MatrixHost > m; m.readMtxFile( "./test-matrices/" + matrixName, matrixHost ); if( verbose > 1 ) cout << "reading matrix " << matrixName << endl; //matrixHost.showMatrix(); if( verbose > 2 ) matrixHost.showMatrix(); matrix = matrixHost; // Get vector Loading @@ -155,8 +127,13 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix, if( vectorName == "none" ) calculHostVecOne( matrixHost, vectorHost, vectorName ); else { readVector( vectorHost, vectorName ); if( verbose > 1 ) cout << "reading vector " << vectorName << endl; } if( verbose > 2 ) cout << vectorHost << endl; vector = vectorHost; } Loading @@ -174,9 +151,8 @@ void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix, } vector[i] = pom; } cout << endl; ofstream outdata; // outdata is like cin /*ofstream outdata; // outdata is like cin outdata.open("./test-matrices/" + vectorName ); // opens the file if( !outdata ) { // file couldn't be opened Loading @@ -189,7 +165,7 @@ void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix, outdata << vector[i] << endl; } outdata.close(); cout << endl; cout << endl;*/ } template < typename Real, typename Index > Loading
src/Benchmarks/GEM/gem/GEM_impl.h +13 −12 Original line number Diff line number Diff line Loading @@ -29,7 +29,6 @@ template< typename Real, typename Index > bool GEM< Real, Device, Index >::solve( Array& x, const TNL::String& pivoting, int verbose ) { printf("in solve\n"); return DeviceDependentCode::SolveGEM( *this, x, pivoting, verbose ); } Loading @@ -43,7 +42,7 @@ bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose ) const int n = this->A.getNumRows(); if( verbose > 1 ) if( verbose > 2 ) this->print(); for( int k = 0; k < n; k++ ) Loading @@ -65,7 +64,7 @@ bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose ) this->A.setElement( k, j, this->A.getElement( k, j )/pivot ); this->A.setElement( k, k, 1.0 ); if( verbose > 1 ) if( verbose > 2 ) { std::cout << "Dividing by the pivot ... " << std::endl; this->print(); Loading @@ -82,7 +81,7 @@ bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose ) this->A.setElement( i, k, 0.0 ); } if( verbose > 1 ) if( verbose > 2 ) { std::cout << "Subtracting the " << k << "-th row from the rows bellow ... " << std::endl; this->print(); Loading Loading @@ -112,13 +111,13 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose ) const int n = this->A.getNumRows(); if( verbose > 1 ) if( verbose > 2 ) this->print(); for( int k = 0; k < n; k++ ) { if( verbose > 1 ) std::cout << "Step " << k << "/" << n << "....\n"; //"\r"; std::cout << "Step " << k << "/" << n << "....\n"; /**** * Find the pivot - the largest in k-th row */ Loading @@ -142,11 +141,11 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose ) b[ pivotPosition ] = pom; } if( verbose > 0 ) if( verbose > 1 ) { std::cout << std::endl; std::cout << "Choosing element at " << pivotPosition << "-th row as pivot..." << std::endl; std::cout << "Swaping " << k << "-th and " << pivotPosition << "-th rows ... " << std::endl; std::cout << "Swapping " << k << "-th and " << pivotPosition << "-th rows ... " << std::endl; } /**** Loading @@ -163,7 +162,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose ) this->A.setElement( k, j, this->A.getElement( k, j)/pivot ); this->A.setElement( k, k, 1.0 ); if( verbose > 1 ) if( verbose > 2 ) { std::cout << "Dividing by the pivot ... " << std::endl; this->print(); Loading @@ -180,7 +179,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose ) this->A.setElement( i, k , 0.0 ); } if( verbose > 1 ) if( verbose > 2 ) { std::cout << "Subtracting the " << k << "-th row from the rows bellow ... " << std::endl; this->print(); Loading Loading @@ -278,7 +277,8 @@ class GEMDeviceDependentCode< TNL::Devices::Host > static bool SolveGEM( GEM< Real, DeviceType, Index>& gem, TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose ) { printf("starting computation on CPU\n"); if( verbose > 1 ) printf("Starting the computation SolveGEM.\n"); return pivoting == "yes" ? gem.solveWithPivoting( x, verbose ) : gem.solveWithoutPivoting(x, verbose ); } }; Loading @@ -295,7 +295,8 @@ class GEMDeviceDependentCode< TNL::Devices::Cuda > TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose ) { #ifdef HAVE_CUDA printf("starting computation on GPU\n"); if( verbose > 1 ) printf("Starting the computation SolveGEM.\n"); gem.GEMdevice( x, pivoting, verbose ); #endif return true; Loading
src/Benchmarks/GEM/gem/GEMdevice.h +50 −56 Original line number Diff line number Diff line Loading @@ -8,7 +8,7 @@ template < typename Real, typename Index > void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrix, void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& x ) { Loading @@ -30,20 +30,32 @@ void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrix, template < typename Real, typename Device, typename Index > bool GEM<Real, Device, Index >::GEMdevice( /*Matrix< Real, TNL::Devices::Cuda, Index >& this->A, TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, */ Array& x, const TNL::String& pivoting, int verbose ) bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting, int verbose ) { Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( this->A ); TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector( this->b ); // FOR PIVOTING SET VARIABLES ON DEVICE size_t size = sizeof(int); int* pivot; cudaMalloc(&pivot, size); int* pom = (int*)malloc(size); *pom = -1; if( verbose > 2 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n"); } for( int colPointer = 0; colPointer < this->A.getNumColumns(); colPointer++ ) { if( verbose > 1 ) printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() ); if( pivoting == "yes" ) { // PIVOTING int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 512 ? 512 : TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 ); Loading @@ -61,59 +73,41 @@ bool GEM<Real, Device, Index >::GEMdevice( /*Matrix< Real, TNL::Devices::Cuda, I findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; int* pom = (int*)malloc(size); *pom = 0; *pom = 0; cudaMemcpy( pom, pivot, size, cudaMemcpyDeviceToHost); //printf("%d: position of pivot: %d\n", colPointer, *pom); } int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer; int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 ); int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; //printf( "%d number of threads, %d number of blocks\n", blockSize, numOfBlocks); if( *pom != -1 && *pom != colPointer ) if( pivoting == "yes" && *pom != -1 && *pom != colPointer ) { if( verbose > 1 ) { std::cout << std::endl; std::cout << "Choosing element at " << *pom << "-th row as pivot..." << std::endl; std::cout << "Swapping " << colPointer << "-th and " << *pom << "-th rows ... " << std::endl; } swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } /*printf("\n"); showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n");*/ GEMColumnUnderDiag<<< numOfBlocks, blockSize >>>( devMat, GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; /*int blockSize1 = this->A.getNumRows() > 1024 ? 1024 : this->A.getNumRows(); int gridSize1 = TNL::roundUpDivision( this->A.getNumRows(), blockSize1 ); GEMZeroingMainColumn<<< gridSize1, blockSize1 >>>(devMat, device_vector.getView(), colPointer ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n"); showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n"); std::cout << device_vector << std::endl;*/ } cudaFree(pivot); //std::cout << device_vector << std::endl; free(pom); calculateResultSeqCPU( this->A, device_vector, x ); calculResultVector( this->A, device_vector, x ); return true; } Loading
src/Benchmarks/GEM/gem/GEMkernels.h +1 −23 Original line number Diff line number Diff line Loading @@ -6,22 +6,18 @@ template <typename Real > __inline__ __device__ void warpReduceArgMax(Real& val, int& index) { //printf( "index = %d\n", index ); __syncthreads(); for (int offset = 32/2; offset > 0; offset /= 2) { Real val1 = __shfl_down_sync( 0xffffffff, val, offset, 32); int index1 = __shfl_down_sync( 0xffffffff, index, offset, 32); __syncthreads(); //printf("%d: firstElementInRow = %.4f, %d: val1 = %.4f\n", index, val, index1, val1 ); if( TNL::abs( val1 ) - TNL::abs( val ) > 0 ) { val = val1; index = index1; //printf("%d: %.4f\n", index, firstElementInRow ); } __syncthreads(); //printf("%d: firstElementInRow = %.4f\n", index, val ); } } Loading Loading @@ -150,32 +146,15 @@ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, } } /*template <typename Real > __global__ void GEMZeroingMainColumn( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, int colPointerMain ) { //int rowPointer = threadIdx.x + blockDim.x * blockIdx.x; //if( rowPointer < A->getNumRows() && rowPointer != colPointerMain ) // A->setElement( rowPointer, colPointerMain, 0.0 ); A->setElement( colPointerMain, colPointer, A->getElement( colPointerMain, colPointer ) / A->getElement( colPointerMain, colPointerMain ) ); if( colPointer == colPointerMain+1 ) b[ colPointerMain ] /= A->getElement( colPointerMain, colPointerMain ); }*/ template <typename Real > __global__ void GEMColumnUnderDiag( Matrix< Real, TNL::Devices::Cuda, int >* A, void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, int colPointerMain, int numBlocksOnRow ) { int rowPointer = blockIdx.x / numBlocksOnRow; int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; /*if( rowPointer == colPointerMain && colPointer == colPointerMain ) A->setElement( colPointerMain, colPointerMain, 1 );*/ if( colPointer > colPointerMain && colPointer < A->getNumColumns() && rowPointer != colPointerMain && rowPointer < A->getNumRows() ) { if( A->getElement( colPointerMain, colPointerMain ) != 0 ) Loading @@ -192,7 +171,6 @@ void GEMColumnUnderDiag( Matrix< Real, TNL::Devices::Cuda, int >* A, if( rowPointer != colPointerMain && threadIdx.x == 0 && blockIdx.x % numBlocksOnRow == 0 && A->getElement( colPointerMain, colPointerMain ) != 0 && A->getElement( rowPointer, colPointerMain ) != 0 ) { b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * b[ colPointerMain ] / A->getElement( colPointerMain, colPointerMain ); //A->setElement( rowPointer, colPointerMain, 0.0 ); } } Loading