Loading src/Benchmarks/GEM/Matrix/Matrix_impl.h +1 −1 Original line number Diff line number Diff line Loading @@ -19,7 +19,7 @@ template < typename Real, typename Index > void Matrix< Real, Device, Index >::setDimensions( const Index rows, const Index columns ) { TNL_ASSERT( rows > 0 && columns > 0, std::cerr << "Matrix cannot have zero rows nor columns!"); TNL_ASSERT( rows > 0 && columns > 0, std::cerr << "Matrix cannot have zero rows nor columns!\n"); this->rows = rows; this->columns = columns; Loading src/Benchmarks/GEM/gem.h +3 −3 Original line number Diff line number Diff line Loading @@ -112,11 +112,11 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix, { typedef Matrix< Real, Devices::Host, Index> MatrixHost; MatrixHost matrixHost; // Get matrix Matrices::MatrixReader< MatrixHost > m; m.readMtxFile( "./test-matrices/" + matrixName, matrixHost ); if( verbose > 1 ) cout << "reading matrix " << matrixName << endl; // Get matrix Matrices::MatrixReader< MatrixHost > m; m.readMtxFile( "./test-matrices/" + matrixName, matrixHost, verbose ); if( verbose > 2 ) matrixHost.showMatrix(); matrix = matrixHost; Loading src/Benchmarks/GEM/gem/GEM_impl.h +1 −1 Original line number Diff line number Diff line Loading @@ -144,7 +144,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose ) if( verbose > 1 ) { std::cout << std::endl; std::cout << "Choosing element at " << pivotPosition << "-th row as pivot..." << std::endl; std::cout << "Choosing element at " << pivotPosition << "-th row as pivot with value " << this->A.getElement(k,k)<<"..." << std::endl; std::cout << "Swapping " << k << "-th and " << pivotPosition << "-th rows ... " << std::endl; } Loading src/Benchmarks/GEM/gem/GEMdevice.h +7 −6 Original line number Diff line number Diff line Loading @@ -57,14 +57,15 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting if( pivoting == "yes" ) { // PIVOTING int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 512 ? 512 : int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 ); int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize ); int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 ); //printf("%d, %d, %d\n", reduceBlockSize, reduceGridSize, reduceGridSizeRound ); TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSizeRound); TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(reduceGridSizeRound); outMax.setValue(0); outPos.setValue(0); TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSize); TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(reduceGridSize); //outMax.setValue(0); outPos.setValue(0); findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, colPointer, outMax.getView(), outPos.getView() ); cudaDeviceSynchronize(); Loading @@ -83,12 +84,12 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; if( pivoting == "yes" && *pom != -1 && *pom != colPointer ) if( pivoting == "yes" )// && *pom != -1 && *pom != colPointer ) { if( verbose > 1 ) { std::cout << std::endl; std::cout << "Choosing element at " << *pom << "-th row as pivot..." << std::endl; std::cout << "Choosing element at " << *pom << "-th row as pivot with value..." << std::endl; std::cout << "Swapping " << colPointer << "-th and " << *pom << "-th rows ... " << std::endl; } swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot ); Loading src/Benchmarks/GEM/gem/GEMkernels.h +25 −54 Original line number Diff line number Diff line Loading @@ -25,18 +25,24 @@ __inline__ __device__ void warpReduceArgMax(Real& val, int& index) { template <typename Real > __inline__ __device__ void blockReduceArgMax(Real& val, int& index) { static __shared__ Real shared[32]; // Shared mem for 32 partial sums static __shared__ Real sharedVal[32]; // Shared mem for 32 partial reduction static __shared__ Real sharedIndex[32]; // Shared mem for 32 partial reduction int lane = threadIdx.x % 32; int wid = threadIdx.x / 32; warpReduceArgMax( val, index); // Each warp performs partial reduction if (lane==0) shared[wid]=val; // Write reduced value to shared memory if (lane==0) { sharedVal[wid]=val; // Write reduced value to shared memory sharedIndex[wid]=index; // Write reduced value to shared memory } __syncthreads(); // Wait for all partial reductions //read from shared memory only if that warp existed val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; val = (threadIdx.x < blockDim.x / 32) ? sharedVal[lane] : 0; index = (threadIdx.x < blockDim.x / 32) ? sharedIndex[lane] : 0; if (wid==0) { Loading @@ -63,23 +69,9 @@ void findPivot( Matrix< Real, TNL::Devices::Cuda, int >* A, blockReduceArgMax( firstElementInRow, index ); if( threadIdx.x == 0 ) { //printf("%d: %.2f\n", index, firstElementInRow ); outMaximum[blockIdx.x] = firstElementInRow; outPosition[blockIdx.x] = index; } //if( rowPointer < A->getNumRows() && rowPointer >= colPointerMain ) { //int pom = __float_as_int( (float)TNL::abs( A->getElement( rowPointer, colPointerMain ) ) ); /*if( TNL::abs( A->getElement(rowPointer, colPointerMain ) ) > 1 ) printf("%d: %d, %.4f\n", rowPointer, pom, TNL::abs( A->getElement(rowPointer, colPointerMain ) ) );*/ //atomicMax( Maximum, pom ); } //if( threadIdx.x == 0 && blockIdx.x == 0 ) // printf("%.4f %d\n", firstElementInRow, index ); } Loading @@ -88,36 +80,14 @@ __global__ void findRowPivot( TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > outMaximum, TNL::Containers::VectorView< int, TNL::Devices::Cuda, int > outPosition, int* positionPivot ) { /*if( threadIdx.x == 0 ) { printf("outMax = [ "); for( int i = 0; i < blockDim.x; i++ ) printf("%.2f, ", outMaximum[i] ); printf("]\n"); printf("outPos = [ "); for( int i = 0; i < blockDim.x; i++ ) printf("%d, ", outPosition[i] ); printf("]\n"); }*/ int rowPointer = threadIdx.x; Real firstElementInRow = rowPointer >= blockDim.x ? 0 : outMaximum[ rowPointer ]; int index = outPosition[ rowPointer ]; Real firstElementInRow = rowPointer >= outMaximum.getSize() ? 0 : outMaximum[ rowPointer ]; int index = rowPointer >= outPosition.getSize() ? 0 : outPosition[ rowPointer ]; blockReduceArgMax( firstElementInRow, index ); //printf("%d: %.2f\n", index, firstElementInRow ); if( threadIdx.x == 0 ) { *positionPivot = index; } /*int rowPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; if( rowPointer >= colPointerMain && rowPointer < A->getNumRows() && __float_as_int( (float)TNL::abs( A->getElement( rowPointer, colPointerMain ) ) ) == *Maximum ) { atomicExch( positionPivot, rowPointer); }*/ } Loading @@ -127,8 +97,8 @@ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, int colPointerMain, int numBlocksOnRow, int* positionPivot ) { //if( threadIdx.x == 0 && blockIdx.x == 0 ) // printf("\nChoosing element at %d-th row as pivot...\nSwapping %d-th and %d-th rows...\n",*positionPivot,colPointerMain,*positionPivot ); if( *positionPivot > colPointerMain ) { int rowPointer1 = colPointerMain; int rowPointer2 = *positionPivot; int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; Loading @@ -145,6 +115,7 @@ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, } } } } template <typename Real > Loading Loading
src/Benchmarks/GEM/Matrix/Matrix_impl.h +1 −1 Original line number Diff line number Diff line Loading @@ -19,7 +19,7 @@ template < typename Real, typename Index > void Matrix< Real, Device, Index >::setDimensions( const Index rows, const Index columns ) { TNL_ASSERT( rows > 0 && columns > 0, std::cerr << "Matrix cannot have zero rows nor columns!"); TNL_ASSERT( rows > 0 && columns > 0, std::cerr << "Matrix cannot have zero rows nor columns!\n"); this->rows = rows; this->columns = columns; Loading
src/Benchmarks/GEM/gem.h +3 −3 Original line number Diff line number Diff line Loading @@ -112,11 +112,11 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix, { typedef Matrix< Real, Devices::Host, Index> MatrixHost; MatrixHost matrixHost; // Get matrix Matrices::MatrixReader< MatrixHost > m; m.readMtxFile( "./test-matrices/" + matrixName, matrixHost ); if( verbose > 1 ) cout << "reading matrix " << matrixName << endl; // Get matrix Matrices::MatrixReader< MatrixHost > m; m.readMtxFile( "./test-matrices/" + matrixName, matrixHost, verbose ); if( verbose > 2 ) matrixHost.showMatrix(); matrix = matrixHost; Loading
src/Benchmarks/GEM/gem/GEM_impl.h +1 −1 Original line number Diff line number Diff line Loading @@ -144,7 +144,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose ) if( verbose > 1 ) { std::cout << std::endl; std::cout << "Choosing element at " << pivotPosition << "-th row as pivot..." << std::endl; std::cout << "Choosing element at " << pivotPosition << "-th row as pivot with value " << this->A.getElement(k,k)<<"..." << std::endl; std::cout << "Swapping " << k << "-th and " << pivotPosition << "-th rows ... " << std::endl; } Loading
src/Benchmarks/GEM/gem/GEMdevice.h +7 −6 Original line number Diff line number Diff line Loading @@ -57,14 +57,15 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting if( pivoting == "yes" ) { // PIVOTING int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 512 ? 512 : int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 ); int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize ); int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 ); //printf("%d, %d, %d\n", reduceBlockSize, reduceGridSize, reduceGridSizeRound ); TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSizeRound); TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(reduceGridSizeRound); outMax.setValue(0); outPos.setValue(0); TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSize); TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(reduceGridSize); //outMax.setValue(0); outPos.setValue(0); findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, colPointer, outMax.getView(), outPos.getView() ); cudaDeviceSynchronize(); Loading @@ -83,12 +84,12 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; if( pivoting == "yes" && *pom != -1 && *pom != colPointer ) if( pivoting == "yes" )// && *pom != -1 && *pom != colPointer ) { if( verbose > 1 ) { std::cout << std::endl; std::cout << "Choosing element at " << *pom << "-th row as pivot..." << std::endl; std::cout << "Choosing element at " << *pom << "-th row as pivot with value..." << std::endl; std::cout << "Swapping " << colPointer << "-th and " << *pom << "-th rows ... " << std::endl; } swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot ); Loading
src/Benchmarks/GEM/gem/GEMkernels.h +25 −54 Original line number Diff line number Diff line Loading @@ -25,18 +25,24 @@ __inline__ __device__ void warpReduceArgMax(Real& val, int& index) { template <typename Real > __inline__ __device__ void blockReduceArgMax(Real& val, int& index) { static __shared__ Real shared[32]; // Shared mem for 32 partial sums static __shared__ Real sharedVal[32]; // Shared mem for 32 partial reduction static __shared__ Real sharedIndex[32]; // Shared mem for 32 partial reduction int lane = threadIdx.x % 32; int wid = threadIdx.x / 32; warpReduceArgMax( val, index); // Each warp performs partial reduction if (lane==0) shared[wid]=val; // Write reduced value to shared memory if (lane==0) { sharedVal[wid]=val; // Write reduced value to shared memory sharedIndex[wid]=index; // Write reduced value to shared memory } __syncthreads(); // Wait for all partial reductions //read from shared memory only if that warp existed val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; val = (threadIdx.x < blockDim.x / 32) ? sharedVal[lane] : 0; index = (threadIdx.x < blockDim.x / 32) ? sharedIndex[lane] : 0; if (wid==0) { Loading @@ -63,23 +69,9 @@ void findPivot( Matrix< Real, TNL::Devices::Cuda, int >* A, blockReduceArgMax( firstElementInRow, index ); if( threadIdx.x == 0 ) { //printf("%d: %.2f\n", index, firstElementInRow ); outMaximum[blockIdx.x] = firstElementInRow; outPosition[blockIdx.x] = index; } //if( rowPointer < A->getNumRows() && rowPointer >= colPointerMain ) { //int pom = __float_as_int( (float)TNL::abs( A->getElement( rowPointer, colPointerMain ) ) ); /*if( TNL::abs( A->getElement(rowPointer, colPointerMain ) ) > 1 ) printf("%d: %d, %.4f\n", rowPointer, pom, TNL::abs( A->getElement(rowPointer, colPointerMain ) ) );*/ //atomicMax( Maximum, pom ); } //if( threadIdx.x == 0 && blockIdx.x == 0 ) // printf("%.4f %d\n", firstElementInRow, index ); } Loading @@ -88,36 +80,14 @@ __global__ void findRowPivot( TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > outMaximum, TNL::Containers::VectorView< int, TNL::Devices::Cuda, int > outPosition, int* positionPivot ) { /*if( threadIdx.x == 0 ) { printf("outMax = [ "); for( int i = 0; i < blockDim.x; i++ ) printf("%.2f, ", outMaximum[i] ); printf("]\n"); printf("outPos = [ "); for( int i = 0; i < blockDim.x; i++ ) printf("%d, ", outPosition[i] ); printf("]\n"); }*/ int rowPointer = threadIdx.x; Real firstElementInRow = rowPointer >= blockDim.x ? 0 : outMaximum[ rowPointer ]; int index = outPosition[ rowPointer ]; Real firstElementInRow = rowPointer >= outMaximum.getSize() ? 0 : outMaximum[ rowPointer ]; int index = rowPointer >= outPosition.getSize() ? 0 : outPosition[ rowPointer ]; blockReduceArgMax( firstElementInRow, index ); //printf("%d: %.2f\n", index, firstElementInRow ); if( threadIdx.x == 0 ) { *positionPivot = index; } /*int rowPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; if( rowPointer >= colPointerMain && rowPointer < A->getNumRows() && __float_as_int( (float)TNL::abs( A->getElement( rowPointer, colPointerMain ) ) ) == *Maximum ) { atomicExch( positionPivot, rowPointer); }*/ } Loading @@ -127,8 +97,8 @@ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, int colPointerMain, int numBlocksOnRow, int* positionPivot ) { //if( threadIdx.x == 0 && blockIdx.x == 0 ) // printf("\nChoosing element at %d-th row as pivot...\nSwapping %d-th and %d-th rows...\n",*positionPivot,colPointerMain,*positionPivot ); if( *positionPivot > colPointerMain ) { int rowPointer1 = colPointerMain; int rowPointer2 = *positionPivot; int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; Loading @@ -145,6 +115,7 @@ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, } } } } template <typename Real > Loading