Loading src/Benchmarks/GEM/gem/GEMdevice.h +20 −11 Original line number Diff line number Diff line Loading @@ -14,8 +14,8 @@ void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix, { Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrix); int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumColumns(); int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 1024 ); int blockSize = matrix.getNumRows() > 256 ? 256 : matrix.getNumColumns(); int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 256 ); int numOfBlocks = matrix.getNumRows() * numBlocksOnColumn; Loading Loading @@ -54,10 +54,16 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting if( verbose > 1 ) printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() ); if( verbose > 2 ){ showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } if( pivoting == "yes" ) { // PIVOTING int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 256 ? 256 : TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 ); int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize ); int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 ); Loading @@ -79,9 +85,10 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting } int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer; int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 ); int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; int blockSize = this->A.getNumRows() * (this->A.getNumColumns()-colPointer) > 256 ? 256 : this->A.getNumRows() * ( this->A.getNumColumns() - colPointer ); int gridSize = TNL::roundUpDivision( this->A.getNumRows() * (this->A.getNumColumns()-colPointer), blockSize ); //printf( "%d: %d, %d\n", colPointer, blockSize, numOfBlocks ); if( pivoting == "yes" )// && *pom != -1 && *pom != colPointer ) Loading @@ -92,15 +99,17 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting std::cout << "Choosing element at " << *pom << "-th row as pivot with value..." << std::endl; std::cout << "Swapping " << colPointer << "-th and " << *pom << "-th rows ... " << std::endl; } swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot ); int blockSizeSwap = this->A.getNumColumns()-colPointer > 256 ?256 : this->A.getNumColumns() - colPointer; int gridSizeSwap = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, blockSize ); swapRows<<< gridSizeSwap, blockSizeSwap >>>( devMat, device_vector.getView(), colPointer, pivot ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, GEMmainKernel<<< gridSize, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow ); colPointer ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } Loading src/Benchmarks/GEM/gem/GEMdeviceMPI.h +16 −10 Original line number Diff line number Diff line Loading @@ -43,8 +43,8 @@ void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix, cudaMemcpy( ( void* ) devMat,( void* ) &matrix, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice ); TNL_CHECK_CUDA_DEVICE; int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumRows(); int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 1024 ); int blockSize = matrix.getNumRows() > 256 ? 256 : matrix.getNumRows(); int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 256 ); int numOfBlocks = matrix.getNumRows() * numBlocksOnColumn; Loading Loading @@ -155,9 +155,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() - colPointerMain + 1 ); // Setting number of threads and blocks for main kernel and for pivoting swapping kernel int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer; int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 ); int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; if( pivoting == "yes" ) Loading @@ -176,11 +173,11 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot outMax.setValue(0); outPos.setValue(-1); // those blocks that have rows to look for pivot in should start looking // those blocks that have rows to look for pivot in, should start looking if( fromRow != this->A.getNumRows() ) { // setting size for kernel of pivoting int reduceBlockSize = (this->A.getNumRows()-fromRow) > 1024 ? 1024 : int reduceBlockSize = (this->A.getNumRows()-fromRow) > 256 ? 256 : TNL::roundToMultiple( this->A.getNumRows()-fromRow, 32 ); int reduceGridSize = TNL::roundUpDivision( this->A.getNumRows()-fromRow, reduceBlockSize ); int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 ); Loading Loading @@ -272,7 +269,10 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot std::cout << "Choosing element at " << Possition << "-th row as pivot with value..." << std::endl; std::cout << "Swapping " << colPointer << "-th and " << Possition << "-th rows ... " << std::endl; } swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, Possition ); int blockSize = this->A.getNumColumns()-colPointerMain > 256 ? 256 : ( this->A.getNumColumns()-colPointerMain ); int gridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointerMain, blockSize ); swapRows<<< gridSize, blockSize >>>( devMat, device_vector.getView(), colPointerMain, colPointer, Possition ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } Loading Loading @@ -375,6 +375,13 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot } if( verbose > 1 ) printf( "Elimination: %d/%d\n", colPointerMain, this->A.getNumColumns() ); // Setting number of threads and blocks for main kernel int blockSize = this->A.getNumRows() * (this->A.getNumColumns()-colPointerMain) > 256 ? 256 : this->A.getNumRows() * ( this->A.getNumColumns()-colPointerMain ); int numOfBlocks = TNL::roundUpDivision( this->A.getNumRows() * (this->A.getNumColumns()-colPointerMain), blockSize ); //std::cout << mainRowVec << std::endl; // Finally main kernel calculates the GEM for colPointerMain from mainRowVec GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, Loading @@ -382,7 +389,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot mainRowVec.getView(), colPointer, colPointerMain, numBlocksOnRow, processID, numOfProcesses ); cudaDeviceSynchronize(); Loading src/Benchmarks/GEM/gem/GEMkernels.h +9 −7 Original line number Diff line number Diff line Loading @@ -95,19 +95,19 @@ template <typename Real > __global__ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, int colPointerMain, int numBlocksOnRow, int* positionPivot ) int colPointerMain, int* positionPivot ) { if( *positionPivot > colPointerMain ) { int rowPointer1 = colPointerMain; int rowPointer2 = *positionPivot; int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; int colPointer = threadIdx.x + blockDim.x *blockIdx.x + colPointerMain; if( colPointer < A->getNumColumns() && rowPointer1 < A->getNumRows() ) { Real pom = A->getElement( rowPointer1, colPointer ); A->setElement( rowPointer1, colPointer, A->getElement( rowPointer2, colPointer ) ); A->setElement( rowPointer2, colPointer, pom ); if( colPointer == colPointerMain && blockIdx.x == 0 ) if( colPointer == colPointerMain ) { pom = b[rowPointer1]; b[rowPointer1] = b[rowPointer2]; Loading @@ -122,10 +122,12 @@ template <typename Real > __global__ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, int colPointerMain, int numBlocksOnRow ) int colPointerMain ) { int rowPointer = blockIdx.x / numBlocksOnRow; int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; int thread = threadIdx.x + blockIdx.x * blockDim.x; int rowPointer = thread / ( A->getNumRows() - colPointerMain ); int colPointer = thread % ( A->getNumRows() - colPointerMain ) + colPointerMain; //printf("%d, %d\n",rowPointer, colPointer ); if( colPointer > colPointerMain && colPointer < A->getNumColumns() && rowPointer != colPointerMain && rowPointer < A->getNumRows() ) { if( A->getElement( colPointerMain, colPointerMain ) != 0 ) Loading @@ -139,7 +141,7 @@ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A, } } else if( colPointer == colPointerMain && rowPointer == colPointerMain ) printf( "Error, pivot is zero!\n"); } if( rowPointer != colPointerMain && threadIdx.x == 0 && blockIdx.x % numBlocksOnRow == 0 && A->getElement( colPointerMain, colPointerMain ) != 0 && A->getElement( rowPointer, colPointerMain ) != 0 ) if( rowPointer < A->getNumRows() && colPointer < A->getNumColumns() && rowPointer != colPointerMain && colPointer == colPointerMain && A->getElement( colPointerMain, colPointerMain ) != 0 && A->getElement( rowPointer, colPointerMain ) != 0 ) { b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * b[ colPointerMain ] / A->getElement( colPointerMain, colPointerMain ); } Loading src/Benchmarks/GEM/gem/GEMkernelsMPI.h +12 −10 Original line number Diff line number Diff line Loading @@ -95,19 +95,19 @@ template <typename Real > __global__ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, int colPointerMain, int numBlocksOnRow, int positionPivot ) int colPointerMain, int colPointerPom, int positionPivot ) { if( positionPivot > colPointerMain ) if( positionPivot > colPointerPom ) { int rowPointer1 = colPointerMain; int rowPointer1 = colPointerPom; int rowPointer2 = positionPivot; int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; int colPointer = threadIdx.x + blockDim.x * blockIdx.x + colPointerMain; if( colPointer < A->getNumColumns() && rowPointer1 < A->getNumRows() ) { Real pom = A->getElement( rowPointer1, colPointer ); A->setElement( rowPointer1, colPointer, A->getElement( rowPointer2, colPointer ) ); A->setElement( rowPointer2, colPointer, pom ); if( colPointer == colPointerMain && blockIdx.x == 0 ) if( colPointer == colPointerMain ) { pom = b[rowPointer1]; b[rowPointer1] = b[rowPointer2]; Loading @@ -123,10 +123,12 @@ __global__ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > mainRow, int rowPointerMain, int colPointerMain, int numBlocksOnRow, int processID, int numOfProcesses ) int rowPointerMain, int colPointerMain, int processID, int numOfProcesses ) { int rowPointer = blockIdx.x / numBlocksOnRow; int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; int thread = threadIdx.x + blockDim.x * blockIdx.x; int rowPointer = thread / ( A->getNumColumns() - colPointerMain ); int colPointer = thread % ( A->getNumColumns() - colPointerMain ) + colPointerMain; if( colPointer > colPointerMain && colPointer < A->getNumColumns() && rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() ) { Loading @@ -141,8 +143,8 @@ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A, } } else if( colPointer == colPointerMain && rowPointer == colPointerMain ) printf( "Error, pivot is zero!\n"); } if( threadIdx.x == 0 && blockIdx.x % numBlocksOnRow == 0 && rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() && mainRow[ 0 ] != 0 && A->getElement( rowPointer, colPointerMain ) != 0 ) if( colPointer == colPointerMain && rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() && colPointer < A->getNumColumns() && mainRow[ 0 ] != 0 && A->getElement( rowPointer, colPointerMain ) != 0 ) { b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * mainRow[ mainRow.getSize()-1 ] / mainRow[ 0 ]; } Loading src/Benchmarks/GEM/tnl-gem-script +20 −4 Original line number Diff line number Diff line #!/bin/bash device="both" precision="float" matice="matice6 662_bus" matice="662_bus comsol losmoc msc01050" loops=2 runTest() Loading @@ -15,10 +14,27 @@ runTest() echo ${matrix}.mtx ./tnl-gem-cuda --input-matrix ${matrix}".mtx" \ --device ${device} \ --device GPU \ --loops ${loops} \ --precision ${precision} \ >> ./results/${matrix}.txt >> ./results/${matrix}GPU.txt ./tnl-gem-cuda --input-matrix ${matrix}".mtx" \ --device GPU \ --loops ${loops} \ --precision ${precision} \ --pivoting no \ >> ./results/${matrix}GPU.txt ./tnl-gem --input-matrix ${matrix}".mtx" \ --device CPU \ --loops ${loops} \ --precision ${precision} \ >> ./results/${matrix}CPU.txt ./tnl-gem --input-matrix ${matrix}".mtx" \ --device CPU \ --loops ${loops} \ --precision ${precision} \ --pivoting no \ >> ./results/${matrix}CPU.txt done } Loading Loading
src/Benchmarks/GEM/gem/GEMdevice.h +20 −11 Original line number Diff line number Diff line Loading @@ -14,8 +14,8 @@ void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix, { Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrix); int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumColumns(); int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 1024 ); int blockSize = matrix.getNumRows() > 256 ? 256 : matrix.getNumColumns(); int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 256 ); int numOfBlocks = matrix.getNumRows() * numBlocksOnColumn; Loading Loading @@ -54,10 +54,16 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting if( verbose > 1 ) printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() ); if( verbose > 2 ){ showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } if( pivoting == "yes" ) { // PIVOTING int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 256 ? 256 : TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 ); int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize ); int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 ); Loading @@ -79,9 +85,10 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting } int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer; int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 ); int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; int blockSize = this->A.getNumRows() * (this->A.getNumColumns()-colPointer) > 256 ? 256 : this->A.getNumRows() * ( this->A.getNumColumns() - colPointer ); int gridSize = TNL::roundUpDivision( this->A.getNumRows() * (this->A.getNumColumns()-colPointer), blockSize ); //printf( "%d: %d, %d\n", colPointer, blockSize, numOfBlocks ); if( pivoting == "yes" )// && *pom != -1 && *pom != colPointer ) Loading @@ -92,15 +99,17 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting std::cout << "Choosing element at " << *pom << "-th row as pivot with value..." << std::endl; std::cout << "Swapping " << colPointer << "-th and " << *pom << "-th rows ... " << std::endl; } swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot ); int blockSizeSwap = this->A.getNumColumns()-colPointer > 256 ?256 : this->A.getNumColumns() - colPointer; int gridSizeSwap = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, blockSize ); swapRows<<< gridSizeSwap, blockSizeSwap >>>( devMat, device_vector.getView(), colPointer, pivot ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, GEMmainKernel<<< gridSize, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow ); colPointer ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } Loading
src/Benchmarks/GEM/gem/GEMdeviceMPI.h +16 −10 Original line number Diff line number Diff line Loading @@ -43,8 +43,8 @@ void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix, cudaMemcpy( ( void* ) devMat,( void* ) &matrix, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice ); TNL_CHECK_CUDA_DEVICE; int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumRows(); int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 1024 ); int blockSize = matrix.getNumRows() > 256 ? 256 : matrix.getNumRows(); int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 256 ); int numOfBlocks = matrix.getNumRows() * numBlocksOnColumn; Loading Loading @@ -155,9 +155,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() - colPointerMain + 1 ); // Setting number of threads and blocks for main kernel and for pivoting swapping kernel int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer; int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 ); int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; if( pivoting == "yes" ) Loading @@ -176,11 +173,11 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot outMax.setValue(0); outPos.setValue(-1); // those blocks that have rows to look for pivot in should start looking // those blocks that have rows to look for pivot in, should start looking if( fromRow != this->A.getNumRows() ) { // setting size for kernel of pivoting int reduceBlockSize = (this->A.getNumRows()-fromRow) > 1024 ? 1024 : int reduceBlockSize = (this->A.getNumRows()-fromRow) > 256 ? 256 : TNL::roundToMultiple( this->A.getNumRows()-fromRow, 32 ); int reduceGridSize = TNL::roundUpDivision( this->A.getNumRows()-fromRow, reduceBlockSize ); int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 ); Loading Loading @@ -272,7 +269,10 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot std::cout << "Choosing element at " << Possition << "-th row as pivot with value..." << std::endl; std::cout << "Swapping " << colPointer << "-th and " << Possition << "-th rows ... " << std::endl; } swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, Possition ); int blockSize = this->A.getNumColumns()-colPointerMain > 256 ? 256 : ( this->A.getNumColumns()-colPointerMain ); int gridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointerMain, blockSize ); swapRows<<< gridSize, blockSize >>>( devMat, device_vector.getView(), colPointerMain, colPointer, Possition ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } Loading Loading @@ -375,6 +375,13 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot } if( verbose > 1 ) printf( "Elimination: %d/%d\n", colPointerMain, this->A.getNumColumns() ); // Setting number of threads and blocks for main kernel int blockSize = this->A.getNumRows() * (this->A.getNumColumns()-colPointerMain) > 256 ? 256 : this->A.getNumRows() * ( this->A.getNumColumns()-colPointerMain ); int numOfBlocks = TNL::roundUpDivision( this->A.getNumRows() * (this->A.getNumColumns()-colPointerMain), blockSize ); //std::cout << mainRowVec << std::endl; // Finally main kernel calculates the GEM for colPointerMain from mainRowVec GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, Loading @@ -382,7 +389,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot mainRowVec.getView(), colPointer, colPointerMain, numBlocksOnRow, processID, numOfProcesses ); cudaDeviceSynchronize(); Loading
src/Benchmarks/GEM/gem/GEMkernels.h +9 −7 Original line number Diff line number Diff line Loading @@ -95,19 +95,19 @@ template <typename Real > __global__ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, int colPointerMain, int numBlocksOnRow, int* positionPivot ) int colPointerMain, int* positionPivot ) { if( *positionPivot > colPointerMain ) { int rowPointer1 = colPointerMain; int rowPointer2 = *positionPivot; int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; int colPointer = threadIdx.x + blockDim.x *blockIdx.x + colPointerMain; if( colPointer < A->getNumColumns() && rowPointer1 < A->getNumRows() ) { Real pom = A->getElement( rowPointer1, colPointer ); A->setElement( rowPointer1, colPointer, A->getElement( rowPointer2, colPointer ) ); A->setElement( rowPointer2, colPointer, pom ); if( colPointer == colPointerMain && blockIdx.x == 0 ) if( colPointer == colPointerMain ) { pom = b[rowPointer1]; b[rowPointer1] = b[rowPointer2]; Loading @@ -122,10 +122,12 @@ template <typename Real > __global__ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, int colPointerMain, int numBlocksOnRow ) int colPointerMain ) { int rowPointer = blockIdx.x / numBlocksOnRow; int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; int thread = threadIdx.x + blockIdx.x * blockDim.x; int rowPointer = thread / ( A->getNumRows() - colPointerMain ); int colPointer = thread % ( A->getNumRows() - colPointerMain ) + colPointerMain; //printf("%d, %d\n",rowPointer, colPointer ); if( colPointer > colPointerMain && colPointer < A->getNumColumns() && rowPointer != colPointerMain && rowPointer < A->getNumRows() ) { if( A->getElement( colPointerMain, colPointerMain ) != 0 ) Loading @@ -139,7 +141,7 @@ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A, } } else if( colPointer == colPointerMain && rowPointer == colPointerMain ) printf( "Error, pivot is zero!\n"); } if( rowPointer != colPointerMain && threadIdx.x == 0 && blockIdx.x % numBlocksOnRow == 0 && A->getElement( colPointerMain, colPointerMain ) != 0 && A->getElement( rowPointer, colPointerMain ) != 0 ) if( rowPointer < A->getNumRows() && colPointer < A->getNumColumns() && rowPointer != colPointerMain && colPointer == colPointerMain && A->getElement( colPointerMain, colPointerMain ) != 0 && A->getElement( rowPointer, colPointerMain ) != 0 ) { b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * b[ colPointerMain ] / A->getElement( colPointerMain, colPointerMain ); } Loading
src/Benchmarks/GEM/gem/GEMkernelsMPI.h +12 −10 Original line number Diff line number Diff line Loading @@ -95,19 +95,19 @@ template <typename Real > __global__ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, int colPointerMain, int numBlocksOnRow, int positionPivot ) int colPointerMain, int colPointerPom, int positionPivot ) { if( positionPivot > colPointerMain ) if( positionPivot > colPointerPom ) { int rowPointer1 = colPointerMain; int rowPointer1 = colPointerPom; int rowPointer2 = positionPivot; int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; int colPointer = threadIdx.x + blockDim.x * blockIdx.x + colPointerMain; if( colPointer < A->getNumColumns() && rowPointer1 < A->getNumRows() ) { Real pom = A->getElement( rowPointer1, colPointer ); A->setElement( rowPointer1, colPointer, A->getElement( rowPointer2, colPointer ) ); A->setElement( rowPointer2, colPointer, pom ); if( colPointer == colPointerMain && blockIdx.x == 0 ) if( colPointer == colPointerMain ) { pom = b[rowPointer1]; b[rowPointer1] = b[rowPointer2]; Loading @@ -123,10 +123,12 @@ __global__ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > mainRow, int rowPointerMain, int colPointerMain, int numBlocksOnRow, int processID, int numOfProcesses ) int rowPointerMain, int colPointerMain, int processID, int numOfProcesses ) { int rowPointer = blockIdx.x / numBlocksOnRow; int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; int thread = threadIdx.x + blockDim.x * blockIdx.x; int rowPointer = thread / ( A->getNumColumns() - colPointerMain ); int colPointer = thread % ( A->getNumColumns() - colPointerMain ) + colPointerMain; if( colPointer > colPointerMain && colPointer < A->getNumColumns() && rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() ) { Loading @@ -141,8 +143,8 @@ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A, } } else if( colPointer == colPointerMain && rowPointer == colPointerMain ) printf( "Error, pivot is zero!\n"); } if( threadIdx.x == 0 && blockIdx.x % numBlocksOnRow == 0 && rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() && mainRow[ 0 ] != 0 && A->getElement( rowPointer, colPointerMain ) != 0 ) if( colPointer == colPointerMain && rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() && colPointer < A->getNumColumns() && mainRow[ 0 ] != 0 && A->getElement( rowPointer, colPointerMain ) != 0 ) { b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * mainRow[ mainRow.getSize()-1 ] / mainRow[ 0 ]; } Loading
src/Benchmarks/GEM/tnl-gem-script +20 −4 Original line number Diff line number Diff line #!/bin/bash device="both" precision="float" matice="matice6 662_bus" matice="662_bus comsol losmoc msc01050" loops=2 runTest() Loading @@ -15,10 +14,27 @@ runTest() echo ${matrix}.mtx ./tnl-gem-cuda --input-matrix ${matrix}".mtx" \ --device ${device} \ --device GPU \ --loops ${loops} \ --precision ${precision} \ >> ./results/${matrix}.txt >> ./results/${matrix}GPU.txt ./tnl-gem-cuda --input-matrix ${matrix}".mtx" \ --device GPU \ --loops ${loops} \ --precision ${precision} \ --pivoting no \ >> ./results/${matrix}GPU.txt ./tnl-gem --input-matrix ${matrix}".mtx" \ --device CPU \ --loops ${loops} \ --precision ${precision} \ >> ./results/${matrix}CPU.txt ./tnl-gem --input-matrix ${matrix}".mtx" \ --device CPU \ --loops ${loops} \ --precision ${precision} \ --pivoting no \ >> ./results/${matrix}CPU.txt done } Loading