Changed the thread mapping in the GEM kernels (MPI and non-MPI variants). (fb53d7b0) · Commits · TNL / tnl-dev

src/Benchmarks/GEM/gem/GEMdevice.h

+20 −11

Original line number	Diff line number	Diff line
		@@ -14,8 +14,8 @@ void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
		{
		Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrix);

		int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumColumns();
		int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 1024 );
		int blockSize = matrix.getNumRows() > 256 ? 256 : matrix.getNumColumns();
		int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 256 );
		int numOfBlocks = matrix.getNumRows() * numBlocksOnColumn;


		@@ -54,10 +54,16 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
		if( verbose > 1 )
		printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() );

		if( verbose > 2 ){
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}

		if( pivoting == "yes" )
		{
		// PIVOTING
		int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 :
		int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 256 ? 256 :
		TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 );
		int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize );
		int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 );
		@@ -79,9 +85,10 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
		}


		int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
		int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
		int numOfBlocks = this->A.getNumRows() * numBlocksOnRow;
		int blockSize = this->A.getNumRows() * (this->A.getNumColumns()-colPointer) > 256 ?
		256 : this->A.getNumRows() * ( this->A.getNumColumns() - colPointer );
		int gridSize = TNL::roundUpDivision( this->A.getNumRows() * (this->A.getNumColumns()-colPointer), blockSize );
		//printf( "%d: %d, %d\n", colPointer, blockSize, numOfBlocks );


		if( pivoting == "yes" )// && pom != -1 && pom != colPointer )
		@@ -92,15 +99,17 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
		std::cout << "Choosing element at " << *pom << "-th row as pivot with value..." << std::endl;
		std::cout << "Swapping " << colPointer << "-th and " << *pom << "-th rows ... " << std::endl;
		}
		swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot );
		int blockSizeSwap = this->A.getNumColumns()-colPointer > 256 ?256 : this->A.getNumColumns() - colPointer;
		int gridSizeSwap = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, blockSize );

		swapRows<<< gridSizeSwap, blockSizeSwap >>>( devMat, device_vector.getView(), colPointer, pivot );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}

		GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat,
		GEMmainKernel<<< gridSize, blockSize >>>( devMat,
		device_vector.getView(),
		colPointer,
		numBlocksOnRow );
		colPointer );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}

src/Benchmarks/GEM/gem/GEMdeviceMPI.h

+16 −10

Original line number	Diff line number	Diff line
		@@ -43,8 +43,8 @@ void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
		cudaMemcpy( ( void* ) devMat,( void* ) &matrix, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice );
		TNL_CHECK_CUDA_DEVICE;

		int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumRows();
		int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 1024 );
		int blockSize = matrix.getNumRows() > 256 ? 256 : matrix.getNumRows();
		int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 256 );
		int numOfBlocks = matrix.getNumRows() * numBlocksOnColumn;


		@@ -155,9 +155,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() - colPointerMain + 1 );

		// Setting number of threads and blocks for main kernel and for pivoting swapping kernel
		int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
		int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
		int numOfBlocks = this->A.getNumRows() * numBlocksOnRow;


		if( pivoting == "yes" )
		@@ -176,11 +173,11 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
		outMax.setValue(0); outPos.setValue(-1);


		// those blocks that have rows to look for pivot in should start looking
		// those blocks that have rows to look for pivot in, should start looking
		if( fromRow != this->A.getNumRows() )
		{
		// setting size for kernel of pivoting
		int reduceBlockSize = (this->A.getNumRows()-fromRow) > 1024 ? 1024 :
		int reduceBlockSize = (this->A.getNumRows()-fromRow) > 256 ? 256 :
		TNL::roundToMultiple( this->A.getNumRows()-fromRow, 32 );
		int reduceGridSize = TNL::roundUpDivision( this->A.getNumRows()-fromRow, reduceBlockSize );
		int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 );
		@@ -272,7 +269,10 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
		std::cout << "Choosing element at " << Possition << "-th row as pivot with value..." << std::endl;
		std::cout << "Swapping " << colPointer << "-th and " << Possition << "-th rows ... " << std::endl;
		}
		swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, Possition );
		int blockSize = this->A.getNumColumns()-colPointerMain > 256 ? 256 : ( this->A.getNumColumns()-colPointerMain );
		int gridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointerMain, blockSize );

		swapRows<<< gridSize, blockSize >>>( devMat, device_vector.getView(), colPointerMain, colPointer, Possition );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		@@ -375,6 +375,13 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
		}
		if( verbose > 1 )
		printf( "Elimination: %d/%d\n", colPointerMain, this->A.getNumColumns() );

		// Setting number of threads and blocks for main kernel
		int blockSize = this->A.getNumRows() * (this->A.getNumColumns()-colPointerMain) > 256 ?
		256 : this->A.getNumRows() * ( this->A.getNumColumns()-colPointerMain );
		int numOfBlocks = TNL::roundUpDivision( this->A.getNumRows() * (this->A.getNumColumns()-colPointerMain), blockSize );


		//std::cout << mainRowVec << std::endl;
		// Finally main kernel calculates the GEM for colPointerMain from mainRowVec
		GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat,
		@@ -382,7 +389,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
		mainRowVec.getView(),
		colPointer,
		colPointerMain,
		numBlocksOnRow,
		processID,
		numOfProcesses );
		cudaDeviceSynchronize();

src/Benchmarks/GEM/gem/GEMkernels.h

+9 −7

Original line number	Diff line number	Diff line
		@@ -95,19 +95,19 @@ template <typename Real >
		__global__
		void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A,
		TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b,
		int colPointerMain, int numBlocksOnRow, int* positionPivot )
		int colPointerMain, int* positionPivot )
		{
		if( *positionPivot > colPointerMain )
		{
		int rowPointer1 = colPointerMain;
		int rowPointer2 = *positionPivot;
		int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
		int colPointer = threadIdx.x + blockDim.x *blockIdx.x + colPointerMain;
		if( colPointer < A->getNumColumns() && rowPointer1 < A->getNumRows() )
		{
		Real pom = A->getElement( rowPointer1, colPointer );
		A->setElement( rowPointer1, colPointer, A->getElement( rowPointer2, colPointer ) );
		A->setElement( rowPointer2, colPointer, pom );
		if( colPointer == colPointerMain && blockIdx.x == 0 )
		if( colPointer == colPointerMain )
		{
		pom = b[rowPointer1];
		b[rowPointer1] = b[rowPointer2];
		@@ -122,10 +122,12 @@ template <typename Real >
		__global__
		void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A,
		TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b,
		int colPointerMain, int numBlocksOnRow )
		int colPointerMain )
		{
		int rowPointer = blockIdx.x / numBlocksOnRow;
		int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
		int thread = threadIdx.x + blockIdx.x * blockDim.x;
		int rowPointer = thread / ( A->getNumRows() - colPointerMain );
		int colPointer = thread % ( A->getNumRows() - colPointerMain ) + colPointerMain;
		//printf("%d, %d\n",rowPointer, colPointer );
		if( colPointer > colPointerMain && colPointer < A->getNumColumns() && rowPointer != colPointerMain && rowPointer < A->getNumRows() )
		{
		if( A->getElement( colPointerMain, colPointerMain ) != 0 )
		@@ -139,7 +141,7 @@ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A,
		}
		} else if( colPointer == colPointerMain && rowPointer == colPointerMain ) printf( "Error, pivot is zero!\n");
		}
		if( rowPointer != colPointerMain && threadIdx.x == 0 && blockIdx.x % numBlocksOnRow == 0 && A->getElement( colPointerMain, colPointerMain ) != 0 && A->getElement( rowPointer, colPointerMain ) != 0 )
		if( rowPointer < A->getNumRows() && colPointer < A->getNumColumns() && rowPointer != colPointerMain && colPointer == colPointerMain && A->getElement( colPointerMain, colPointerMain ) != 0 && A->getElement( rowPointer, colPointerMain ) != 0 )
		{
		b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * b[ colPointerMain ] / A->getElement( colPointerMain, colPointerMain );
		}

src/Benchmarks/GEM/gem/GEMkernelsMPI.h

+12 −10

Original line number	Diff line number	Diff line
		@@ -95,19 +95,19 @@ template <typename Real >
		__global__
		void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A,
		TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b,
		int colPointerMain, int numBlocksOnRow, int positionPivot )
		int colPointerMain, int colPointerPom, int positionPivot )
		{
		if( positionPivot > colPointerMain )
		if( positionPivot > colPointerPom )
		{
		int rowPointer1 = colPointerMain;
		int rowPointer1 = colPointerPom;
		int rowPointer2 = positionPivot;
		int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
		int colPointer = threadIdx.x + blockDim.x * blockIdx.x + colPointerMain;
		if( colPointer < A->getNumColumns() && rowPointer1 < A->getNumRows() )
		{
		Real pom = A->getElement( rowPointer1, colPointer );
		A->setElement( rowPointer1, colPointer, A->getElement( rowPointer2, colPointer ) );
		A->setElement( rowPointer2, colPointer, pom );
		if( colPointer == colPointerMain && blockIdx.x == 0 )
		if( colPointer == colPointerMain )
		{
		pom = b[rowPointer1];
		b[rowPointer1] = b[rowPointer2];
		@@ -123,10 +123,12 @@ __global__
		void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A,
		TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b,
		TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > mainRow,
		int rowPointerMain, int colPointerMain, int numBlocksOnRow, int processID, int numOfProcesses )
		int rowPointerMain, int colPointerMain, int processID, int numOfProcesses )
		{
		int rowPointer = blockIdx.x / numBlocksOnRow;
		int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
		int thread = threadIdx.x + blockDim.x * blockIdx.x;
		int rowPointer = thread / ( A->getNumColumns() - colPointerMain );
		int colPointer = thread % ( A->getNumColumns() - colPointerMain ) + colPointerMain;

		if( colPointer > colPointerMain && colPointer < A->getNumColumns() &&
		rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() )
		{
		@@ -141,8 +143,8 @@ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A,
		}
		} else if( colPointer == colPointerMain && rowPointer == colPointerMain ) printf( "Error, pivot is zero!\n");
		}
		if( threadIdx.x == 0 && blockIdx.x % numBlocksOnRow == 0 && rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() &&
		mainRow[ 0 ] != 0 && A->getElement( rowPointer, colPointerMain ) != 0 )
		if( colPointer == colPointerMain && rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows()
		&& colPointer < A->getNumColumns() && mainRow[ 0 ] != 0 && A->getElement( rowPointer, colPointerMain ) != 0 )
		{
		b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * mainRow[ mainRow.getSize()-1 ] / mainRow[ 0 ];
		}

src/Benchmarks/GEM/tnl-gem-script

+20 −4

Original line number	Diff line number	Diff line
		#!/bin/bash
		device="both"
		precision="float"
		matice="matice6 662_bus"
		matice="662_bus comsol losmoc msc01050"
		loops=2

		runTest()
		@@ -15,10 +14,27 @@ runTest()

		echo ${matrix}.mtx
		./tnl-gem-cuda --input-matrix ${matrix}".mtx" \
		--device ${device} \
		--device GPU \
		--loops ${loops} \
		--precision ${precision} \
		>> ./results/${matrix}.txt
		>> ./results/${matrix}GPU.txt
		./tnl-gem-cuda --input-matrix ${matrix}".mtx" \
		--device GPU \
		--loops ${loops} \
		--precision ${precision} \
		--pivoting no \
		>> ./results/${matrix}GPU.txt
		./tnl-gem --input-matrix ${matrix}".mtx" \
		--device CPU \
		--loops ${loops} \
		--precision ${precision} \
		>> ./results/${matrix}CPU.txt
		./tnl-gem --input-matrix ${matrix}".mtx" \
		--device CPU \
		--loops ${loops} \
		--precision ${precision} \
		--pivoting no \
		>> ./results/${matrix}CPU.txt
		done
		}