Parallel reduction repaired, optimalized for pivoting GPU->CPU copy. Text: seq GEM, paral GEM (a03b09c1) · Commits · TNL / tnl-dev

src/Benchmarks/GEM/Matrix/Matrix_impl.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -19,7 +19,7 @@ template < typename Real,
		typename Index >
		void Matrix< Real, Device, Index >::setDimensions( const Index rows, const Index columns )
		{
		TNL_ASSERT( rows > 0 && columns > 0, std::cerr << "Matrix cannot have zero rows nor columns!");
		TNL_ASSERT( rows > 0 && columns > 0, std::cerr << "Matrix cannot have zero rows nor columns!\n");
		this->rows = rows;
		this->columns = columns;

src/Benchmarks/GEM/gem.h

+3 −3

Original line number	Diff line number	Diff line
		@@ -112,11 +112,11 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix,
		{
		typedef Matrix< Real, Devices::Host, Index> MatrixHost;
		MatrixHost matrixHost;
		// Get matrix
		Matrices::MatrixReader< MatrixHost > m;
		m.readMtxFile( "./test-matrices/" + matrixName, matrixHost );
		if( verbose > 1 )
		cout << "reading matrix " << matrixName << endl;
		// Get matrix
		Matrices::MatrixReader< MatrixHost > m;
		m.readMtxFile( "./test-matrices/" + matrixName, matrixHost, verbose );
		if( verbose > 2 )
		matrixHost.showMatrix();
		matrix = matrixHost;

src/Benchmarks/GEM/gem/GEM_impl.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -144,7 +144,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose )
		if( verbose > 1 )
		{
		std::cout << std::endl;
		std::cout << "Choosing element at " << pivotPosition << "-th row as pivot..." << std::endl;
		std::cout << "Choosing element at " << pivotPosition << "-th row as pivot with value " << this->A.getElement(k,k)<<"..." << std::endl;
		std::cout << "Swapping " << k << "-th and " << pivotPosition << "-th rows ... " << std::endl;
		}

src/Benchmarks/GEM/gem/GEMdevice.h

+7 −6

Original line number	Diff line number	Diff line
		@@ -57,14 +57,15 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
		if( pivoting == "yes" )
		{
		// PIVOTING
		int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 512 ? 512 :
		int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 :
		TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 );
		int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize );
		int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 );
		//printf("%d, %d, %d\n", reduceBlockSize, reduceGridSize, reduceGridSizeRound );

		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSizeRound);
		TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(reduceGridSizeRound);
		outMax.setValue(0); outPos.setValue(0);
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSize);
		TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(reduceGridSize);
		//outMax.setValue(0); outPos.setValue(0);

		findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, colPointer, outMax.getView(), outPos.getView() );
		cudaDeviceSynchronize();
		@@ -83,12 +84,12 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
		int numOfBlocks = this->A.getNumRows() * numBlocksOnRow;


		if( pivoting == "yes" && pom != -1 && pom != colPointer )
		if( pivoting == "yes" )// && pom != -1 && pom != colPointer )
		{
		if( verbose > 1 )
		{
		std::cout << std::endl;
		std::cout << "Choosing element at " << *pom << "-th row as pivot..." << std::endl;
		std::cout << "Choosing element at " << *pom << "-th row as pivot with value..." << std::endl;
		std::cout << "Swapping " << colPointer << "-th and " << *pom << "-th rows ... " << std::endl;
		}
		swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot );

src/Benchmarks/GEM/gem/GEMkernels.h

+25 −54

Original line number	Diff line number	Diff line
		@@ -25,18 +25,24 @@ __inline__ __device__ void warpReduceArgMax(Real& val, int& index) {
		template <typename Real >
		__inline__ __device__ void blockReduceArgMax(Real& val, int& index)
		{
		static __shared__ Real shared[32]; // Shared mem for 32 partial sums
		static __shared__ Real sharedVal[32]; // Shared mem for 32 partial reduction
		static __shared__ Real sharedIndex[32]; // Shared mem for 32 partial reduction
		int lane = threadIdx.x % 32;
		int wid = threadIdx.x / 32;

		warpReduceArgMax( val, index); // Each warp performs partial reduction

		if (lane==0) shared[wid]=val; // Write reduced value to shared memory
		if (lane==0)
		{
		sharedVal[wid]=val; // Write reduced value to shared memory
		sharedIndex[wid]=index; // Write reduced value to shared memory
		}

		__syncthreads(); // Wait for all partial reductions

		//read from shared memory only if that warp existed
		val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
		val = (threadIdx.x < blockDim.x / 32) ? sharedVal[lane] : 0;
		index = (threadIdx.x < blockDim.x / 32) ? sharedIndex[lane] : 0;

		if (wid==0)
		{
		@@ -63,23 +69,9 @@ void findPivot( Matrix< Real, TNL::Devices::Cuda, int >* A,
		blockReduceArgMax( firstElementInRow, index );
		if( threadIdx.x == 0 )
		{
		//printf("%d: %.2f\n", index, firstElementInRow );
		outMaximum[blockIdx.x] = firstElementInRow;
		outPosition[blockIdx.x] = index;
		}


		//if( rowPointer < A->getNumRows() && rowPointer >= colPointerMain )
		{
		//int pom = __float_as_int( (float)TNL::abs( A->getElement( rowPointer, colPointerMain ) ) );
		/*if( TNL::abs( A->getElement(rowPointer, colPointerMain ) ) > 1 )
		printf("%d: %d, %.4f\n", rowPointer, pom, TNL::abs( A->getElement(rowPointer, colPointerMain ) ) );*/
		//atomicMax( Maximum, pom );
		}


		//if( threadIdx.x == 0 && blockIdx.x == 0 )
		// printf("%.4f %d\n", firstElementInRow, index );
		}


		@@ -88,36 +80,14 @@ __global__
		void findRowPivot( TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > outMaximum,
		TNL::Containers::VectorView< int, TNL::Devices::Cuda, int > outPosition, int* positionPivot )
		{
		/*if( threadIdx.x == 0 )
		{
		printf("outMax = [ ");
		for( int i = 0; i < blockDim.x; i++ )
		printf("%.2f, ", outMaximum[i] );
		printf("]\n");

		printf("outPos = [ ");
		for( int i = 0; i < blockDim.x; i++ )
		printf("%d, ", outPosition[i] );
		printf("]\n");
		}*/

		int rowPointer = threadIdx.x;
		Real firstElementInRow = rowPointer >= blockDim.x ? 0 : outMaximum[ rowPointer ];
		int index = outPosition[ rowPointer ];
		Real firstElementInRow = rowPointer >= outMaximum.getSize() ? 0 : outMaximum[ rowPointer ];
		int index = rowPointer >= outPosition.getSize() ? 0 : outPosition[ rowPointer ];
		blockReduceArgMax( firstElementInRow, index );
		//printf("%d: %.2f\n", index, firstElementInRow );
		if( threadIdx.x == 0 )
		{
		*positionPivot = index;
		}


		/int rowPointer = threadIdx.x + blockDim.x (blockIdx.x % numBlocksOnRow) + colPointerMain;
		if( rowPointer >= colPointerMain && rowPointer < A->getNumRows() &&
		__float_as_int( (float)TNL::abs( A->getElement( rowPointer, colPointerMain ) ) ) == *Maximum )
		{
		atomicExch( positionPivot, rowPointer);
		}*/
		}


		@@ -127,8 +97,8 @@ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A,
		TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b,
		int colPointerMain, int numBlocksOnRow, int* positionPivot )
		{
		//if( threadIdx.x == 0 && blockIdx.x == 0 )
		// printf("\nChoosing element at %d-th row as pivot...\nSwapping %d-th and %d-th rows...\n",positionPivot,colPointerMain,positionPivot );
		if( *positionPivot > colPointerMain )
		{
		int rowPointer1 = colPointerMain;
		int rowPointer2 = *positionPivot;
		int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
		@@ -145,6 +115,7 @@ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A,
		}
		}
		}
		}


		template <typename Real >