Debugging Adaptive RgCSR format. (d0a48c82) · Commits · TNL / tnl-dev

src/matrix/tnlAdaptiveRgCSRMatrix.h

+75 −16

Original line number	Diff line number	Diff line
		@@ -77,8 +77,7 @@ class tnlAdaptiveRgCSRMatrix : public tnlMatrix< Real, Device, Index >

		Index getArtificialZeroElements() const;

		Real getElement( Index row, Index column ) const
		{ abort(); };
		Real getElement( Index row, Index column ) const;

		bool setElement( Index row,
		Index colum,
		@@ -138,9 +137,10 @@ class tnlAdaptiveRgCSRMatrix : public tnlMatrix< Real, Device, Index >

		tnlLongVector< Index, tnlHost, Index > usedThreadsInGroup;


		tnlLongVector< Index, Device, Index > rowToGroupMapping;

		Index maxGroupSize, groupSizeStep;

		Index targetNonzeroesPerGroup;

		Index numberOfGroups;
		@@ -181,6 +181,7 @@ tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: tnlAdaptiveRgCSRMatrix( const t
		threads( name + " : threads" ),
		groupInfo( name + ": groupInfo" ),
		usedThreadsInGroup( name + " : usedThreadsInGroup" ),
		rowToGroupMapping( name + " : rowToGroupMapping" ),
		maxGroupSize( _maxGroupSize ),
		groupSizeStep(_groupSizeStep),
		targetNonzeroesPerGroup(_targetNonzeroesPerGroup),
		@@ -228,10 +229,12 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: setSize( Index new_size )
		this -> size = new_size;
		if( ! groupInfo. setSize( this -> getSize()) \|\|
		! usedThreadsInGroup. setSize( this -> getSize() ) \|\|
		! threads. setSize( this -> getSize() ) )
		! threads. setSize( this -> getSize() ) \|\|
		! rowToGroupMapping. setSize( this -> getSize() ) )
		return false;
		threads. setValue( 0 );
		usedThreadsInGroup. setValue( 0 );
		rowToGroupMapping. setValue( 0 );
		last_nonzero_element = 0;
		return true;
		};
		@@ -338,7 +341,7 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlCSRMatr
		threads[ i ] = threads[ i - 1 ] + threadsPerRow[ i - groupBegin - 1 ];
		dbgExpr( threads[ i ] );
		}
		usedThreadsInGroup[ groupId ] = threads[ groupEnd - groupBegin - 1 ]; // ???????/
		usedThreadsInGroup[ groupId ] = threads[ groupEnd - groupBegin - 1 ]; // ???????
		dbgExpr( usedThreadsInGroup[ groupId ] );

		/****
		@@ -368,6 +371,9 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlCSRMatr
		dbgExpr( groupInfo[ groupId ]. idxFirstValue );
		dbgExpr( groupInfo[ groupId ]. numRounds );

		for( Index i = groupBegin; i < groupEnd; i ++ )
		rowToGroupMapping[ i ] = groupId;

		groupId++;
		numberOfStoredValues += cudaBlockSize * roundsFinal;
		groupBegin = groupEnd;
		@@ -399,6 +405,8 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlCSRMatr
		{
		baseRow = groupInfo[ i ]. idxFirstRow;
		index = groupInfo[ i ]. idxFirstValue;
		dbgExpr( baseRow );
		dbgExpr( index );
		/****
		* First compute number of threads for each row.
		*/
		@@ -422,11 +430,13 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlCSRMatr
		if( counters[ j ] < NZperRow[ j ] )
		{
		Index pos = mat. row_offsets[ baseRow + j ] + counters[ j ];
		dbgCout( "Inserting data from CSR format at position " << pos << " to AdaptiveRgCSR at " << index );
		nonzeroElements[ index ] = mat. nonzero_elements[ pos ];
		columns[ index ] = mat.columns[ pos ];
		}
		else
		{
		dbgCout( "Inserting artificial zero to AdaptiveRgCSR at " << index );
		columns[ index ] = -1;
		nonzeroElements[ index ] = 0.0;
		}
		@@ -479,6 +489,42 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlAdaptiv
		return true;
		};

		/****
		 * Returns the matrix element at position ( row, column ).
		 *
		 * NOTE: The lookup itself is not implemented yet. On the host the method
		 * only locates the group of the row and the range of threads assigned to
		 * the row; the search through the stored columns is still a draft (kept
		 * commented out below). Until it is finished the method returns 0.
		 *
		 * \param row    index of the row; must satisfy 0 <= row < getSize().
		 * \param column index of the column.
		 * \return currently always 0 (see the note above).
		 */
		template< typename Real, tnlDevice Device, typename Index >
		Real tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: getElement( Index row,
		                                                                  Index column ) const
		{
		   tnlAssert( 0 <= row && row < this -> getSize(),
		              cerr << "The row is outside the matrix." );
		   if( Device == tnlHost )
		   {
		      Index groupId = rowToGroupMapping[ row ];
		      Index groupRow = row - groupInfo[ groupId ]. idxFirstRow;
		      Index groupOffset = groupInfo[ groupId ]. idxFirstValue;
		      Index firstThread = threads[ row ];
		      /****
		       * If it is the last row in a group the lastThread is taken from
		       * usedThreadsInGroup. The test is done BEFORE touching threads[ row + 1 ]
		       * because setSize allocates threads with getSize() entries, so for the
		       * last row of the matrix the index row + 1 would be out of range.
		       */
		      Index lastThread = ( groupRow + 1 == groupInfo[ groupId ]. numRows ) ?
		                            usedThreadsInGroup[ groupId ] :
		                            threads[ row + 1 ];

		      /****
		       * TODO: finish the search for the column. Draft of the loop:
		       */
		      /*for( Index i = firstThread * groupInfo[ groupId ]. numRounds;
		           i < lastThread * groupInfo[ groupId ] * numRounds;
		           i ++ )
		         if( columns[ i ] == column )
		            return 0; //????????????????????
		      */

		   }
		   if( Device == tnlCuda )
		   {
		      tnlAssert( false,
		                 cerr << "tnlAdaptiveRgCSRMatrix< Real, tnlCuda, Index > ::getElement is not implemented yet." );
		      //TODO: implement this

		   }
		   /****
		    * Always return a defined value - flowing off the end of a function
		    * returning Real would be undefined behaviour.
		    */
		   return 0;
		}

		template< typename Real, tnlDevice Device, typename Index >
		void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLongVector< Real, Device, Index >& vec,
		tnlLongVector< Real, Device, Index >& result ) const
		@@ -503,11 +549,17 @@ void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLo
		Index limits[MAX_ROWS + 1]; //indices of first threads for each row + index of first unused thread
		Real results[MAX_ROWS];

		for( Index group = 0; group < this -> numberOfGroups; group ++ ) //for each group of rows
		/****
		* Go over all groups ...
		*/
		for( Index group = 0; group < this -> numberOfGroups; group ++ )
		{
		/****
		* In each group compute partial sums of each thread
		*/
		for( Index thread = 0;
		thread < this -> usedThreadsInGroup[ group ];
		thread ++ ) //for each used thread in group
		thread ++ )
		{
		idx[ thread ] = this -> groupInfo[ group ]. idxFirstValue + thread;
		psum[thread] = 0;
		@@ -521,11 +573,18 @@ void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLo
		}
		}

		/****
		* Compute local copy of thread indexes mapped to given row of the group.
		* (this is only to simulate copying data to the fast shared memory on GPU)
		*/
		for( Index thread = 0;
		thread < this -> groupInfo[ group ]. numRows;
		thread ++ ) //for threads corresponding to rows in group
		limits[ thread ] = this -> threads[ this -> groupInfo[ group ]. idxFirstRow + thread ]; //make a local copy of info about threads
		limits[ this -> groupInfo[ group ]. numRows ] = this -> usedThreadsInGroup[ group ]; //for convenience, add the index of first unused row
		thread ++ )
		limits[ thread ] = this -> threads[ this -> groupInfo[ group ]. idxFirstRow + thread ];
		/****
		* For convenience, add the index of first unused row.
		*/
		limits[ this -> groupInfo[ group ]. numRows ] = this -> usedThreadsInGroup[ group ];

		//reduction of partial sums and writing to the output
		for( Index thread = 0;
		@@ -555,13 +614,13 @@ void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLo
		int gridSize = (int) desGridSize;
		dim3 gridDim( gridSize ), blockDim( blockSize );

		AdaptiveRgCSRMatrixVectorProductKernel< Real, Index, false ><<< gridDim, blockDim >>>( result. getVector(),
		/*AdaptiveRgCSRMatrixVectorProductKernel< Real, Index, false ><<< gridDim, blockDim >>>( result. getVector(),
		vec. getVector(),
		nonzero_elements. getVector(),
		nonzeroElements. getVector(),
		columns. getVector(),
		block_info. getVector(),
		threads_per_row. getVector(),
		number_of_groups );
		groupInfo. getVector(),
		threadsPerRow. getVector(),
		numberOfGroups );*/
		cudaThreadSynchronize();
		CHECK_CUDA_ERROR;
		#else

src/matrix/tnlRgCSRMatrix.h

+5 −3

Original line number	Diff line number	Diff line
		@@ -456,6 +456,7 @@ template< typename Real, tnlDevice Device, typename Index >
		bool tnlRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlRgCSRMatrix< Real, Device2, Index >& rgCSRMatrix )
		{
		dbgFunctionName( "tnlRgCSRMatrix< Real, Device, Index >", "copyFrom" );

		groupSize = rgCSRMatrix. getGroupSize();
		if( ! this -> setSize( rgCSRMatrix. getSize() ) )
		return false;
		@@ -635,7 +636,7 @@ void tnlRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLongVector
		int gridSize = size / blockSize + ( size % blockSize != 0 ) + 1;
		dim3 gridDim( gridSize ), blockDim( blockSize );
		if( useCache )
		sparseOldCSRMatrixVectorProductKernel< Real, Index, true ><<< gridDim, blockDim >>>( size,
		sparseCSRMatrixVectorProductKernel< Real, Index, true ><<< gridDim, blockDim >>>( size,
		this -> getGroupSize(),
		nonzeroElements. getVector(),
		columns. getVector(),
		@@ -644,7 +645,7 @@ void tnlRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLongVector
		vec. getVector(),
		result. getVector() );
		else
		sparseOldCSRMatrixVectorProductKernel< Real, Index, false ><<< gridDim, blockDim >>>( size,
		sparseCSRMatrixVectorProductKernel< Real, Index, false ><<< gridDim, blockDim >>>( size,
		this -> getGroupSize(),
		nonzeroElements. getVector(),
		columns. getVector(),
		@@ -653,6 +654,7 @@ void tnlRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLongVector
		vec. getVector(),
		result. getVector() );
		cudaThreadSynchronize();
		if( useCache )
		unbindRgCSRMatrixCUDATexture( vec. getVector() );
		CHECK_CUDA_ERROR;
		#else

tests/Makefile.am

+11 −11

Original line number	Diff line number	Diff line
		@@ -18,15 +18,15 @@ matrix_solvers_benchmark_sources = matrix-solvers-benchmark.h
		tnl_benchmarks_sources = tnl-benchmarks.h

		#if BUILD_CUDA
		#matrix_formats_test_sources += matrix-formats-test-cuda.cu
		#sparse_matrix_benchmark_sources += sparse-matrix-benchmark-cuda.cu
		#matrix_solvers_benchmark_sources += matrix-solvers-benchmark-cuda.cu
		#tnl_benchmarks_sources += tnl-benchmarks-cuda.cu
		matrix_formats_test_sources += matrix-formats-test-cuda.cu
		sparse_matrix_benchmark_sources += sparse-matrix-benchmark-cuda.cu
		matrix_solvers_benchmark_sources += matrix-solvers-benchmark-cuda.cu
		tnl_benchmarks_sources += tnl-benchmarks-cuda.cu
		#else
		matrix_formats_test_sources += matrix-formats-test.cpp
		sparse_matrix_benchmark_sources += sparse-matrix-benchmark.cpp
		matrix_solvers_benchmark_sources += matrix-solvers-benchmark.cpp
		tnl_benchmarks_sources += tnl-benchmarks.cpp
		#matrix_formats_test_sources += matrix-formats-test.cpp
		#sparse_matrix_benchmark_sources += sparse-matrix-benchmark.cpp
		#matrix_solvers_benchmark_sources += matrix-solvers-benchmark.cpp
		#tnl_benchmarks_sources += tnl-benchmarks.cpp
		#endif

		matrix_formats_test_SOURCES = $(matrix_formats_test_sources)
		@@ -77,9 +77,9 @@ matrix_solvers_benchmark_dbg_LDADD = ../src/libtnl-0.1.la \
		endif

		#if BUILD_CUDA
		#matrix_formats_test_CXXFLAGS = -DHAVE_CUDA
		#sparse_matrix_benchmark_CXXFLAGS += -DHAVE_CUDA
		#tnl_benchmarks_CXXFLAGS = -DHAVE_CUDA
		matrix_formats_test_CXXFLAGS = -DHAVE_CUDA
		sparse_matrix_benchmark_CXXFLAGS += -DHAVE_CUDA
		tnl_benchmarks_CXXFLAGS = -DHAVE_CUDA
		#endif

tests/Makefile.in

+142 −86

File changed.

Preview size limit exceeded, changes collapsed.

tests/convert-matrices

+0 −28

Original line number	Diff line number	Diff line
		@@ -120,34 +120,6 @@ do
		echo "Matrix $file.double.bin.bz2 was already converted."
		fi

		#######
		## Descend ordering
		###
		if test ! -e $file.descend.float.bin.bz2;
		then
		echo "Converting matrix $file.descend.float.bin.bz2 ..."
		if test ! -e $file.descend;
		then
		./reorder-csr -D -g 32 -i $file -o $file.descend
		fi
		$TNL_MATRIX_CONVERT --input-file $file.descend --output-file $file.descend.float.bin.bz2 --precision float --verbose yes --verify no
		#rm $file.descend
		else
		echo "Matrix $file.descend.float.bin.bz2 was already converted."
		fi
		if test ! -e $file.descend.double.bin.bz2;
		then
		echo "Converting matrix $file.descend.double.bin.bz2 ..."
		if test ! -e $file.descend;
		then
		./reorder-csr -D -g 32 -i $file -o $file.descend
		fi
		$TNL_MATRIX_CONVERT --input-file $file.descend --output-file $file.descend.double.bin.bz2 --precision double --verbose yes --verify no
		#rm $file.descend
		else
		echo "Matrix $file.descend.double.bin.bz2 was already converted."
		fi

		#######
		## AMD ordering
		###