Commit 8254fd03 authored by Tomáš Oberhuber

Fixes after rebase - it works even with CUDA.

parent 496bacdd
+8 −4
@@ -232,8 +232,10 @@ benchmarkSpMV( Benchmark& benchmark,
    resultcuSPARSEDeviceVector2 = deviceVector2;
    
    // Difference between GPU (current format) and GPU-cuSPARSE results
-    Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
-    Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
+    //Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
+    Real cuSparseDifferenceAbsMax = max( abs( resultDeviceVector2 - resultcuSPARSEDeviceVector2 ) );
+    //Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
+    Real cuSparseDifferenceLpNorm = lpNorm( resultDeviceVector2 - resultcuSPARSEDeviceVector2, 1 );
    
    std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax );
    std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm );
@@ -243,8 +245,10 @@ benchmarkSpMV( Benchmark& benchmark,
    
    
    // Difference between CPU and GPU results for the current format
-    Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
-    Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
+    //Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
+    Real differenceAbsMax = max( abs( resultHostVector2 - resultDeviceVector2 ) );
+    //Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
+    Real differenceLpNorm = lpNorm( resultHostVector2 - resultDeviceVector2, 1 );
    
    std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax );
    std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm );
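
The two hunks above replace the removed differenceAbsMax / differenceLpNorm member calls with free-standing vector expressions, max( abs( a - b ) ) and lpNorm( a - b, 1 ), which TNL evaluates through its vector expression templates on the host or on the GPU. As a hedged, TNL-free sketch of what the two metrics compute, the helpers below reproduce them for plain std::vector; the function names are illustrative and are not part of the benchmark.

#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

// Maximum absolute difference, max_i |a_i - b_i| -- the "differenceAbsMax" metric.
double differenceAbsMaxSketch( const std::vector< double >& a, const std::vector< double >& b )
{
   assert( a.size() == b.size() );
   double result = 0.0;
   for( std::size_t i = 0; i < a.size(); i++ )
      result = std::max( result, std::fabs( a[ i ] - b[ i ] ) );
   return result;
}

// l1 norm of the difference, sum_i |a_i - b_i| -- the "differenceLpNorm" metric with p = 1.
double differenceL1NormSketch( const std::vector< double >& a, const std::vector< double >& b )
{
   assert( a.size() == b.size() );
   double result = 0.0;
   for( std::size_t i = 0; i < a.size(); i++ )
      result += std::fabs( a[ i ] - b[ i ] );
   return result;
}

Both values should stay close to zero (up to rounding) when the tested format and the reference produce matching results.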
+16 −16
@@ -1064,7 +1064,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
                                                           OutVector& outVector,
                                                           const int gridIdx ) const
{
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    IndexType warpIdx = globalIdx >> 5;
    IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
    if( globalIdx >= this->reduceMap.getSize() )
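
The only change in the four spmvCuda* kernels of this and the following hunks is the namespace of the CUDA helpers (Devices::Cuda:: becomes Cuda::); the index arithmetic itself is unchanged. A minimal, self-contained CUDA sketch of that arithmetic follows, assuming a hypothetical kernel name and a maxGridSize parameter standing in for Cuda::getMaxGridSize().

__global__ void indexDecompositionSketch( int gridIdx, int maxGridSize, int size )
{
   // Global thread index across the consecutive grids launched for one call.
   const int globalIdx = ( gridIdx * maxGridSize + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( globalIdx >= size )
      return;
   // Warp decomposition: 32 threads per warp, hence shift by 5 and mask with 31.
   const int warpIdx = globalIdx >> 5;     // index of the warp this thread belongs to
   const int inWarpIdx = globalIdx & 31;   // lane index within the warp
   (void) warpIdx;
   (void) inWarpIdx;
}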
@@ -1129,14 +1129,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
                                                           OutVector& outVector,
                                                           const int gridIdx ) const
{
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    IndexType warpIdx = globalIdx >> 5;
    IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
    if( globalIdx >= this->reduceMap.getSize() )
        return;

    const int blockSize = 128;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
    __shared__ IndexType reduceMap[ blockSize ];
    reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
    temp[ threadIdx.x ] = 0.0;
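
Besides the getMaxGridSize() rename, spmvCuda8 (and spmvCuda16 / spmvCuda32 below) now obtains its per-thread scratch buffer from Cuda::getSharedMemory< Real >() instead of Devices::Cuda::getSharedMemory< Real >(). A hedged sketch of the underlying idea, written against the plain CUDA runtime rather than TNL, is shown below; the kernel name is illustrative, and the dynamic buffer must be sized at launch via the third <<< >>> argument.

template< typename Real, typename Index >
__global__ void sharedMemorySketch( const Index* reduceMapGlobal, Index size )
{
   const Index globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
   if( globalIdx >= size )
      return;

   // Dynamically sized shared memory; getSharedMemory< Real >() returns a typed
   // pointer into a buffer like this one.
   extern __shared__ char sharedBuffer[];
   Real* temp = reinterpret_cast< Real* >( sharedBuffer );

   // Statically sized shared array holding the reduction map for this block.
   constexpr int blockSize = 128;
   __shared__ Index reduceMap[ blockSize ];

   reduceMap[ threadIdx.x ] = reduceMapGlobal[ globalIdx ];
   temp[ threadIdx.x ] = 0.0;
   // ... the segmented per-warp reduction over temp, guided by reduceMap, follows ...
}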
@@ -1207,14 +1207,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
                                                         OutVector& outVector,
                                                         const int gridIdx ) const
{
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    IndexType warpIdx = globalIdx >> 5;
    IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
    if( globalIdx >= this->reduceMap.getSize() )
        return;

    const int blockSize = 128;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
    __shared__ IndexType reduceMap[ blockSize ];
    reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
    temp[ threadIdx.x ] = 0.0;
@@ -1293,7 +1293,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
	return;

    const int blockSize = 96;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
    __shared__ IndexType reduceMap[ blockSize ];
    reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
    temp[ threadIdx.x ] = 0.0;
@@ -1441,9 +1441,9 @@ public:
#ifdef HAVE_CUDA
      typedef AdEllpack< Real, Devices::Cuda, Index > Matrix;
      typedef typename Matrix::IndexType IndexType;
-	   Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-	   InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-	   OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
+	   Matrix* kernel_this = Cuda::passToDevice( matrix );
+	   InVector* kernel_inVector = Cuda::passToDevice( inVector );
+	   OutVector* kernel_outVector = Cuda::passToDevice( outVector );
      TNL_CHECK_CUDA_DEVICE;

      if( matrix.totalLoad < 2 )
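
Cuda::passToDevice( matrix ) copies the matrix object itself into device global memory so the kernel can dereference it as kernel_this; the matching Cuda::freeFromDevice calls further below release those copies. Conceptually this is a by-value copy through the CUDA runtime, roughly as in the following hedged sketch (plain CUDA runtime calls, not the TNL implementation).

#include <cuda_runtime.h>

// Copy a host object into device global memory and return the device pointer.
template< typename Object >
Object* passToDeviceSketch( const Object& object )
{
   Object* devicePointer = nullptr;
   cudaMalloc( (void**) &devicePointer, sizeof( Object ) );
   cudaMemcpy( devicePointer, &object, sizeof( Object ), cudaMemcpyHostToDevice );
   return devicePointer;
}

// Release the device-side copy created by passToDeviceSketch.
template< typename Object >
void freeFromDeviceSketch( Object* devicePointer )
{
   cudaFree( devicePointer );
}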
@@ -1510,16 +1510,16 @@ public:
                                                      gridIdx );
	    }
	    TNL_CHECK_CUDA_DEVICE;
-	    Devices::Cuda::freeFromDevice( kernel_this );
-	    Devices::Cuda::freeFromDevice( kernel_inVector );
-	    Devices::Cuda::freeFromDevice( kernel_outVector );
+	    Cuda::freeFromDevice( kernel_this );
+	    Cuda::freeFromDevice( kernel_inVector );
+	    Cuda::freeFromDevice( kernel_outVector );
	    TNL_CHECK_CUDA_DEVICE;
	}
	else if( matrix.totalLoad < 16 )
	{
	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
-	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
            for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
	    {
	        if( gridIdx == cudaGrids - 1 )
@@ -1556,9 +1556,9 @@ public:
                                                       gridIdx );
	    }
	    TNL_CHECK_CUDA_DEVICE;
-	    Devices::Cuda::freeFromDevice( kernel_this );
-	    Devices::Cuda::freeFromDevice( kernel_inVector );
-	    Devices::Cuda::freeFromDevice( kernel_outVector );
+	    Cuda::freeFromDevice( kernel_this );
+	    Cuda::freeFromDevice( kernel_inVector );
+	    Cuda::freeFromDevice( kernel_outVector );
	    TNL_CHECK_CUDA_DEVICE;
	}
#endif // HAVE_CUDA
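
All launch sites in this file follow the same multi-grid pattern: the number of blocks is computed with roundUpDivision, split into as many grids as getMaxGridSize() allows, and the last grid is shrunk to the remaining blocks. A self-contained host-side sketch of that loop is given below; someKernelSketch, maxGridSize and the helper roundUpDivisionSketch (ceiling division) are placeholders, not TNL API.

#include <cuda_runtime.h>

__global__ void someKernelSketch( int gridIdx ) { /* ... work for one grid ... */ }

inline int roundUpDivisionSketch( int a, int b ) { return ( a + b - 1 ) / b; }

void launchInGridsSketch( int size, int maxGridSize )
{
   dim3 blockSize( 128 ), cudaGridSize( maxGridSize );
   const int cudaBlocks = roundUpDivisionSketch( size, blockSize.x );
   const int cudaGrids = roundUpDivisionSketch( cudaBlocks, maxGridSize );
   for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
   {
      // The last grid covers only the remaining blocks, mirroring the code above.
      if( gridIdx == cudaGrids - 1 )
         cudaGridSize.x = cudaBlocks % maxGridSize;
      someKernelSketch<<< cudaGridSize, blockSize >>>( gridIdx );
   }
}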
+2 −2
@@ -1406,7 +1406,7 @@ public:
		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		     if( gridIdx == cudaGrids - 1 )
-		         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
		     performRowBubbleSortCuda< Real, Index >
		     	 	 	 	 	 	 <<< cudaGridSize, cudaBlockSize >>>
		                             ( kernel_this,
@@ -1436,7 +1436,7 @@ public:
		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		     if( gridIdx == cudaGrids - 1 )
-		         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
		     computeColumnSizesCuda< Real, Index >
		     	 	 	 	 	   <<< cudaGridSize, cudaBlockSize >>>
		                           ( kernel_this,
+2 −2
@@ -1230,8 +1230,8 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D
   
   // host -> cuda
   if( std::is_same< Device, Devices::Cuda >::value ) {
-       typename ValuesVector::HostType tmpValues;
-       typename ColumnIndexesVector::HostType tmpColumnIndexes;
+       typename ValuesVector::Self< typename ValuesVector::RealType, Devices::Host > tmpValues;
+       typename ColumnIndexesVector::Self< typename ColumnIndexesVector::RealType, Devices::Host > tmpColumnIndexes;
       tmpValues.setLike( matrix.values );
       tmpColumnIndexes.setLike( matrix.columnIndexes );
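
The ChunkedEllpack change replaces the removed HostType alias with the Self member template, which re-instantiates the vector type for Devices::Host while keeping the element type; the host temporaries then act as a staging area for the cross-device copy. The rebinding idiom itself looks roughly like the sketch below; the VectorSketch template and device tags are illustrative only, not TNL's containers.

#include <type_traits>

struct HostDeviceTag {};
struct CudaDeviceTag {};

template< typename Real, typename Device >
struct VectorSketch
{
   using RealType = Real;

   // Rebind: the same template instantiated with (possibly) different parameters.
   template< typename Real_, typename Device_ >
   using Self = VectorSketch< Real_, Device_ >;
};

// A CUDA-side vector type ...
using DeviceValues = VectorSketch< double, CudaDeviceTag >;
// ... rebound to the host device with the same element type, as in the diff above.
using HostValues = DeviceValues::Self< DeviceValues::RealType, HostDeviceTag >;

static_assert( std::is_same< HostValues, VectorSketch< double, HostDeviceTag > >::value,
               "Self keeps the element type and swaps the device" );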