Commit 9eb190f2 authored by Lukas Cejka's avatar Lukas Cejka Committed by Tomáš Oberhuber
Browse files

Removed useless comments in preparation for rebase.

parent cee27c2a
Loading
Loading
Loading
Loading
+7 −28
Original line number Diff line number Diff line
@@ -59,8 +59,6 @@ std::string getMatrixFormat( const Matrix& matrix )
    return format;
}

// This function is not used currently (as of 17.03.19),
//  as the log takes care of printing and saving this information into the log file.
// Print information about the matrix.
template< typename Matrix >
void printMatrixInfo( const Matrix& matrix,
@@ -218,14 +216,6 @@ benchmarkSpMV( Benchmark & benchmark,
    // Setup cuSPARSE MetaData, since it has the same header as CSR, 
    //  and therefore will not get its own headers (rows, cols, speedup etc.) in log.
    //      * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
    
    // FIXME: How to include benchmark with different name under the same header as the current format being benchmarked???
    // FIXME: Does it matter that speedup show difference only between current test and first test?
    //          Speedup shows difference between CPU and GPU-cuSPARSE, because in Benchmarks.h:
    //              * If there is no baseTime, the resulting test time is set to baseTime.
    //              * However, if there is a baseTime (from the CPU compared to GPU test),
    //                  baseTime isn't changed. If we change it in Benchmarks.h to compare 
    //                  the speedup from the last test, it will mess up BLAS benchmarks etc.
    benchmark.setMetadataColumns( Benchmark::MetadataColumns({
          { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
          { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
@@ -244,7 +234,6 @@ benchmarkSpMV( Benchmark & benchmark,
    resultcuSPARSEDeviceVector2 = deviceVector2;
 #endif
    
//#ifdef COMPARE_RESULTS
    // Difference between GPU (current format) and GPU-cuSPARSE results
    Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
    Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
@@ -274,15 +263,6 @@ benchmarkSpMV( Benchmark & benchmark,
    std::cout << GPUcuSparse_absMax << std::endl;
    std::cout << GPUcuSparse_lpNorm << std::endl;
    
    // FIXME: This isn't an elegant solution, it makes the log file very long.
//    benchmark.addErrorMessage( GPUcuSparse_absMax, 1 );
//    benchmark.addErrorMessage( GPUcuSparse_lpNorm, 1 );
    
//    benchmark.addErrorMessage( CPUxGPU_absMax, 1 );
//    benchmark.addErrorMessage( CPUxGPU_lpNorm, 1 );
    
//#endif
    
    std::cout << std::endl;
    return true;
}
@@ -295,15 +275,14 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
                        bool verboseMR )
{
   bool result = true;
   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
//   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
//   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
//   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
//   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
   
   // AdEllpack/BiEllpack doesn't have cross-device assignment ('= operator') implemented yet
   result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
//   result |= benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
   
   // AdEllpack is broken
//   result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
   result |= benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
   return result;
}

+4 −5
Original line number Diff line number Diff line
@@ -81,13 +81,12 @@ public:
            std::cout << "HEAD==TAIL" << std::endl;
        else
        {
            // TEST
            for( warpInfo< MatrixType >* i = this->getHead(); i != this->getTail()->next; i = i->next )
            {
                if( i == this->getHead() );
//                    std::cout << "Head:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
                else if( i == this->getTail() );
//                    std::cout << "Tail:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
                if( i == this->getHead() )
                    std::cout << "Head:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
                else if( i == this->getTail() )
                    std::cout << "Tail:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
                else
                    std::cout << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
            }
+7 −37
Original line number Diff line number Diff line
@@ -147,10 +147,6 @@ warpList< MatrixType >::~warpList()
        delete temp;
    }
    delete this->head;
    
    // TEST
//    std::cout << "List destructor." << std::endl;
//    this->printList();
}


@@ -1169,46 +1165,31 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,

    IndexType i = 0;
    IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
    // Save the value, to save calling access every loop.
    const IndexType warpLoad = this->localLoad[ warpIdx ];
    
    // The unroll factor is 4, therefore if a warp has less than 4 localLoad, it cannot be unrolled
    //  and must be calculated separately.
    if( warpLoad < 4 )
    {
        // While the helpful index of the warp localLoad is less than localLoad and the element index isn't
        //  out of the matrix (would return the number of columns of the matrix)
        while( i < warpLoad &&
               this->columnIndexes[ elementPtr ] < this->getColumns() )
        {
            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
            // For the current thread, shift the elements ptr by warpSize (to keep the thread on one row)
            elementPtr += this->warpSize;
            i++; // Increment the helpful localLoad index.
            i++;
        }
    }
    else // If the localLoad of the warp is unrollable.
    else
    {
        // Is the warpLoad divisible by 4 (4 - 1 for binary AND).
        //  This will return how far it is from being divisible:
        //  For 0 & 3 = 0; 1 & 3 = 1; 2 & 3 = 2; 3 & 3 = 3; 4 & 3 = 0, etc.
        IndexType alignUnroll = warpLoad & 3;
        
        // While the result of divisibility by 4 has not reached the point where it is divisible by 4.
        while( alignUnroll != 0 &&
               this->columnIndexes[ elementPtr ] < this->getColumns() )
        {        
                temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
                elementPtr += this->warpSize;
                i++;
                // If alignUnroll is not 0 (i.e. the number of NNZ elements is not divisible by 4), decrement alignUnroll until it is.
                //  This ensures that the starting index i will be incremented to the correct starting position for the unroll.
                alignUnroll--;
        }
    }

    // For those rows that have warpLoad < unroll factor, this for loop won't even get past the first condition.
    //  Otherwise unroll.
    for( ; i < this->localLoad[ warpIdx ]; i += 4 )
    {
        #pragma unroll
@@ -1222,7 +1203,6 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
        }
    }
    
    // What is going on here? DOCUMENT
    if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
    {
        IndexType end = ( warpIdx + 1 ) << 5;
@@ -1265,15 +1245,6 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
    IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
    const IndexType warpLoad = this->localLoad[ warpIdx ];
    
//    for( IndexType i = 0; i < warpLoad; i++ )
//    {
//        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
//        {
//            temp[ threadIdx.x ] += this->values[ elementPtr] * inVector[ this->columnIndexes[ elementPtr ] ];
//            elementPtr += this->warpSize;
//        }
//    }
    
    if( warpLoad < 8 )
    {
        while( i < warpLoad &&
@@ -1496,7 +1467,6 @@ public:
	InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
	OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
        TNL_CHECK_CUDA_DEVICE;
        std::cout << "totalLoad = " << matrix.totalLoad << std::endl;
	if( matrix.totalLoad < 2 )
	{
	    dim3 blockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
@@ -1520,7 +1490,7 @@ public:
	    Cuda::freeFromDevice( kernel_outVector );
	    TNL_CHECK_CUDA_DEVICE;
	}
	else if( matrix.totalLoad < 4 ) // WORKS
	else if( matrix.totalLoad < 4 )
	{
	    dim3 blockSize( 192 ), cudaGridSize( Cuda::getMaxGridSize() );
	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1543,7 +1513,7 @@ public:
	    Cuda::freeFromDevice( kernel_outVector );
	    TNL_CHECK_CUDA_DEVICE;
	}
	else if( matrix.totalLoad < 8 ) // Maybe works?
	else if( matrix.totalLoad < 8 )
	{
	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1566,7 +1536,7 @@ public:
	    Devices::Cuda::freeFromDevice( kernel_outVector );
	    TNL_CHECK_CUDA_DEVICE;
	}
	else if( matrix.totalLoad < 16 ) // BROKEN
	else if( matrix.totalLoad < 16 )
	{
	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1589,7 +1559,7 @@ public:
	    Cuda::freeFromDevice( kernel_outVector );
	    TNL_CHECK_CUDA_DEVICE;
	}
	else // BROKEN
	else
	{
	    dim3 blockSize( 96 ), cudaGridSize( Cuda::getMaxGridSize() );
	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1606,7 +1576,7 @@ public:
                                                       kernel_outVector,
                                                       gridIdx );
	    }
	    TNL_CHECK_CUDA_DEVICE; // FREEZES right here on CHECK CUDA
	    TNL_CHECK_CUDA_DEVICE;
	    Devices::Cuda::freeFromDevice( kernel_this );
	    Devices::Cuda::freeFromDevice( kernel_inVector );
	    Devices::Cuda::freeFromDevice( kernel_outVector );
+3 −4
Original line number Diff line number Diff line
@@ -802,7 +802,6 @@ template< typename Real,
void BiEllpack< Real, Device, Index >::printValues() const
{
    for( Index i = 0; i < this->values.getSize(); i++ ) {
    // Random values are stored with the column index of getColumns(). e.g. a matrix has 4 columns, values are at column indexes 0, 1, 2, 3 and junk data at index 4.
        if( this->columnIndexes.getElement( i ) != this->getColumns() )
            std::cout << "values.getElement( " << i << " ) = " << this->values.getElement( i ) 
             << "\tcolumnIndexes.getElement( " << i << " ) = " << this->columnIndexes.getElement( i ) << std::endl;
+2 −42
Original line number Diff line number Diff line
@@ -124,41 +124,8 @@ template< typename Real,
Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) const
{
    // TODO: Fix/Implement
    throw Exceptions::NotImplementedError( "CSR::getNonZeroRowLength is not implemented." );
//    if( std::is_same< DeviceType, Devices::Host >::value )
//    {
//       ConstMatrixRow matrixRow = this->getRow( row );
//       return matrixRow.getNonZeroElementsCount();
//    }
//    if( std::is_same< DeviceType, Devices::Cuda >::value )
//    {
//       IndexType *cols = new IndexType[4];
//       RealType *vals = new RealType[4];
//       for( int i = 0; i < 4; i++ )
//       {
//           cols[i] = i;
//           vals[i] = 1.0;
//       }
//       ConstMatrixRow matrixRow(cols, vals, 4, 1);
// //      ConstMatrixRow matrixRow = this->getRow( row );// If the program even compiles, this line fails because a segfault is thrown on the first line of getRow()
//       // WHEN debugging with GDB:
//       //  (gdb) p this->rowPointers[0]
//       //    Could not find operator[].
//       //  (gdb) p rowPointers.getElement(0)
//       //    Attempt to take address of value not located in memory.
//       IndexType resultHost ( 0 );
//       IndexType* resultCuda = Cuda::passToDevice( resultHost );
//       // PROBLEM: If the second parameter of getNonZeroRowLengthCudaKernel is '&resultCuda', the following issue is thrown:
//       //          'error: no instance of function template "TNL::Matrices::getNonZeroRowLengthCudaKernel" matches the argument list'
//       TNL::Matrices::getNonZeroRowLengthCudaKernel< ConstMatrixRow, IndexType ><<< 1, 1 >>>( matrixRow, resultCuda ); // matrixRow works fine, tested them both separately
//       delete []cols;
//       delete []vals;
//       std::cout << "Checkpoint BEFORE passFromDevice" << std::endl;
//       resultHost = Cuda::passFromDevice( resultCuda ); // This causes a crash: Illegal memory address in Cuda_impl.h at TNL_CHECK_CUDA_DEVICE
//       std::cout << "Checkpoint AFTER passFromDevice" << std::endl;
//       Cuda::freeFromDevice( resultCuda );
//       return resultHost;
//   }
    TNL_ASSERT( false, std::cerr << "TODO: Fix/Implement" );
    return 0;
}

template< typename Real,
@@ -223,13 +190,6 @@ bool CSR< Real, Device, Index >::addElementFast( const IndexType row,
                                                          const RealType& value,
                                                          const RealType& thisElementMultiplicator )
{
   /*TNL_ASSERT( row >= 0 && row < this->rows &&
              column >= 0 && column <= this->rows,
              std::cerr << " row = " << row
                   << " column = " << column
                   << " this->rows = " << this->rows
                   << " this->columns = " << this-> columns );*/

   IndexType elementPtr = this->rowPointers[ row ];
   const IndexType rowEnd = this->rowPointers[ row + 1 ];
   IndexType col = 0;
Loading