Commit d5668ae4 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing SlicedEllpack format in CUDA.

parent e0af631b
Loading
Loading
Loading
Loading
+9 −8
Original line number Diff line number Diff line
@@ -224,11 +224,11 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType,
                                   input,
                                   output,
                                   auxArray1 );
   cudaError error = cudaGetLastError();
   if( error != cudaSuccess )
   if( ! checkCudaDevice )
   {
      cerr << "The CUDA kernel 'cudaFirstPhaseBlockPrefixSum' ended with error code " << error << "."
           << endl;
      cerr << "The CUDA kernel 'cudaFirstPhaseBlockPrefixSum' ended with error." << endl;
      cudaFree( auxArray1 );
      cudaFree( auxArray2 );
      return false;
   }

@@ -251,11 +251,12 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType,
   cudaSecondPhaseBlockPrefixSum< DataType, Operation, Index >
                                <<< cudaGridSize, cudaBlockSize >>>
                                 ( operation, size, elementsInBlock, gridShift, auxArray2, output );
   error = cudaGetLastError();
   if( error != cudaSuccess )

   if( ! checkCudaDevice )
   {
      cerr << "The CUDA kernel 'cudaSecondPhaseBlockPrefixSum' ended with error code " << error << "."
           << endl;
      cerr << "The CUDA kernel 'cudaSecondPhaseBlockPrefixSum' ended with error." << endl;
      cudaFree( auxArray1 );
      cudaFree( auxArray2 );
      return false;
   }
   cudaFree( auxArray1 );
+189 −94
Original line number Diff line number Diff line
@@ -76,18 +76,11 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::setRowLengths( co
       ! this->slicePointers.setSize( slices + 1 ) )
      return false;

   /****
    * Compute maximal row length in each slice
    */
   DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths );

   /****
    * Compute the slice pointers using the exclusive prefix sum
    */
   this->slicePointers.setElement( slices, 0 );
   this->slicePointers.computeExclusivePrefixSum();

   return this->allocateMatrixElements( this->slicePointers[ slices ] );
   return this->allocateMatrixElements( this->slicePointers.getElement( slices ) );
}

template< typename Real,
@@ -170,7 +163,7 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::setElementFast( c
                                                                               const IndexType column,
                                                                               const Real& value )
{
   return this->addElement( row, column, value, 0.0 );
   return this->addElementFast( row, column, value, 0.0 );
}

template< typename Real,
@@ -204,12 +197,10 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::addElementFast( c
                   << " this->rows = " << this->rows
                   << " this->columns = " << this-> columns );

   const IndexType sliceIdx = row / SliceSize;
   const IndexType rowLength = this->sliceRowLengths[ sliceIdx ];
   IndexType elementPtr = this->slicePointers[ sliceIdx ] +
                          rowLength * ( row - sliceIdx * SliceSize );
   const IndexType rowEnd( elementPtr + rowLength );
   while( elementPtr < rowEnd && this->columnIndexes[ elementPtr ] < column ) elementPtr++;
   Index elementPtr, rowEnd, step;
   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );

   while( elementPtr < rowEnd && this->columnIndexes[ elementPtr ] < column ) elementPtr += step;
   if( elementPtr == rowEnd )
      return false;
   if( this->columnIndexes[ elementPtr ] == column )
@@ -226,12 +217,12 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::addElementFast( c
      }
      else
      {
         IndexType j = rowEnd - 1;
         IndexType j = rowEnd - step;
         while( j > elementPtr )
         {
            this->columnIndexes[ j ] = this->columnIndexes[ j - 1 ];
            this->values[ j ] = this->values[ j - 1 ];
            j--;
            this->columnIndexes[ j ] = this->columnIndexes[ j - step ];
            this->values[ j ] = this->values[ j - step ];
            j -= step;
         }
         this->columnIndexes[ elementPtr ] = column;
         this->values[ elementPtr ] = value;
@@ -249,7 +240,45 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::addElement( const
                                                                           const RealType& value,
                                                                           const RealType& thisElementMultiplicator )
{
   return this->addElementFast( row, column, value, thisElementMultiplicator );
   tnlAssert( row >= 0 && row < this->rows &&
              column >= 0 && column <= this->rows,
              cerr << " row = " << row
                   << " column = " << column
                   << " this->rows = " << this->rows
                   << " this->columns = " << this-> columns );

   Index elementPtr, rowEnd, step;
   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );

   while( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) < column ) elementPtr += step;
   if( elementPtr == rowEnd )
      return false;
   if( this->columnIndexes.getElement( elementPtr ) == column )
   {
      this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value );
      return true;
   }
   else
      if( this->columnIndexes.getElement( elementPtr ) == this->columns )
      {
         this->columnIndexes.setElement( elementPtr, column );
         this->values.setElement( elementPtr, value );
         return true;
      }
      else
      {
         IndexType j = rowEnd - step;
         while( j > elementPtr )
         {
            this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) );
            this->values.setElement( j, this->values.getElement( j - step ) );
            j -= step;
         }
         this->columnIndexes.setElement( elementPtr, column );
         this->values.setElement( elementPtr, value );
         return true;
      }
   return false;
}

template< typename Real,
@@ -269,16 +298,20 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > :: setRowFast( con
   if( elements > rowLength )
      return false;

   IndexType elementPointer = this->slicePointers[ sliceIdx ] +
                              rowLength * ( row - sliceIdx * SliceSize );
   Index elementPointer, rowEnd, step;
   DeviceDependentCode::initRowTraverseFast( *this, row, elementPointer, rowEnd, step );

   for( IndexType i = 0; i < elements; i++ )
   {
      this->columnIndexes[ elementPointer ] = columnIndexes[ i ];
      this->values[ elementPointer ] = values[ i ];
      elementPointer++;
      elementPointer += step;
   }
   for( IndexType i = elements; i < rowLength; i++ )
      this->columnIndexes[ elementPointer++ ] = this->getColumns();
   {
      this->columnIndexes[ elementPointer ] = this->getColumns();
      elementPointer += step;
   }
   return true;
}

@@ -291,7 +324,26 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > :: setRow( const I
                                                                         const RealType* values,
                                                                         const IndexType elements )
{
   return this->setRowFast( row, columnIndexes, values, elements );
   const IndexType sliceIdx = row / SliceSize;
   const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx );
   if( elements > rowLength )
      return false;

   Index elementPointer, rowEnd, step;
   DeviceDependentCode::initRowTraverseFast( *this, row, elementPointer, rowEnd, step );

   for( IndexType i = 0; i < elements; i++ )
   {
      this->columnIndexes.setElement( elementPointer, this->columnIndexes.getElement( i ) );
      this->values.setElement( elementPointer, this->values.getElement( i ) );
      elementPointer += step;
   }
   for( IndexType i = elements; i < rowLength; i++ )
   {
      this->columnIndexes.setElement( elementPointer, this->getColumns() );
      elementPointer += step;
   }
   return true;
}

template< typename Real,
@@ -335,13 +387,11 @@ template< typename Real,
Real tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::getElementFast( const IndexType row,
                                                                               const IndexType column ) const
{
   const IndexType sliceIdx = row / SliceSize;
   const IndexType rowLength = this->sliceRowLengths[ sliceIdx ];
   IndexType elementPtr = this->slicePointers[ sliceIdx ] +
                          rowLength * ( row - sliceIdx * SliceSize );
   const IndexType rowEnd = elementPtr + rowLength;
   Index elementPtr, rowEnd, step;
   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );

   while( elementPtr < rowEnd && this->columnIndexes[ elementPtr ] < column )
      elementPtr++;
      elementPtr += step;
   if( elementPtr < rowEnd && this->columnIndexes[ elementPtr ] == column )
      return this->values[ elementPtr ];
   return 0.0;
@@ -354,7 +404,15 @@ template< typename Real,
Real tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::getElement( const IndexType row,
                                                                           const IndexType column ) const
{
   return this->getElementFast( row, column );

   Index elementPtr, rowEnd, step;
   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );

   while( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) < column )
      elementPtr += step;
   if( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) == column )
      return this->values.getElement( elementPtr );
   return 0.0;
}

template< typename Real,
@@ -368,15 +426,15 @@ void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::getRowFast( const
                                                                           IndexType* columns,
                                                                           RealType* values ) const
{
   const IndexType sliceIdx = row / SliceSize;
   const IndexType rowLength = this->sliceRowLengths[ sliceIdx ];
   IndexType elementPtr = this->slicePointers[ sliceIdx ] +
                          rowLength * ( row - sliceIdx * SliceSize );
   for( IndexType i = 0; i < rowLength; i++ )
   Index elementPtr, rowEnd, step, i( 0 );
   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );

   while( elementPtr < rowEnd )
   {
      columns[ i ] = this->columnIndexes[ elementPtr ];
      values[ i ] = this->values[ elementPtr ];
      elementPtr++;
      elementPtr += step;
      i++;
   }
}

@@ -388,33 +446,55 @@ void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::getRow( const Ind
                                                                       IndexType* columns,
                                                                       RealType* values ) const
{
   return this->getRowFast( row, columns, values );
}
   Index elementPtr, rowEnd, step, i( 0 );
   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );

   while( elementPtr < rowEnd )
   {
      columns[ i ] = this->columnIndexes.getElement( elementPtr );
      values[ i ] = this->values.getElement( elementPtr );
      elementPtr += step;
      i++;
   }
}

template< typename Real,
          typename Device,
          typename Index,
          int SliceSize >
  template< typename Vector >
void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::vectorProduct( const Vector& inVector,
                                                                              Vector& outVector ) const
{
   for( Index row = 0; row < this->getRows(); row ++ )
#ifdef HAVE_CUDA
   __device__ __host__
#endif
typename Vector::RealType tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::rowVectorProduct( const IndexType row,
                                                                                                      const Vector& vector ) const
{
   Real result = 0.0;
      const IndexType sliceIdx = row / SliceSize;
      const IndexType rowLength = this->sliceRowLengths[ sliceIdx ];
      IndexType elementPtr = this->slicePointers[ sliceIdx ] +
                             rowLength * ( row - sliceIdx * SliceSize );
      const IndexType rowEnd( elementPtr + rowLength );
   Index elementPtr, rowEnd, step;
   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );

   while( elementPtr < rowEnd && this->columnIndexes[ elementPtr ] < this->columns )
   {
         const Index column = this->columnIndexes.getElement( elementPtr );
         result += this->values.getElement( elementPtr++ ) * inVector.getElement( column );
      const Index column = this->columnIndexes[ elementPtr ];
      result += this->values[ elementPtr ] * vector[ column ];
      elementPtr += step;
   }
      outVector.setElement( row, result );
   return result;
}

template< typename Real,
          typename Device,
          typename Index,
          int SliceSize >
   template< typename Vector >
void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::vectorProduct( const Vector& inVector,
                                                                              Vector& outVector ) const
{
   if( DeviceType::getDevice() == tnlHostDevice )
      for( Index row = 0; row < this->getRows(); row ++ )
         outVector[ row ] = this->rowVectorProduct( row, inVector);
   if( DeviceType::getDevice() == tnlCudaDevice )
      tnlMatrixVectorProductCuda( *this, inVector, outVector );
}

template< typename Real,
@@ -562,29 +642,40 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost >
      template< typename Real,
                typename Index,
                int SliceSize >
      static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                const Index row )
      static void initRowTraverse( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                   const Index row,
                                   Index& rowBegin,
                                   Index& rowEnd,
                                   Index& step )
      {
         return row * matrix.rowLengths;
      }
         const Index sliceIdx = row / SliceSize;
         const Index slicePointer = matrix.slicePointers.getElement( sliceIdx );
         const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx );

      template< typename Real,
                typename Index,
                int SliceSize >
      static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                const Index row )
      {
         return ( row + 1 ) * matrix.rowLengths;
         rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize );
         rowEnd = rowBegin + rowLength;
         step = 1;
      }

      template< typename Real,
                typename Index,
                int SliceSize >
      static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix )
      static void initRowTraverseFast( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                       const Index row,
                                       Index& rowBegin,
                                       Index& rowEnd,
                                       Index& step )
      {
         return 1;
         const Index sliceIdx = row / SliceSize;
         const Index slicePointer = matrix.slicePointers[ sliceIdx ];
         const Index rowLength = matrix.sliceRowLengths[ sliceIdx ];

         rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize );
         rowEnd = rowBegin + rowLength;
         step = 1;
      }


      template< typename Real,
                typename Index,
                int SliceSize >
@@ -607,6 +698,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost >
            matrix.sliceRowLengths.setElement( slice, sliceRowLength );
            matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize );
         }
         matrix.slicePointers.setElement( matrix.slicePointers.getSize() - 1, 0 );
      }
};

@@ -622,16 +714,18 @@ __global__ void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKerne
   Index rowIdx = sliceIdx * SliceSize;
   Index rowInSliceIdx( 0 );
   Index maxRowLength( 0 );
   if( rowIdx >= matrix->getRows() )
      return;
   while( rowInSliceIdx < SliceSize && rowIdx < matrix->getRows() )
   {
      printf( "sliceIdx = %d rowInSliceIdx = %d SliceSize = %d rowIdx = %d matrix->getRows() = %d \n", sliceIdx, rowInSliceIdx, SliceSize, rowIdx, matrix->getRows() );
      maxRowLength = Max( maxRowLength, rowLengths[ rowIdx ] );
      printf( "threadIdx.x = %d, maxRowLength = %d \n", threadIdx.x, maxRowLength );
      maxRowLength = Max( maxRowLength, ( *rowLengths )[ rowIdx ] );
      rowIdx++;
      rowInSliceIdx++;
   }
   //matrix->sliceRowLengths[ sliceIdx ] = maxRowLength;
   //matrix->slicePointers[ sliceIdx ] = maxRowLength * SliceSize;
   matrix->sliceRowLengths[ sliceIdx ] = maxRowLength;
   matrix->slicePointers[ sliceIdx ] = maxRowLength * SliceSize;
   if( threadIdx.x == 0 )
      matrix->slicePointers[ matrix->slicePointers.getSize() - 1 ] = 0;
}
#endif

@@ -645,25 +739,19 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
      template< typename Real,
                typename Index,
                int SliceSize >
#ifdef HAVE_CUDA
      __device__ __host__
#endif
      static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                const Index row )
      static void initRowTraverse( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                   const Index row,
                                   Index& rowBegin,
                                   Index& rowEnd,
                                   Index& step )
      {
         return row * matrix.rowLengths;
      }
         const Index sliceIdx = row / SliceSize;
         const Index slicePointer = matrix.slicePointers.getElement( sliceIdx );
         const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx );

      template< typename Real,
                typename Index,
                int SliceSize >
#ifdef HAVE_CUDA
      __device__ __host__
#endif
      static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                const Index row )
      {
         return ( row + 1 ) * matrix.rowLengths;
         rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize );
         rowEnd = rowBegin + rowLength;
         step = 1;
      }

      template< typename Real,
@@ -672,9 +760,19 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
#ifdef HAVE_CUDA
      __device__ __host__
#endif
      static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix )
      static void initRowTraverseFast( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                       const Index row,
                                       Index& rowBegin,
                                       Index& rowEnd,
                                       Index& step )
      {
         return 1;
         const Index sliceIdx = row / SliceSize;
         const Index slicePointer = matrix.slicePointers[ sliceIdx ];
         const Index rowLength = matrix.sliceRowLengths[ sliceIdx ];

         rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize );
         rowEnd = rowBegin + rowLength;
         step = 1;
      }

      template< typename Real,
@@ -692,7 +790,6 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
         dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
         const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
         const Index cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
         cout << rowLengths << endl;
         for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
         {
            if( gridIdx == cudaGrids - 1 )
@@ -705,8 +802,6 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
         tnlCuda::freeFromDevice( kernel_matrix );
         tnlCuda::freeFromDevice( kernel_rowLengths );
         checkCudaDevice;
         cout << rowLengths << endl;
         cout << matrix.slicePointers << endl << matrix.sliceRowLengths << endl;
#endif
      }
};
+44 −14
Original line number Diff line number Diff line
@@ -51,8 +51,6 @@ __global__ void tnlSparseMatrixTester__setElementFast_LowerTriangularMatrixTestC
template< typename MatrixType >
__global__ void tnlSparseMatrixTester__setElementFast_LowerTriangularMatrixTestCudaKernel2( MatrixType* matrix,
                                                                                            bool* testResult );


#endif


@@ -92,6 +90,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
      suiteOfTests->addTest( new TestCallerType( "setElementFast_DenseMatrixTest", &TesterType::setElementFast_DenseMatrixTest ) );
      suiteOfTests->addTest( new TestCallerType( "setElement_LowerTriangularMatrixTest", &TesterType::setElement_LowerTriangularMatrixTest ) );
      suiteOfTests->addTest( new TestCallerType( "setElementFast_LowerTriangularMatrixTest", &TesterType::setElementFast_LowerTriangularMatrixTest ) );
      suiteOfTests->addTest( new TestCallerType( "setRow_DiagonalMatrixTest", &TesterType::setRow_DiagonalMatrixTest ) );
      suiteOfTests->addTest( new TestCallerType( "addElementTest", &TesterType::addElementTest ) );
      suiteOfTests->addTest( new TestCallerType( "vectorProductTest", &TesterType::vectorProductTest ) );
      /*suiteOfTests -> addTest( new TestCallerType( "matrixTranspositionTest", &TesterType::matrixTranspositionTest ) );
@@ -497,6 +496,37 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
                  CPPUNIT_ASSERT( m.getElement( i, j ) == 0 );
   }

   void setRow_DiagonalMatrixTest()
   {
      MatrixType m;
      m.setDimensions( 10, 10 );
      IndexVector rowLengths;
      rowLengths.setSize( m.getRows() );
      rowLengths.setValue( 7 );
      m.setRowLengths( rowLengths );
      RealType values[ 1 ];
      IndexType columnIndexes[ 1 ];

      for( int i = 0; i < 10; i++ )
      {
         values[ 0 ] = i;
         columnIndexes[ 0 ] = i;
         m.setRow( i, columnIndexes, values, 1 );
      }

      for( int i = 0; i < 10; i++ )
      {
         for( int j = 0; j < 10; j++ )
         {
            if( i == j )
               CPPUNIT_ASSERT( m.getElement( i, j ) == i );
            else
               CPPUNIT_ASSERT( m.getElement( i, j ) == 0 );
         }
      }
   }


   void vectorProductTest()
   {
      const int size = 10;