Commit 0fb43230 authored by Tomáš Oberhuber
Browse files

Implementing the Chunked Ellpack format in CUDA.

parent 83d48654
Loading
Loading
Loading
Loading
+449 −102

File changed.

Preview size limit exceeded, changes collapsed.

+7 −0
Original line number Diff line number Diff line
@@ -111,5 +111,12 @@ bool tnlSparseMatrix< Real, Device, Index >::allocateMatrixElements( const Index
   return true;
}

// Placeholder implementation of printStructure for the generic sparse matrix.
// The body is empty, so nothing is written to `str`.
// NOTE(review): presumably concrete formats are expected to supply their own
// version (tnlChunkedEllpackMatrix declares one) — confirm the intent.
template< typename Real,
          typename Device,
          typename Index >
void tnlSparseMatrix< Real, Device, Index >::printStructure( ostream& str ) const
{

}

#endif /* TNLSPARSEMATRIX_IMPL_H_ */
+79 −12
Original line number Diff line number Diff line
@@ -42,6 +42,16 @@ struct tnlChunkedEllpackSliceInfo
   { return tnlString( "tnlChunkedEllpackSliceInfo" ); };
};

#ifdef HAVE_CUDA
// Forward declaration of the CUDA kernel for the Chunked Ellpack matrix-vector
// product. It is declared ahead of the class so that the class can name it in
// a friend declaration. Only compiled when HAVE_CUDA is defined.
//
// matrix    - pointer to the CUDA-device instantiation of the matrix
// inVector  - pointer to the input vector (read-only)
// outVector - pointer to the vector receiving the result
// gridIdx   - presumably the index of the current grid when one launch is
//             split into several grids — TODO confirm against the launcher
// NOTE(review): all three pointers are presumably device-side copies of the
// host objects; the definition and launch site are not visible here.
template< typename Real,
          typename Index,
          typename Vector >
__global__ void tnlChunkedEllpackMatrixVectorProductCudaKernel( const tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
                                                                const Vector* inVector,
                                                                Vector* outVector,
                                                                int gridIdx );
#endif

template< typename Real, typename Device, typename Index >
class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
{
@@ -85,6 +95,11 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >

   IndexType getDesiredChunkSize() const;

#ifdef HAVE_CUDA
   __device__ __host__
#endif
   // Accessor callable from both host and device code when HAVE_CUDA is defined.
   // Presumably returns the numberOfSlices member — the implementation is not
   // visible here, confirm.
   IndexType getNumberOfSlices() const;

#ifdef HAVE_CUDA
   __device__ __host__
#endif
@@ -165,6 +180,14 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
   typename Vector::RealType rowVectorProduct( const IndexType row,
                                               const Vector& vector ) const;

#ifdef HAVE_CUDA
   // Device-only part of the matrix-vector product; exists only in CUDA builds.
   // Takes the same vector pointers and gridIdx as
   // tnlChunkedEllpackMatrixVectorProductCudaKernel.
   // NOTE(review): presumably invoked by that kernel, with each thread block
   // handling one slice — the implementation is not visible here, confirm.
   template< typename Vector >
   __device__ void computeSliceVectorProduct( const Vector* inVector,
                                              Vector* outVector,
                                              int gridIdx  ) const;
#endif


   // Matrix-vector product entry point: reads inVector and writes the result
   // into outVector. Has no CUDA execution-space qualifier, so it is called
   // from host code.
   // NOTE(review): presumably dispatches to a device-specific implementation
   // (e.g. the CUDA kernel declared above the class) — confirm in the _impl.h.
   template< typename Vector >
   void vectorProduct( const Vector& inVector,
                       Vector& outVector ) const;
@@ -194,11 +217,12 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >

   // Writes the matrix to the stream `str`; the output format is defined by
   // the implementation, which is not visible here.
   void print( ostream& str ) const;

   // Format-specific counterpart of tnlSparseMatrix::printStructure.
   // NOTE(review): the base-class declaration shown in this commit is not
   // marked virtual, so unless it is virtual further up the hierarchy this
   // declaration hides the base version rather than overriding it — confirm.
   void printStructure( ostream& str ) const;

   protected:


   // Works from host-side row lengths (the vector type is fixed to tnlHost).
   // NOTE(review): two signatures are listed. The first passes the slice count
   // out through a reference; the second takes no such argument, which matches
   // numberOfSlices now being a class member. Presumably the first form is the
   // pre-change declaration and only the second is meant to remain — confirm.
   void resolveSliceSizes( const tnlVector< Index, tnlHost, Index >& rowLengths,
                           IndexType& numberOfSlices );
   void resolveSliceSizes( const tnlVector< Index, tnlHost, Index >& rowLengths );

   bool setSlice( const RowLengthsVector& rowLengths,
                  const IndexType sliceIdx,
@@ -211,6 +235,9 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                           RealType& value,
                           RealType& thisElementMultiplicator );

#ifdef HAVE_CUDA
   __device__ __host__
#endif
   bool addElementToChunkFast( const IndexType sliceOffset,
                               const IndexType chunkIndex,
                               const IndexType chunkSize,
@@ -218,18 +245,60 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                               RealType& value,
                               RealType& thisElementMultiplicator );

#ifdef HAVE_CUDA
   __device__ __host__
#endif
   // Host/device-callable variant of setChunk (same parameter list).
   // The chunk is addressed by sliceOffset, chunkIndex and chunkSize; the data
   // comes from the read-only arrays columnIndexes and values.
   // NOTE(review): `elements` is presumably the number of entries to take from
   // the two arrays, and the "Fast" variant presumably accesses storage
   // directly without device-aware transfers — confirm in the implementation.
   void setChunkFast( const IndexType sliceOffset,
                      const IndexType chunkIndex,
                      const IndexType chunkSize,
                      const IndexType* columnIndexes,
                      const RealType* values,
                      const IndexType elements );

   // Host-only variant (no CUDA qualifiers) that stores entries into one chunk.
   // The chunk is addressed by sliceOffset, chunkIndex and chunkSize; the data
   // comes from the read-only arrays columnIndexes and values.
   // NOTE(review): `elements` is presumably the number of entries to take from
   // the two arrays — confirm in the implementation.
   void setChunk( const IndexType sliceOffset,
                  const IndexType chunkIndex,
                  const IndexType chunkSize,
                  const IndexType* columnIndexes,
                  const RealType* values,
                  const IndexType elements );

   // Host-only lookup of the entry with the given column inside one chunk.
   // `value` is an output reference.
   // NOTE(review): the bool result presumably reports whether the column was
   // found in the chunk — confirm in the implementation.
   bool getElementInChunk( const IndexType sliceOffset,
                           const IndexType chunkIndex,
                           const IndexType chunkSize,
                           const IndexType column,
                           RealType& value ) const;

#ifdef HAVE_CUDA
   __device__ __host__
#endif
   // Host/device-callable variant of getElementInChunk (same parameter list).
   // `value` is an output reference.
   // NOTE(review): the bool result presumably reports whether the column was
   // found in the chunk — confirm in the implementation.
   bool getElementInChunkFast( const IndexType sliceOffset,
                               const IndexType chunkIndex,
                               const IndexType chunkSize,
                               const IndexType column,
                               RealType& value ) const;

   // Host-only read of one chunk. `columns` and `values` are non-const pointers
   // in a const method, so they act as output buffers supplied by the caller.
   // NOTE(review): the buffers presumably need room for chunkSize entries —
   // confirm in the implementation.
   void getChunk( const IndexType sliceOffset,
                  const IndexType chunkIndex,
                  const IndexType chunkSize,
                  IndexType* columns,
                  RealType* values ) const;

#ifdef HAVE_CUDA
   __device__ __host__
#endif
   // Host/device-callable variant of getChunk (same parameter list).
   // `columns` and `values` are caller-supplied output buffers.
   // NOTE(review): the buffers presumably need room for chunkSize entries —
   // confirm in the implementation.
   void getChunkFast( const IndexType sliceOffset,
                      const IndexType chunkIndex,
                      const IndexType chunkSize,
                      IndexType* columns,
                      RealType* values ) const;

   // Host-only helper returning a value of the vector's RealType computed from
   // one chunk (addressed by sliceOffset, chunkIndex, chunkSize) and `vector`.
   // NOTE(review): presumably the partial dot product of the chunk's entries
   // with the matching vector components — confirm in the implementation.
   template< typename Vector >
   typename Vector::RealType chunkVectorProduct( const IndexType sliceOffset,
                                                 const IndexType chunkIndex,
                                                 const IndexType chunkSize,
                                                 const Vector& vector ) const;



   // Format parameters. Presumably the number of chunks per slice and the
   // target chunk size (cf. getDesiredChunkSize()) — confirm.
   IndexType chunksInSlice, desiredChunkSize;

@@ -237,22 +306,20 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >

   // Array of ChunkedEllpackSliceInfo records allocated on `Device`;
   // presumably one record per slice — confirm.
   tnlArray< ChunkedEllpackSliceInfo, Device, Index > slices;

   //IndexType numberOfSlices;
   // Slice count stored as a member (the line above is the earlier,
   // commented-out declaration). Presumably what getNumberOfSlices() returns
   // — confirm.
   IndexType numberOfSlices;

   // Shorthand for the helper class specialized on this matrix's device type.
   typedef tnlChunkedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode;
   // The helper and both the host and CUDA instantiations of this matrix get
   // access to non-public members.
   // NOTE(review): presumably needed so one instantiation can copy from the
   // other across devices — confirm.
   friend class tnlChunkedEllpackMatrixDeviceDependentCode< DeviceType >;
   friend class tnlChunkedEllpackMatrix< RealType, tnlHost, IndexType >;
   friend class tnlChunkedEllpackMatrix< RealType, tnlCuda, IndexType >;

/*#ifdef HAVE_CUDA
   friend void tnlChunkedEllpackMatrix_setSlices_CudaKernel< Real, Index, 256 >( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
                                                                                 const RowLengthsVector* rowLengths,
                                                                                 const Index numberOfSlices,
                                                                                 Index* elementsToAllocation,
                                                                                 const Index gridIdx );
#endif*/


#ifdef HAVE_CUDA
   // Grants the matrix-vector product kernel access to non-public members.
   // NOTE(review): this friend is a template over `Vector` only and has no
   // __global__ qualifier, whereas the kernel forward-declared above the class
   // is a __global__ template over Real, Index and Vector. As written this
   // appears to befriend a different function template than that kernel —
   // confirm whether the kernel actually needs (and gets) friend access.
   template< typename Vector >
   friend void tnlChunkedEllpackMatrixVectorProductCudaKernel( const tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
                                                               const Vector* inVector,
                                                               Vector* outVector,
                                                               int gridIdx );
#endif
};

#include <implementation/matrices/tnlChunkedEllpackMatrix_impl.h>
+3 −0
Original line number Diff line number Diff line
@@ -49,10 +49,13 @@ class tnlSparseMatrix : public tnlMatrix< Real, Device, Index >

   bool load( tnlFile& file );

   // Intended to print the matrix structure to `str`. The generic definition
   // added in this commit has an empty body, so by itself it writes nothing.
   // NOTE(review): not declared virtual here; derived formats that declare
   // their own printStructure hide this one unless it is virtual in a base
   // class that is not visible — confirm.
   void printStructure( ostream& str ) const;

   protected:

   // Allocates storage for the given number of matrix elements; returns a bool
   // status (the visible tail of the definition returns true).
   // NOTE(review): which arrays are resized is not visible here — confirm.
   bool allocateMatrixElements( const IndexType& numberOfMatrixElements );


   // Index vector stored on `Device`; presumably holds the column index of
   // each stored matrix element — confirm.
   tnlVector< Index, Device, Index > columnIndexes;
};

+1 −1
Original line number Diff line number Diff line
@@ -27,7 +27,7 @@
int main( int argc, char* argv[] )
{
   tnlSparseMatrixTester< tnlChunkedEllpackMatrix< float, tnlHost, int >, tnlChunkedEllpackMatrixTestSetup< 4, 2 > > tester;
   //tester.setElementTest();
   //tester. vectorProduct_DiagonalMatrixTest();

#ifdef HAVE_CPPUNIT
   if( ! tnlUnitTestStarter :: run< tnlSparseMatrixTester< tnlChunkedEllpackMatrix< float, tnlHost, int >, tnlChunkedEllpackMatrixTestSetup< 4, 2 > > >() ||
Loading