Loading src/core/mfuncs.h +23 −4 Original line number Diff line number Diff line Loading @@ -21,24 +21,40 @@ #include <math.h> #include <stdlib.h> template< typename Type1, typename Type2 > Type1 Min( const Type1& a, const Type2& b ) template< typename Type1, typename Type2 > #ifdef HAVE_CUDA __device__ __host__ #endif Type1 Min( const Type1& a, const Type2& b ) { return a < b ? a : b; }; template< typename Type1, typename Type2 > Type1 Max( const Type1& a, const Type2& b ) template< typename Type1, typename Type2 > #ifdef HAVE_CUDA __device__ __host__ #endif Type1 Max( const Type1& a, const Type2& b ) { return a > b ? a : b; }; template< typename Type > void Swap( Type& a, Type& b ) template< typename Type > #ifdef HAVE_CUDA __device__ __host__ #endif void Swap( Type& a, Type& b ) { Type tmp( a ); a = b; b = tmp; }; template< class T > T Sign( const T& a ) template< class T > #ifdef HAVE_CUDA __device__ __host__ #endif T Sign( const T& a ) { if( a < ( T ) 0 ) return -1; if( a == ( T ) 0 ) return 0; Loading @@ -46,6 +62,9 @@ template< class T > T Sign( const T& a ) }; template< class T > #ifdef HAVE_CUDA __device__ __host__ #endif T tnlAbs( const T& n ) { if( n < ( T ) 0 ) Loading src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h +17 −1 Original line number Diff line number Diff line Loading @@ -618,7 +618,20 @@ __global__ void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKerne const typename tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >::RowLengthsVector* rowLengths, int gridIdx ) { const Index sliceIdx = gridIdx * tnlCuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; Index rowIdx = sliceIdx * SliceSize; Index rowInSliceIdx( 0 ); Index maxRowLength( 0 ); while( rowInSliceIdx < SliceSize && rowIdx < matrix->getRows() ) { printf( "sliceIdx = %d rowInSliceIdx = %d SliceSize = %d rowIdx = %d matrix->getRows() = %d \n", sliceIdx, rowInSliceIdx, SliceSize, rowIdx, matrix->getRows() ); maxRowLength = Max( maxRowLength, rowLengths[ rowIdx ] ); printf( "threadIdx.x = %d, maxRowLength = %d \n", threadIdx.x, maxRowLength ); rowIdx++; rowInSliceIdx++; } //matrix->sliceRowLengths[ sliceIdx ] = maxRowLength; //matrix->slicePointers[ sliceIdx ] = maxRowLength * SliceSize; } #endif Loading Loading @@ -679,6 +692,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); const Index cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); cout << rowLengths << endl; for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) Loading @@ -691,6 +705,8 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > tnlCuda::freeFromDevice( kernel_matrix ); tnlCuda::freeFromDevice( kernel_rowLengths ); checkCudaDevice; cout << rowLengths << endl; cout << matrix.slicePointers << endl << matrix.sliceRowLengths << endl; #endif } }; Loading src/matrices/tnlSlicedEllpackMatrix.h +21 −0 Original line number Diff line number Diff line Loading @@ -28,6 +28,21 @@ template< typename Real = double, typename Device = tnlHost, typename Index = int, int SliceSize = 32 > class tnlSlicedEllpackMatrix; #ifdef HAVE_CUDA template< typename Real, typename Index, int SliceSize > __global__ void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKernel( tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >* matrix, const typename tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >::RowLengthsVector* rowLengths, int gridIdx ); #endif template< typename Real, typename Device, typename Index, int SliceSize > class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > { public: Loading Loading @@ -178,6 +193,12 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > typedef tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode; friend class tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType >; #ifdef HAVE_CUDA friend void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize >( tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >* matrix, const typename tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >::RowLengthsVector* rowLengths, int gridIdx ); #endif }; Loading Loading
src/core/mfuncs.h +23 −4 Original line number Diff line number Diff line Loading @@ -21,24 +21,40 @@ #include <math.h> #include <stdlib.h> template< typename Type1, typename Type2 > Type1 Min( const Type1& a, const Type2& b ) template< typename Type1, typename Type2 > #ifdef HAVE_CUDA __device__ __host__ #endif Type1 Min( const Type1& a, const Type2& b ) { return a < b ? a : b; }; template< typename Type1, typename Type2 > Type1 Max( const Type1& a, const Type2& b ) template< typename Type1, typename Type2 > #ifdef HAVE_CUDA __device__ __host__ #endif Type1 Max( const Type1& a, const Type2& b ) { return a > b ? a : b; }; template< typename Type > void Swap( Type& a, Type& b ) template< typename Type > #ifdef HAVE_CUDA __device__ __host__ #endif void Swap( Type& a, Type& b ) { Type tmp( a ); a = b; b = tmp; }; template< class T > T Sign( const T& a ) template< class T > #ifdef HAVE_CUDA __device__ __host__ #endif T Sign( const T& a ) { if( a < ( T ) 0 ) return -1; if( a == ( T ) 0 ) return 0; Loading @@ -46,6 +62,9 @@ template< class T > T Sign( const T& a ) }; template< class T > #ifdef HAVE_CUDA __device__ __host__ #endif T tnlAbs( const T& n ) { if( n < ( T ) 0 ) Loading
src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h +17 −1 Original line number Diff line number Diff line Loading @@ -618,7 +618,20 @@ __global__ void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKerne const typename tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >::RowLengthsVector* rowLengths, int gridIdx ) { const Index sliceIdx = gridIdx * tnlCuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; Index rowIdx = sliceIdx * SliceSize; Index rowInSliceIdx( 0 ); Index maxRowLength( 0 ); while( rowInSliceIdx < SliceSize && rowIdx < matrix->getRows() ) { printf( "sliceIdx = %d rowInSliceIdx = %d SliceSize = %d rowIdx = %d matrix->getRows() = %d \n", sliceIdx, rowInSliceIdx, SliceSize, rowIdx, matrix->getRows() ); maxRowLength = Max( maxRowLength, rowLengths[ rowIdx ] ); printf( "threadIdx.x = %d, maxRowLength = %d \n", threadIdx.x, maxRowLength ); rowIdx++; rowInSliceIdx++; } //matrix->sliceRowLengths[ sliceIdx ] = maxRowLength; //matrix->slicePointers[ sliceIdx ] = maxRowLength * SliceSize; } #endif Loading Loading @@ -679,6 +692,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); const Index cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); cout << rowLengths << endl; for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) Loading @@ -691,6 +705,8 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > tnlCuda::freeFromDevice( kernel_matrix ); tnlCuda::freeFromDevice( kernel_rowLengths ); checkCudaDevice; cout << rowLengths << endl; cout << matrix.slicePointers << endl << matrix.sliceRowLengths << endl; #endif } }; Loading
src/matrices/tnlSlicedEllpackMatrix.h +21 −0 Original line number Diff line number Diff line Loading @@ -28,6 +28,21 @@ template< typename Real = double, typename Device = tnlHost, typename Index = int, int SliceSize = 32 > class tnlSlicedEllpackMatrix; #ifdef HAVE_CUDA template< typename Real, typename Index, int SliceSize > __global__ void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKernel( tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >* matrix, const typename tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >::RowLengthsVector* rowLengths, int gridIdx ); #endif template< typename Real, typename Device, typename Index, int SliceSize > class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > { public: Loading Loading @@ -178,6 +193,12 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > typedef tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode; friend class tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType >; #ifdef HAVE_CUDA friend void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize >( tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >* matrix, const typename tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >::RowLengthsVector* rowLengths, int gridIdx ); #endif }; Loading