From 5c960389ff45de4e2065a53afa38795bea5e6a0b Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Tue, 7 Apr 2020 15:59:38 +0200 Subject: [PATCH 01/57] Problem with function --- src/TNL/Matrices/Legacy/CSR_impl.h | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 6990d4072..8385aada7 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -1038,6 +1038,27 @@ class CSRDeviceDependentCode< Devices::Host > }; #ifdef HAVE_CUDA + +template< typename Real, + typename Index, + CSRKernel KernelType, + typename InVector, + typename OutVector, + int warpSize > +__global__ void CSRScalarGlobal( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix, + const InVector* inVector, + OutVector* outVector, + int gridIdx, + int *blocks, size_t size) +{ + const auto columns = matrix->getColumns(); // funguje + + // nefunguje + const auto &rowPointers = matrix->getRowPointers(); + const auto &columnIndexes = matrix->getColumnIndexes(); + const auto &values = matrix->getValues(); +} + template< typename Real, typename Index, CSRKernel KernelType, @@ -1099,8 +1120,13 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& //const int sharedMemory = cudaBlockSize.x * sizeof( Real ); //const int threads = cudaBlockSize.x; if( matrix.getCudaWarpSize() == 32 ) { - // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads); - CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 > + // CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 > + // <<< 2, 1024 >>> + // ( kernel_this, + // kernel_inVector, + // kernel_outVector, + // gridIdx, kernelBlocks, size ); + CSRScalarGlobal< Real, Index, KernelType, InVector, OutVector, 32 > <<< 2, 1024 >>> ( kernel_this, kernel_inVector, -- GitLab From 712abfc8a6b975184d15581bed3f373b67faac5c Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Sun, 12 Apr 2020 20:34:41 +0200 Subject: [PATCH 02/57] Changes to CSR SpMV functions --- src/TNL/Matrices/Legacy/CSR_impl.h | 396 +++++++++++++++++++---------- 1 file changed, 267 insertions(+), 129 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 8385aada7..ff8e57571 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -777,36 +777,6 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector& } } -/* template< typename Real, - typename Device, - typename Index, - typename InVector, - int warpSize > -__global__ -void spmvCSRVectorHelper( const InVector& inVector, - Real *out, - size_t from, - size_t to, - size_t perWarp) -{ - const size_t index = blockIdx.x * blockDim.x + threadIdx.x; - const size_t warpID = index / warpSize; - const size_t laneID = index % warpSize; - const size_t minID = from + warpID * perWarp; - size_t maxID = from + (warpID + 1) * perWarp; - if (minID >= to) return; - if (maxID >= to ) maxID = to; - - Real result = 0.0; - for (IndexType i = minID + laneID; i < maxID; i += warpSize) { - const IndexType column = this->columnIndexes[i]; - if (column >= this->getColumns()) - continue; - result += this->values[i] * inVector[column]; - } - atomicAdd(out, result); -} */ - template< typename Real, typename Device, typename Index, @@ -894,6 +864,138 @@ void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& in } } +// __global__ +// void spmvCSRVectorHelper() { + +// } + +template< typename Real, + typename Index, + typename InVector, + int warpSize > +__global__ +void spmvCSRVectorHelper( const InVector& inVector, + const int* columnIndexes, + const float *values, + const int getColumns, + Real *out, + size_t from, + size_t to, + size_t perWarp) +{ + const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t warpID = index / warpSize; + const size_t laneID = index % warpSize; + const size_t minID = from + warpID * perWarp; + size_t maxID = from + (warpID + 1) * perWarp; + if (minID >= to) return; + if (maxID >= to ) maxID = to; + + Real result = 0.0; + for (size_t i = minID + laneID; i < maxID; i += warpSize) { + const size_t column = columnIndexes[i]; + if (column >= getColumns) + continue; + result += values[i] * inVector[column]; + } + atomicAdd(out, result); +} + +template< typename Real, + typename Index, + typename InVector, + typename OutVector, + int warpSize > +__global__ +void SpMVCSRAdaptiveGlobal( const InVector& inVector, + OutVector& outVector, + const int* rowPointers, + const int* columnIndexes, + const float* values, + int *blocks, + size_t blocks_size, + Index getColumns + ) +{ + /* Configuration ---------------------------------------------------*/ + constexpr size_t SHARED = 49152/sizeof(float); + constexpr size_t SHARED_PER_WARP = SHARED / warpSize; + constexpr size_t MAX_PER_WARP = 65536; + constexpr size_t ELEMENTS_PER_WARP = 1024; + constexpr size_t THREADS_PER_BLOCK = 1024; + constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize; + //-------------------------------------------------------------------- + const Index index = blockIdx.x * blockDim.x + threadIdx.x; + const Index laneID = index % warpSize; + const Index blockIdx = index / warpSize; + __shared__ float shared_res[SHARED]; + float result = 0.0; + if (blockIdx >= blocks_size - 1) + return; + const Index minRow = blocks[blockIdx]; + const Index maxRow = blocks[blockIdx + 1]; + const Index minID = rowPointers[minRow]; + const Index maxID = rowPointers[maxRow]; + const Index elements = maxID - minID; + /* rows per block more than 1 */ + if ((maxRow - minRow) > 1) { + /////////////////////////////////////* CSR STREAM *////////////// + /* Copy and calculate elements from global to shared memory, coalesced */ + const Index offset = threadIdx.x / warpSize * SHARED_PER_WARP; + for (Index i = laneID; i < elements; i += warpSize) { + const Index elementIdx = i + minID; + const Index column = columnIndexes[elementIdx]; + if (column >= getColumns) + continue; + + shared_res[i + offset] = values[elementIdx] * inVector[column]; + } + + const Index row = minRow + laneID; + if (row >= maxRow) + return; + /* Calculate result */ + const Index to = rowPointers[row + 1] - minID; + for (Index i = rowPointers[row] - minID; i < to; ++i) { + result += shared_res[i + offset]; + } + outVector[row] = result; // Write result + } + else if (elements <= MAX_PER_WARP) { + /////////////////////////////////////* CSR VECTOR *////////////// + for (Index i = minID + laneID; i < maxID; i += warpSize) { + Index column = columnIndexes[i]; + if (column >= getColumns) + break; + + result += values[i] * inVector[column]; + } + /* Reduction */ + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); + if (laneID == 0) outVector[minRow] = result; // Write result + } + else { + /////////////////////////////////////* CSR VECTOR LONG *////////////// + const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1; + const size_t blocks = warps <= WARPS_PER_BLOCK ? 1 : warps / WARPS_PER_BLOCK + 1; + const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize; + spmvCSRVectorHelper <<>>( + inVector, + columnIndexes, + values, + getColumns, + &outVector[minRow], + (size_t)(minID + ELEMENTS_PER_WARP), + (size_t)maxID, + (size_t)ELEMENTS_PER_WARP + ); + } +} + template< typename Real, typename Device, @@ -951,8 +1053,7 @@ template< typename Real, __device__ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& inVector, OutVector& outVector, - int gridIdx, - int *blocks, size_t size ) const + int gridIdx ) const { switch( KernelType ) { @@ -966,7 +1067,9 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); break; case CSRAdaptive: - spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); + // spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); + /* FIXME */ + spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); break; case CSRStream: // TODO: @@ -1039,25 +1142,16 @@ class CSRDeviceDependentCode< Devices::Host > #ifdef HAVE_CUDA -template< typename Real, - typename Index, - CSRKernel KernelType, - typename InVector, - typename OutVector, - int warpSize > -__global__ void CSRScalarGlobal( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix, - const InVector* inVector, - OutVector* outVector, - int gridIdx, - int *blocks, size_t size) -{ - const auto columns = matrix->getColumns(); // funguje - - // nefunguje - const auto &rowPointers = matrix->getRowPointers(); - const auto &columnIndexes = matrix->getColumnIndexes(); - const auto &values = matrix->getValues(); -} +// template< typename Real, +// typename Index, +// CSRKernel KernelType, +// typename InVector, +// typename OutVector, +// int warpSize > +// __global__ +// void CSRScalarGlobal(const Containers::Vector< Index, Devices::Cuda, Index, Allocators::Cuda >* row) +// { +// } template< typename Real, typename Index, @@ -1066,10 +1160,9 @@ template< typename Real, typename OutVector, int warpSize > __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix, - const InVector* inVector, - OutVector* outVector, - int gridIdx, - int *blocks, size_t size) + const InVector* inVector, + OutVector* outVector, + int gridIdx) { typedef CSR< Real, Devices::Cuda, Index > Matrix; static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); @@ -1082,7 +1175,7 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde else { matrix->template vectorProductCuda< InVector, OutVector, warpSize > - ( *inVector, *outVector, gridIdx, blocks, size ); + ( *inVector, *outVector, gridIdx ); } } #endif @@ -1094,9 +1187,7 @@ template< typename Real, typename OutVector > void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix, const InVector& inVector, - OutVector& outVector, - int *blocks, - size_t size ) + OutVector& outVector) { #ifdef HAVE_CUDA typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix; @@ -1104,10 +1195,6 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - int *kernelBlocks; - cudaMalloc((void **)&kernelBlocks, sizeof(int) * size); - cudaMemcpy(kernelBlocks, blocks, size * sizeof(int), cudaMemcpyHostToDevice); - TNL_CHECK_CUDA_DEVICE; dim3 cudaBlockSize( 256 ); //dim3 cudaGridSize( Cuda::getMaxGridSize() ); @@ -1131,44 +1218,42 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& ( kernel_this, kernel_inVector, kernel_outVector, - gridIdx, kernelBlocks, size ); - } - // if( matrix.getCudaWarpSize() == 16 ) - // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 > - // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - // ( kernel_this, - // kernel_inVector, - // kernel_outVector, - // gridIdx, kernelBlocks, size ); - // if( matrix.getCudaWarpSize() == 8 ) - // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 > - // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - // ( kernel_this, - // kernel_inVector, - // kernel_outVector, - // gridIdx, kernelBlocks, size ); - // if( matrix.getCudaWarpSize() == 4 ) - // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 > - // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - // ( kernel_this, - // kernel_inVector, - // kernel_outVector, - // gridIdx, kernelBlocks, size ); - // if( matrix.getCudaWarpSize() == 2 ) - // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 > - // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - // ( kernel_this, - // kernel_inVector, - // kernel_outVector, - // gridIdx, kernelBlocks, size ); - // if( matrix.getCudaWarpSize() == 1 ) - // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 > - // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - // ( kernel_this, - // kernel_inVector, - // kernel_outVector, - // gridIdx, kernelBlocks, size ); - + gridIdx ); + if( matrix.getCudaWarpSize() == 16 ) + CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 16 > + <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( kernel_this, + kernel_inVector, + kernel_outVector, + gridIdx); + if( matrix.getCudaWarpSize() == 8 ) + CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 8 > + <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( kernel_this, + kernel_inVector, + kernel_outVector, + gridIdx); + if( matrix.getCudaWarpSize() == 4 ) + CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 4 > + <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( kernel_this, + kernel_inVector, + kernel_outVector, + gridIdx); + if( matrix.getCudaWarpSize() == 2 ) + CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 2 > + <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( kernel_this, + kernel_inVector, + kernel_outVector, + gridIdx); + if( matrix.getCudaWarpSize() == 1 ) + CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 1 > + <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( kernel_this, + kernel_inVector, + kernel_outVector, + gridIdx); } TNL_CHECK_CUDA_DEVICE; Cuda::freeFromDevice( kernel_this ); @@ -1296,36 +1381,89 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else - constexpr int SHARED = 49152/sizeof(float); - constexpr int SHARED_PER_WARP = SHARED / 32; - std::vector inBlock; - inBlock.push_back(0); - size_t sum = 0; - Index i; - int prev_i = 0; - for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { - size_t elements = matrix.getRowPointers().getElement(i) - - matrix.getRowPointers().getElement(i - 1); - sum += elements; - if (sum > SHARED_PER_WARP) { - if (i - prev_i == 1) { + // #ifdef HAVE_CUDA + // if (KernelType == CSRAdaptive) { + if (sizeof(Index) != 4) { + printf("Size of Index type is too small!\n"); + return; + } + + constexpr int SHARED = 49152/sizeof(float); + constexpr int SHARED_PER_WARP = SHARED / 32; + std::vector inBlock; + inBlock.push_back(0); + size_t sum = 0; + Index i; + int prev_i = 0; + for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { + size_t elements = matrix.getRowPointers().getElement(i) - + matrix.getRowPointers().getElement(i - 1); + sum += elements; + if (sum > SHARED_PER_WARP) { + if (i - prev_i == 1) { + inBlock.push_back(i); + } else { + inBlock.push_back(i - 1); + --i; + } + sum = 0; + prev_i = i; + continue; + } + if (i - prev_i == 32) { inBlock.push_back(i); - } else { - inBlock.push_back(i - 1); - --i; + prev_i = i; + sum = 0; } - sum = 0; - prev_i = i; - continue; - } - if (i - prev_i == 32) { - inBlock.push_back(i); - prev_i = i; - sum = 0; } - } - inBlock.push_back(matrix.getRowPointers().getSize() - 1); - CSRVectorProductCuda( matrix, inVector, outVector, inBlock.data(), inBlock.size() ); + inBlock.push_back(matrix.getRowPointers().getSize() - 1); + + const InVector *kernelInVector = Cuda::passToDevice( inVector ); + OutVector *kernelOutVector = Cuda::passToDevice( outVector ); + CSR< Real, Device, Index, KernelType >* kernel_this = Cuda::passToDevice( matrix ); + + /* blocks */ + int *kernelBlocks; + cudaMalloc((void **)&kernelBlocks, sizeof(int) * inBlock.size()); + cudaMemcpy(kernelBlocks, inBlock.data(), inBlock.size() * sizeof(int), cudaMemcpyHostToDevice); + + /* values */ + float *kernel_values; + cudaMalloc((void **)&kernel_values, sizeof(float) * matrix.getValues().getSize()); + cudaMemcpy(kernel_values, + (float *)matrix.getValues().getData(), + matrix.getValues().getSize() * sizeof(float), + cudaMemcpyHostToDevice); + + /* columns */ + int *kernel_columns; + cudaMalloc((void **)&kernel_columns, sizeof(int) * matrix.getColumnIndexes().getSize()); + cudaMemcpy(kernel_columns, + (int *)matrix.getColumnIndexes().getData(), + matrix.getColumnIndexes().getSize() * sizeof(int), + cudaMemcpyHostToDevice); + + /* row pointers */ + int *kernel_rowPointers; + cudaMalloc((void **)&kernel_rowPointers, sizeof(int) * matrix.getRowPointers().getSize()); + cudaMemcpy(kernel_rowPointers, + (int *)matrix.getRowPointers().getData(), + matrix.getRowPointers().getSize() * sizeof(int), + cudaMemcpyHostToDevice); + + SpMVCSRAdaptiveGlobal< Real, Index, InVector, OutVector, 32 ><<<2, 1024>>>( + *kernelInVector, + *kernelOutVector, + kernel_rowPointers, + kernel_columns, + kernel_values, + kernelBlocks, + inBlock.size(), + matrix.getColumns() + ); + // } else + // #endif /* HAVE_CUDA */ + // CSRVectorProductCuda( matrix, inVector, outVector); #endif } -- GitLab From 9a12aa3d1f1cc82be593d40398b9795e3c21f9bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 15 Apr 2020 18:27:26 +0200 Subject: [PATCH 03/57] Added libcudadevrt to fix problem with linking. --- src/Benchmarks/BLAS/CMakeLists.txt | 1 + src/Benchmarks/SpMV/CMakeLists.txt | 2 +- src/TNL/Matrices/Legacy/CSR.h | 2 +- src/TNL/Matrices/Legacy/CSR_impl.h | 8 +++++--- src/UnitTests/Matrices/CMakeLists.txt | 2 +- src/UnitTests/Matrices/Legacy/CMakeLists.txt | 2 +- 6 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/Benchmarks/BLAS/CMakeLists.txt b/src/Benchmarks/BLAS/CMakeLists.txt index 81d837533..9017a14fb 100644 --- a/src/Benchmarks/BLAS/CMakeLists.txt +++ b/src/Benchmarks/BLAS/CMakeLists.txt @@ -1,6 +1,7 @@ if( BUILD_CUDA ) cuda_add_executable( tnl-benchmark-blas tnl-benchmark-blas.cu ) cuda_add_cublas_to_target( tnl-benchmark-blas ) + target_link_libraries( tnl-benchmark-blas ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a ) else() add_executable( tnl-benchmark-blas tnl-benchmark-blas.cpp ) endif() diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt index 7cb9c4fcd..7357a3492 100644 --- a/src/Benchmarks/SpMV/CMakeLists.txt +++ b/src/Benchmarks/SpMV/CMakeLists.txt @@ -1,6 +1,6 @@ if( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu ) - TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ) + TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a ) else() ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp ) endif() diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 46e616d16..a08f914dd 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -233,7 +233,7 @@ public: __device__ void vectorProductCuda( const InVector& inVector, OutVector& outVector, - int gridIdx, int *blocks, size_t size ) const; + int gridIdx ) const; template< typename InVector, typename OutVector, diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index ff8e57571..19f4ba912 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -13,10 +13,12 @@ #include #include #include +#include #include #include #ifdef HAVE_CUSPARSE +#include #include #endif @@ -898,7 +900,7 @@ void spmvCSRVectorHelper( const InVector& inVector, continue; result += values[i] * inVector[column]; } - atomicAdd(out, result); + //Algorithms::AtomicOperations< Devices::Cuda >::add(out, result); TODO: fix } template< typename Real, @@ -1371,6 +1373,7 @@ class CSRDeviceDependentCode< Devices::Cuda > const InVector& inVector, OutVector& outVector ) { +#ifdef HAVE_CUDA #ifdef HAVE_CUSPARSE tnlCusparseCSRWrapper< Real, Index >::vectorProduct( matrix.getRows(), matrix.getColumns(), @@ -1381,7 +1384,6 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else - // #ifdef HAVE_CUDA // if (KernelType == CSRAdaptive) { if (sizeof(Index) != 4) { printf("Size of Index type is too small!\n"); @@ -1462,7 +1464,7 @@ class CSRDeviceDependentCode< Devices::Cuda > matrix.getColumns() ); // } else - // #endif /* HAVE_CUDA */ +#endif /* HAVE_CUDA */ // CSRVectorProductCuda( matrix, inVector, outVector); #endif } diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt index eb8e2e1d5..c88f565eb 100644 --- a/src/UnitTests/Matrices/CMakeLists.txt +++ b/src/UnitTests/Matrices/CMakeLists.txt @@ -137,7 +137,7 @@ if( ${BUILD_MPI} ) if( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( DistributedMatrixTest DistributedMatrixTest.cu OPTIONS ${CXX_TESTS_FLAGS} ) - TARGET_LINK_LIBRARIES( DistributedMatrixTest ${GTEST_BOTH_LIBRARIES} ) + TARGET_LINK_LIBRARIES( DistributedMatrixTest ${GTEST_BOTH_LIBRARIES} ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a ) else() ADD_EXECUTABLE( DistributedMatrixTest DistributedMatrixTest.cpp ) TARGET_COMPILE_OPTIONS( DistributedMatrixTest PRIVATE ${CXX_TESTS_FLAGS} ) diff --git a/src/UnitTests/Matrices/Legacy/CMakeLists.txt b/src/UnitTests/Matrices/Legacy/CMakeLists.txt index 46c6be2cd..d47b07e19 100644 --- a/src/UnitTests/Matrices/Legacy/CMakeLists.txt +++ b/src/UnitTests/Matrices/Legacy/CMakeLists.txt @@ -15,7 +15,7 @@ IF( BUILD_CUDA ) TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_ChunkedEllpack ${GTEST_BOTH_LIBRARIES} ) CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_CSR SparseMatrixTest_CSR.cu OPTIONS ${CXX_TESTS_FLAGS} ) - TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} ) + TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a ) CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_Ellpack SparseMatrixTest_Ellpack.cu OPTIONS ${CXX_TESTS_FLAGS} ) TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_Ellpack ${GTEST_BOTH_LIBRARIES} ) -- GitLab From c857a8dc4fca99de11152fad28f2561700f170d0 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Tue, 21 Apr 2020 23:03:37 +0200 Subject: [PATCH 04/57] Added CSR Vector with dynamic parallelism --- src/TNL/Matrices/Legacy/CSR_impl.h | 243 ++++++++++-------- .../Matrices/Legacy/SparseMatrixTest.hpp | 126 +++++---- .../Matrices/Legacy/SparseMatrixTest_CSR.h | 40 +-- 3 files changed, 210 insertions(+), 199 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 19f4ba912..01e8be880 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -794,51 +794,51 @@ void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& in size_t blocks_size) const { /* Configuration ---------------------------------------------------*/ - constexpr size_t SHARED = 49152/sizeof(float); + constexpr size_t SHARED = 49152/sizeof(Real); constexpr size_t SHARED_PER_WARP = SHARED / warpSize; constexpr size_t MAX_PER_WARP = 65536; //constexpr size_t ELEMENTS_PER_WARP = 1024; //constexpr size_t THREADS_PER_BLOCK = 1024; //constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize; //-------------------------------------------------------------------- - const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; - const IndexType laneID = index % warpSize; - IndexType blockIdx = index / warpSize; - __shared__ float shared_res[SHARED]; + const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t laneID = index % warpSize; + size_t blockIdx = index / warpSize; + __shared__ Real shared_res[SHARED]; Real result = 0.0; if (blockIdx >= blocks_size - 1) return; - const IndexType minRow = blocks[blockIdx]; - const IndexType maxRow = blocks[blockIdx + 1]; - const IndexType minID = this->rowPointers[minRow]; - const IndexType maxID = this->rowPointers[maxRow]; - const IndexType elements = maxID - minID; + const size_t minRow = blocks[blockIdx]; + const size_t maxRow = blocks[blockIdx + 1]; + const size_t minID = this->rowPointers[minRow]; + const size_t maxID = this->rowPointers[maxRow]; + const size_t elements = maxID - minID; /* rows per block more than 1 */ if ((maxRow - minRow) > 1) { /////////////////////////////////////* CSR STREAM *////////////// /* Copy and calculate elements from global to shared memory, coalesced */ - const IndexType offset = threadIdx.x / warpSize * SHARED_PER_WARP; - for (IndexType i = laneID; i < elements; i += warpSize) { - const IndexType elementIdx = i + minID; - const IndexType column = this->columnIndexes[elementIdx]; + const size_t offset = threadIdx.x / warpSize * SHARED_PER_WARP; + for (size_t i = laneID; i < elements; i += warpSize) { + const size_t elementIdx = i + minID; + const size_t column = this->columnIndexes[elementIdx]; if (column >= this->getColumns()) continue; shared_res[i + offset] = this->values[elementIdx] * inVector[column]; } - const IndexType row = minRow + laneID; + const size_t row = minRow + laneID; if (row >= maxRow) return; /* Calculate result */ - const IndexType to = this->rowPointers[row + 1] - minID; - for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) { + const size_t to = this->rowPointers[row + 1] - minID; + for (size_t i = this->rowPointers[row] - minID; i < to; ++i) { result += shared_res[i + offset]; } outVector[row] = result; // Write result - } else if (elements <= MAX_PER_WARP) { + } else { /////////////////////////////////////* CSR VECTOR *////////////// - for (IndexType i = minID + laneID; i < maxID; i += warpSize) { - IndexType column = this->columnIndexes[i]; + for (size_t i = minID + laneID; i < maxID; i += warpSize) { + size_t column = this->columnIndexes[i]; if (column >= this->getColumns()) break; @@ -866,24 +866,19 @@ void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& in } } -// __global__ -// void spmvCSRVectorHelper() { - -// } - template< typename Real, typename Index, typename InVector, int warpSize > __global__ -void spmvCSRVectorHelper( const InVector& inVector, - const int* columnIndexes, - const float *values, - const int getColumns, - Real *out, - size_t from, - size_t to, - size_t perWarp) +void spmvCSRVectorHelper(const InVector& inVector, + const int* columnIndexes, + const Real *values, + const int getColumns, + Real *out, + size_t from, + size_t to, + size_t perWarp) { const size_t index = blockIdx.x * blockDim.x + threadIdx.x; const size_t warpID = index / warpSize; @@ -897,10 +892,11 @@ void spmvCSRVectorHelper( const InVector& inVector, for (size_t i = minID + laneID; i < maxID; i += warpSize) { const size_t column = columnIndexes[i]; if (column >= getColumns) - continue; + break; result += values[i] * inVector[column]; } - //Algorithms::AtomicOperations< Devices::Cuda >::add(out, result); TODO: fix + + atomicAdd(out, result); } template< typename Real, @@ -913,60 +909,59 @@ void SpMVCSRAdaptiveGlobal( const InVector& inVector, OutVector& outVector, const int* rowPointers, const int* columnIndexes, - const float* values, + const Real* values, int *blocks, size_t blocks_size, - Index getColumns - ) + Index getColumns) { /* Configuration ---------------------------------------------------*/ - constexpr size_t SHARED = 49152/sizeof(float); - constexpr size_t SHARED_PER_WARP = SHARED / warpSize; - constexpr size_t MAX_PER_WARP = 65536; - constexpr size_t ELEMENTS_PER_WARP = 1024; + constexpr size_t SHARED = 49152/sizeof(Real); // number of elements in shared memory for block constexpr size_t THREADS_PER_BLOCK = 1024; constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize; + constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; + constexpr size_t MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic + constexpr size_t ELEMENTS_PER_WARP = 1024; // how many elements should process new warp //-------------------------------------------------------------------- - const Index index = blockIdx.x * blockDim.x + threadIdx.x; - const Index laneID = index % warpSize; - const Index blockIdx = index / warpSize; - __shared__ float shared_res[SHARED]; - float result = 0.0; + const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t laneID = index % warpSize; + const size_t blockIdx = index / warpSize; + __shared__ Real shared_res[SHARED]; + Real result = 0.0; if (blockIdx >= blocks_size - 1) return; - const Index minRow = blocks[blockIdx]; - const Index maxRow = blocks[blockIdx + 1]; - const Index minID = rowPointers[minRow]; - const Index maxID = rowPointers[maxRow]; - const Index elements = maxID - minID; + const size_t minRow = blocks[blockIdx]; + const size_t maxRow = blocks[blockIdx + 1]; + const size_t minID = rowPointers[minRow]; + const size_t maxID = rowPointers[maxRow]; + const size_t elements = maxID - minID; /* rows per block more than 1 */ if ((maxRow - minRow) > 1) { /////////////////////////////////////* CSR STREAM *////////////// /* Copy and calculate elements from global to shared memory, coalesced */ - const Index offset = threadIdx.x / warpSize * SHARED_PER_WARP; - for (Index i = laneID; i < elements; i += warpSize) { - const Index elementIdx = i + minID; - const Index column = columnIndexes[elementIdx]; + const size_t offset = threadIdx.x / warpSize * SHARED_PER_WARP; + for (size_t i = laneID; i < elements; i += warpSize) { + const size_t elementIdx = i + minID; + const size_t column = columnIndexes[elementIdx]; if (column >= getColumns) continue; shared_res[i + offset] = values[elementIdx] * inVector[column]; } - const Index row = minRow + laneID; + const size_t row = minRow + laneID; if (row >= maxRow) return; /* Calculate result */ - const Index to = rowPointers[row + 1] - minID; - for (Index i = rowPointers[row] - minID; i < to; ++i) { + const size_t to = rowPointers[row + 1] - minID; + for (size_t i = rowPointers[row] - minID; i < to; ++i) { result += shared_res[i + offset]; } outVector[row] = result; // Write result } else if (elements <= MAX_PER_WARP) { /////////////////////////////////////* CSR VECTOR *////////////// - for (Index i = minID + laneID; i < maxID; i += warpSize) { - Index column = columnIndexes[i]; + for (size_t i = minID + laneID; i < maxID; i += warpSize) { + size_t column = columnIndexes[i]; if (column >= getColumns) break; @@ -980,21 +975,40 @@ void SpMVCSRAdaptiveGlobal( const InVector& inVector, result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); if (laneID == 0) outVector[minRow] = result; // Write result } - else { - /////////////////////////////////////* CSR VECTOR LONG *////////////// - const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1; - const size_t blocks = warps <= WARPS_PER_BLOCK ? 1 : warps / WARPS_PER_BLOCK + 1; - const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize; - spmvCSRVectorHelper <<>>( - inVector, - columnIndexes, - values, - getColumns, - &outVector[minRow], - (size_t)(minID + ELEMENTS_PER_WARP), - (size_t)maxID, - (size_t)ELEMENTS_PER_WARP - ); + else { // too long row + /////////////////////////////////////* CSR DYNAMIC VECTOR *////////////// + + /* Number of warps we need. + This warp can be used to calculate result too, -1 warp */ + size_t warps = elements / ELEMENTS_PER_WARP; + warps = elements % ELEMENTS_PER_WARP ? warps : warps - 1; + + size_t blocks = warps / WARPS_PER_BLOCK; + blocks = warps % WARPS_PER_BLOCK ? blocks + 1 : blocks; + + /* Execute a lot of CSR Vector */ + if (laneID == 0) { + spmvCSRVectorHelper <<>>( + inVector, + columnIndexes, + values, + getColumns, + &outVector[minRow], + minID + ELEMENTS_PER_WARP, + maxID, + ELEMENTS_PER_WARP + ); + } + /* CSR Vector */ + for (size_t i = minID + laneID; i < minID + ELEMENTS_PER_WARP; i += warpSize) { + size_t column = columnIndexes[i]; + if (column >= getColumns) + break; + + result += values[i] * inVector[column]; + } + /* Write result */ + atomicAdd(&outVector[minRow], result); } } @@ -1061,6 +1075,8 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& { case CSRScalar: // TODO: + /* FIXME */ + spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); break; case CSRVector: spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); @@ -1075,6 +1091,8 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& break; case CSRStream: // TODO: + /* FIXME */ + spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); break; } @@ -1144,17 +1162,6 @@ class CSRDeviceDependentCode< Devices::Host > #ifdef HAVE_CUDA -// template< typename Real, -// typename Index, -// CSRKernel KernelType, -// typename InVector, -// typename OutVector, -// int warpSize > -// __global__ -// void CSRScalarGlobal(const Containers::Vector< Index, Devices::Cuda, Index, Allocators::Cuda >* row) -// { -// } - template< typename Real, typename Index, CSRKernel KernelType, @@ -1385,17 +1392,16 @@ class CSRDeviceDependentCode< Devices::Cuda > outVector.getData() ); #else // if (KernelType == CSRAdaptive) { - if (sizeof(Index) != 4) { - printf("Size of Index type is too small!\n"); - return; - } - - constexpr int SHARED = 49152/sizeof(float); - constexpr int SHARED_PER_WARP = SHARED / 32; + /* Configuration ---------------------------------------------------*/ + constexpr size_t SHARED = 49152/sizeof(Real); + constexpr size_t THREADS_PER_BLOCK = 1024; + constexpr size_t SHARED_PER_WARP = SHARED / (THREADS_PER_BLOCK / 32); + //-------------------------------------------------------------------- + /* Fill in blocks */ std::vector inBlock; inBlock.push_back(0); size_t sum = 0; - Index i; + size_t i; int prev_i = 0; for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { size_t elements = matrix.getRowPointers().getElement(i) - @@ -1422,47 +1428,56 @@ class CSRDeviceDependentCode< Devices::Cuda > const InVector *kernelInVector = Cuda::passToDevice( inVector ); OutVector *kernelOutVector = Cuda::passToDevice( outVector ); - CSR< Real, Device, Index, KernelType >* kernel_this = Cuda::passToDevice( matrix ); - + /* blocks */ int *kernelBlocks; cudaMalloc((void **)&kernelBlocks, sizeof(int) * inBlock.size()); cudaMemcpy(kernelBlocks, inBlock.data(), inBlock.size() * sizeof(int), cudaMemcpyHostToDevice); - + /* values */ - float *kernel_values; - cudaMalloc((void **)&kernel_values, sizeof(float) * matrix.getValues().getSize()); - cudaMemcpy(kernel_values, - (float *)matrix.getValues().getData(), - matrix.getValues().getSize() * sizeof(float), + Real *kernelValues; + cudaMalloc((void **)&kernelValues, sizeof(Real) * matrix.getValues().getSize()); + cudaMemcpy(kernelValues, + (Real *)matrix.getValues().getData(), + matrix.getValues().getSize() * sizeof(Real), cudaMemcpyHostToDevice); /* columns */ - int *kernel_columns; - cudaMalloc((void **)&kernel_columns, sizeof(int) * matrix.getColumnIndexes().getSize()); - cudaMemcpy(kernel_columns, + int *kernelColumns; + cudaMalloc((void **)&kernelColumns, sizeof(int) * matrix.getColumnIndexes().getSize()); + cudaMemcpy(kernelColumns, (int *)matrix.getColumnIndexes().getData(), matrix.getColumnIndexes().getSize() * sizeof(int), cudaMemcpyHostToDevice); /* row pointers */ - int *kernel_rowPointers; - cudaMalloc((void **)&kernel_rowPointers, sizeof(int) * matrix.getRowPointers().getSize()); - cudaMemcpy(kernel_rowPointers, + int *kernelRowPointers; + cudaMalloc((void **)&kernelRowPointers, sizeof(int) * matrix.getRowPointers().getSize()); + cudaMemcpy(kernelRowPointers, (int *)matrix.getRowPointers().getData(), matrix.getRowPointers().getSize() * sizeof(int), cudaMemcpyHostToDevice); - - SpMVCSRAdaptiveGlobal< Real, Index, InVector, OutVector, 32 ><<<2, 1024>>>( + + size_t needed_threads = 32 * (inBlock.size() - 1); // number of threads we need + size_t blocks = needed_threads / THREADS_PER_BLOCK; // warp per block + blocks = needed_threads % THREADS_PER_BLOCK ? blocks + 1 : blocks; + + SpMVCSRAdaptiveGlobal< Real, Index, InVector, OutVector, 32 ><<>>( *kernelInVector, *kernelOutVector, - kernel_rowPointers, - kernel_columns, - kernel_values, + kernelRowPointers, + kernelColumns, + kernelValues, kernelBlocks, inBlock.size(), matrix.getColumns() ); + + cudaFree(kernelBlocks); + cudaFree(kernelValues); + cudaFree(kernelColumns); + cudaFree(kernelRowPointers); + // } else #endif /* HAVE_CUDA */ // CSRVectorProductCuda( matrix, inVector, outVector); diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp index 98ddfd3db..09368b969 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp @@ -1386,88 +1386,84 @@ void test_VectorProductLarger() } template< typename Matrix > -void test_VectorProductGiant() +void test_VectorProductCSRAdaptive() { - using RealType = typename Matrix::RealType; - using DeviceType = typename Matrix::DeviceType; - using IndexType = typename Matrix::IndexType; - - IndexType m_rows = 100; - IndexType m_cols = 100; - - Matrix m; - m.reset(); - m.setDimensions( m_rows, m_cols ); - typename Matrix::CompressedRowLengthsVector rowLengths( - { - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100 - } - ); + using RealType = typename Matrix::RealType; + using DeviceType = typename Matrix::DeviceType; + using IndexType = typename Matrix::IndexType; - m.setCompressedRowLengths( rowLengths ); - - for (int i = 0; i < m_rows; ++i) - for (int j = 0; j < m_cols; ++j) + //----------------- Test CSR Stream part ------------------ + IndexType m_rows = 100; + IndexType m_cols = 100; + + Matrix m; + m.reset(); + m.setDimensions( m_rows, m_cols ); + typename Matrix::CompressedRowLengthsVector rowLengths( + { + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100 + } + ); + + m.setCompressedRowLengths( rowLengths ); + + for (int i = 0; i < m_rows; ++i) + for (int j = 0; j < m_cols; ++j) m.setElement( i, j, i + 1 ); - using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; - - VectorType inVector; - inVector.setSize( m_rows ); - for( IndexType i = 0; i < inVector.getSize(); ++i ) + using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; + + VectorType inVector; + inVector.setSize( m_rows ); + for( IndexType i = 0; i < inVector.getSize(); ++i ) inVector.setElement( i, 1 ); - VectorType outVector; - outVector.setSize( m_rows ); - for( IndexType i = 0; i < outVector.getSize(); ++i ) + VectorType outVector; + outVector.setSize( m_rows ); + for( IndexType i = 0; i < outVector.getSize(); ++i ) outVector.setElement( i, 0 ); + + m.vectorProduct( inVector, outVector); - m.vectorProduct( inVector, outVector); - - for (int i = 0; i < m_rows; ++i) + for (int i = 0; i < m_rows; ++i) EXPECT_EQ( outVector.getElement( i ), (i + 1) * 100 ); - //----------------------------------------------------- + //----------------- Test CSR Dynamic Vector part ------------------ - m_rows = 2; - m_cols = 1000; - - m.reset(); - m.setDimensions( m_rows, m_cols ); - typename Matrix::CompressedRowLengthsVector rowLengths2( - { - 1000, 1000 - } - ); + m_rows = 1; + // if less than 'max elements per block to start CSR Dynamic Vector' tests CSR Vector part + m_cols = 3000; - m.setCompressedRowLengths( rowLengths2 ); - - for (int i = 0; i < m_rows; ++i) - for (int j = 0; j < m_cols; ++j) - m.setElement( i, j, i + 1 ); + m.reset(); + m.setDimensions( m_rows, m_cols ); + typename Matrix::CompressedRowLengthsVector rowLengths2({m_cols}); + + m.setCompressedRowLengths( rowLengths2 ); + + for (int i = 0; i < m_cols; ++i) + m.setElement( 0, i, 2 ); - VectorType inVector2; - inVector2.setSize( m_cols ); - for( IndexType i = 0; i < inVector2.getSize(); i++ ) + VectorType inVector2; + inVector2.setSize( m_cols ); + for( IndexType i = 0; i < inVector2.getSize(); i++ ) inVector2.setElement( i, 1 ); - VectorType outVector2; - outVector2.setSize( m_rows ); - for( IndexType i = 0; i < outVector2.getSize(); ++i ) + VectorType outVector2; + outVector2.setSize( m_rows ); + for( IndexType i = 0; i < outVector2.getSize(); ++i ) outVector2.setElement( i, 0 ); - m.vectorProduct( inVector2, outVector2); - for (int i = 0; i < m_rows; ++i) - EXPECT_EQ( outVector2.getElement( i ), (i + 1) * 1000 ); + m.vectorProduct(inVector2, outVector2); + EXPECT_EQ( outVector2.getElement( 0 ), 6000 ); } template< typename Matrix > diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h index e9c3f591c..feeea216c 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h @@ -27,23 +27,23 @@ protected: // types for which MatrixTest is instantiated using CSRMatrixTypes = ::testing::Types < - TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, int >, - TNL::Matrices::Legacy::CSR< long, TNL::Devices::Host, int >, - TNL::Matrices::Legacy::CSR< float, TNL::Devices::Host, int >, - TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >, - TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, long >, - TNL::Matrices::Legacy::CSR< long, TNL::Devices::Host, long >, - TNL::Matrices::Legacy::CSR< float, TNL::Devices::Host, long >, - TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long > + // TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, int >, + // TNL::Matrices::Legacy::CSR< long, TNL::Devices::Host, int >, + // TNL::Matrices::Legacy::CSR< float, TNL::Devices::Host, int >, + // TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >, + // TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, long >, + // TNL::Matrices::Legacy::CSR< long, TNL::Devices::Host, long >, + // TNL::Matrices::Legacy::CSR< float, TNL::Devices::Host, long >, + // TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long > #ifdef HAVE_CUDA - ,TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int >, - TNL::Matrices::Legacy::CSR< long, TNL::Devices::Cuda, int >, + // ,TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int >, + // TNL::Matrices::Legacy::CSR< long, TNL::Devices::Cuda, int >, TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int >, - TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int >, - TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long >, - TNL::Matrices::Legacy::CSR< long, TNL::Devices::Cuda, long >, - TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long >, - TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long > + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int > + // TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long > + // TNL::Matrices::Legacy::CSR< long, TNL::Devices::Cuda, long >, + // TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long > + // TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long > #endif >; @@ -105,12 +105,12 @@ TYPED_TEST( CSRMatrixTest, setRowTest ) test_SetRow< CSRMatrixType >(); } -TYPED_TEST( CSRMatrixTest, vectorProductTest ) +/* TYPED_TEST( CSRMatrixTest, vectorProductTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; test_VectorProduct< CSRMatrixType >(); -} +} */ /*TYPED_TEST( CSRMatrixTest, vectorProductLargerTest ) { @@ -119,12 +119,12 @@ TYPED_TEST( CSRMatrixTest, vectorProductTest ) test_VectorProductLarger< CSRMatrixType >(); }*/ -/*TYPED_TEST( CSRMatrixTest, vectorProductGiantTest ) +TYPED_TEST( CSRMatrixTest, vectorProductCSRApadtiveTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; - test_VectorProductGiant< CSRMatrixType >(); -}*/ + test_VectorProductCSRAdaptive< CSRMatrixType >(); +} TYPED_TEST( CSRMatrixTest, saveAndLoadTest ) { -- GitLab From d945c433fe6ca2b99c575460cfe5c6dda073b6d8 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Mon, 27 Apr 2020 23:48:28 +0200 Subject: [PATCH 05/57] Fixed bugs --- src/TNL/Matrices/Legacy/CSR_impl.h | 70 +++++++++++-------- .../Matrices/Legacy/SparseMatrixTest_CSR.h | 24 +++---- 2 files changed, 52 insertions(+), 42 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 01e8be880..821d11dec 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -22,6 +22,19 @@ #include #endif +/* CONFIGURATION */ +constexpr size_t WARP_SIZE = 32; +constexpr size_t THREADS_PER_BLOCK = 1024; +constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + +/* CSR DYNAMIC VECTOR */ +constexpr size_t MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic +constexpr size_t ELEMENTS_PER_WARP = 1024; // how many elements should process new warp + +/* CSR Light SPMV */ +constexpr size_t THREADS_PER_ROW = 4; // how many elements should process new warp +//------------------------------------- + namespace TNL { namespace Matrices { namespace Legacy { @@ -731,11 +744,11 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector& OutVector& outVector, int gridIdx) const { - const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; - const IndexType elemPerGroup = 4; - const IndexType laneID = index % 32; - const IndexType groupID = laneID / elemPerGroup; - const IndexType inGroupID = laneID % elemPerGroup; + const IndexType index = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const IndexType THREADS_PER_ROW = 4; + const IndexType laneID = index % warpSize; + const IndexType groupID = laneID / THREADS_PER_ROW; + const IndexType inGroupID = laneID % THREADS_PER_ROW; IndexType row, minID, column, maxID, idxMtx; __shared__ unsigned rowCnt; @@ -749,7 +762,7 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector& if (inGroupID == 0) row = atomicAdd(&rowCnt, 1); /* Propagate row number in group */ - row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerGroup); + row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * THREADS_PER_ROW); if (row >= this->rowPointers.getSize() - 1) return; @@ -766,11 +779,11 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector& break; result += this->values[idxMtx] * inVector[column]; - idxMtx += elemPerGroup; + idxMtx += THREADS_PER_ROW; } /* Parallel reduction */ - for (int i = elemPerGroup/2; i > 0; i /= 2) + for (int i = THREADS_PER_ROW / 2; i > 0; i /= 2) result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); /* Write result */ if (inGroupID == 0) { @@ -801,7 +814,7 @@ void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& in //constexpr size_t THREADS_PER_BLOCK = 1024; //constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize; //-------------------------------------------------------------------- - const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t index = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const size_t laneID = index % warpSize; size_t blockIdx = index / warpSize; __shared__ Real shared_res[SHARED]; @@ -872,9 +885,9 @@ template< typename Real, int warpSize > __global__ void spmvCSRVectorHelper(const InVector& inVector, - const int* columnIndexes, + const Index* columnIndexes, const Real *values, - const int getColumns, + const Index getColumns, Real *out, size_t from, size_t to, @@ -888,7 +901,7 @@ void spmvCSRVectorHelper(const InVector& inVector, if (minID >= to) return; if (maxID >= to ) maxID = to; - Real result = 0.0; + Real result = 0; for (size_t i = minID + laneID; i < maxID; i += warpSize) { const size_t column = columnIndexes[i]; if (column >= getColumns) @@ -907,8 +920,8 @@ template< typename Real, __global__ void SpMVCSRAdaptiveGlobal( const InVector& inVector, OutVector& outVector, - const int* rowPointers, - const int* columnIndexes, + const Index* rowPointers, + const Index* columnIndexes, const Real* values, int *blocks, size_t blocks_size, @@ -916,17 +929,13 @@ void SpMVCSRAdaptiveGlobal( const InVector& inVector, { /* Configuration ---------------------------------------------------*/ constexpr size_t SHARED = 49152/sizeof(Real); // number of elements in shared memory for block - constexpr size_t THREADS_PER_BLOCK = 1024; - constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize; constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; - constexpr size_t MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic - constexpr size_t ELEMENTS_PER_WARP = 1024; // how many elements should process new warp //-------------------------------------------------------------------- const size_t index = blockIdx.x * blockDim.x + threadIdx.x; const size_t laneID = index % warpSize; const size_t blockIdx = index / warpSize; __shared__ Real shared_res[SHARED]; - Real result = 0.0; + Real result = 0; if (blockIdx >= blocks_size - 1) return; const size_t minRow = blocks[blockIdx]; @@ -1394,8 +1403,7 @@ class CSRDeviceDependentCode< Devices::Cuda > // if (KernelType == CSRAdaptive) { /* Configuration ---------------------------------------------------*/ constexpr size_t SHARED = 49152/sizeof(Real); - constexpr size_t THREADS_PER_BLOCK = 1024; - constexpr size_t SHARED_PER_WARP = SHARED / (THREADS_PER_BLOCK / 32); + constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; //-------------------------------------------------------------------- /* Fill in blocks */ std::vector inBlock; @@ -1425,7 +1433,7 @@ class CSRDeviceDependentCode< Devices::Cuda > } } inBlock.push_back(matrix.getRowPointers().getSize() - 1); - + /* Copy memory to GPU */ const InVector *kernelInVector = Cuda::passToDevice( inVector ); OutVector *kernelOutVector = Cuda::passToDevice( outVector ); @@ -1443,19 +1451,19 @@ class CSRDeviceDependentCode< Devices::Cuda > cudaMemcpyHostToDevice); /* columns */ - int *kernelColumns; - cudaMalloc((void **)&kernelColumns, sizeof(int) * matrix.getColumnIndexes().getSize()); + Index *kernelColumns; + cudaMalloc((void **)&kernelColumns, sizeof(Index) * matrix.getColumnIndexes().getSize()); cudaMemcpy(kernelColumns, - (int *)matrix.getColumnIndexes().getData(), - matrix.getColumnIndexes().getSize() * sizeof(int), + (Index *)matrix.getColumnIndexes().getData(), + matrix.getColumnIndexes().getSize() * sizeof(Index), cudaMemcpyHostToDevice); /* row pointers */ - int *kernelRowPointers; - cudaMalloc((void **)&kernelRowPointers, sizeof(int) * matrix.getRowPointers().getSize()); + Index *kernelRowPointers; + cudaMalloc((void **)&kernelRowPointers, sizeof(Index) * matrix.getRowPointers().getSize()); cudaMemcpy(kernelRowPointers, - (int *)matrix.getRowPointers().getData(), - matrix.getRowPointers().getSize() * sizeof(int), + (Index *)matrix.getRowPointers().getData(), + matrix.getRowPointers().getSize() * sizeof(Index), cudaMemcpyHostToDevice); size_t needed_threads = 32 * (inBlock.size() - 1); // number of threads we need @@ -1473,6 +1481,8 @@ class CSRDeviceDependentCode< Devices::Cuda > matrix.getColumns() ); + Cuda::freeFromDevice( kernelInVector ); + Cuda::freeFromDevice( kernelOutVector ); cudaFree(kernelBlocks); cudaFree(kernelValues); cudaFree(kernelColumns); diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h index feeea216c..0cf205929 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h @@ -27,23 +27,23 @@ protected: // types for which MatrixTest is instantiated using CSRMatrixTypes = ::testing::Types < - // TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, int >, - // TNL::Matrices::Legacy::CSR< long, TNL::Devices::Host, int >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, int >, + TNL::Matrices::Legacy::CSR< long, TNL::Devices::Host, int >, // TNL::Matrices::Legacy::CSR< float, TNL::Devices::Host, int >, - // TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >, - // TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, long >, - // TNL::Matrices::Legacy::CSR< long, TNL::Devices::Host, long >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, long >, + TNL::Matrices::Legacy::CSR< long, TNL::Devices::Host, long >, // TNL::Matrices::Legacy::CSR< float, TNL::Devices::Host, long >, - // TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long > + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long > #ifdef HAVE_CUDA - // ,TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int >, - // TNL::Matrices::Legacy::CSR< long, TNL::Devices::Cuda, int >, + ,TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int >, + // TNL::Matrices::Legacy::CSR< long, TNL::Devices::Cuda, int >, // cuda atomicAdd has no support for long, only unsigned long long int TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int >, - TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int > - // TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long > + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long >, // TNL::Matrices::Legacy::CSR< long, TNL::Devices::Cuda, long >, - // TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long > - // TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long > + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long > #endif >; -- GitLab From 6c7542abc9c2a5ad785ce9e51a7eb6fff949de4d Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Wed, 20 May 2020 16:28:08 +0200 Subject: [PATCH 06/57] Possible fix for compilation --- src/TNL/Matrices/Legacy/CSR_impl.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 821d11dec..ccfbe6f0f 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #ifdef HAVE_CUSPARSE @@ -745,7 +746,6 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector& int gridIdx) const { const IndexType index = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const IndexType THREADS_PER_ROW = 4; const IndexType laneID = index % warpSize; const IndexType groupID = laneID / THREADS_PER_ROW; const IndexType inGroupID = laneID % THREADS_PER_ROW; @@ -1409,7 +1409,7 @@ class CSRDeviceDependentCode< Devices::Cuda > std::vector inBlock; inBlock.push_back(0); size_t sum = 0; - size_t i; + int i; int prev_i = 0; for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { size_t elements = matrix.getRowPointers().getElement(i) - @@ -1437,12 +1437,12 @@ class CSRDeviceDependentCode< Devices::Cuda > const InVector *kernelInVector = Cuda::passToDevice( inVector ); OutVector *kernelOutVector = Cuda::passToDevice( outVector ); - /* blocks */ + /* blocks to GPU */ int *kernelBlocks; cudaMalloc((void **)&kernelBlocks, sizeof(int) * inBlock.size()); cudaMemcpy(kernelBlocks, inBlock.data(), inBlock.size() * sizeof(int), cudaMemcpyHostToDevice); - /* values */ + /* values to GPU */ Real *kernelValues; cudaMalloc((void **)&kernelValues, sizeof(Real) * matrix.getValues().getSize()); cudaMemcpy(kernelValues, @@ -1450,7 +1450,7 @@ class CSRDeviceDependentCode< Devices::Cuda > matrix.getValues().getSize() * sizeof(Real), cudaMemcpyHostToDevice); - /* columns */ + /* columns to GPU */ Index *kernelColumns; cudaMalloc((void **)&kernelColumns, sizeof(Index) * matrix.getColumnIndexes().getSize()); cudaMemcpy(kernelColumns, @@ -1458,7 +1458,7 @@ class CSRDeviceDependentCode< Devices::Cuda > matrix.getColumnIndexes().getSize() * sizeof(Index), cudaMemcpyHostToDevice); - /* row pointers */ + /* row pointers to GPU */ Index *kernelRowPointers; cudaMalloc((void **)&kernelRowPointers, sizeof(Index) * matrix.getRowPointers().getSize()); cudaMemcpy(kernelRowPointers, @@ -1469,7 +1469,6 @@ class CSRDeviceDependentCode< Devices::Cuda > size_t needed_threads = 32 * (inBlock.size() - 1); // number of threads we need size_t blocks = needed_threads / THREADS_PER_BLOCK; // warp per block blocks = needed_threads % THREADS_PER_BLOCK ? blocks + 1 : blocks; - SpMVCSRAdaptiveGlobal< Real, Index, InVector, OutVector, 32 ><<>>( *kernelInVector, *kernelOutVector, @@ -1480,7 +1479,8 @@ class CSRDeviceDependentCode< Devices::Cuda > inBlock.size(), matrix.getColumns() ); - + + /* Free memory */ Cuda::freeFromDevice( kernelInVector ); Cuda::freeFromDevice( kernelOutVector ); cudaFree(kernelBlocks); -- GitLab From 12a20095fc644705e3510c5587e9868b557be0ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 4 Jun 2020 19:37:02 +0200 Subject: [PATCH 07/57] Fix of SpMV benchmark and update of Python script for SpMV benchmark results processing. --- src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py index 229e32cc2..c7e733d8e 100755 --- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py +++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py @@ -62,6 +62,7 @@ df.sort_index(axis=1, inplace=True) df.drop(columns=('BiEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) df.drop(columns=('BiEllpack', 'CPU','speedup'), axis=1, inplace=True ) df.drop(columns=('CSR', 'CPU','speedup'), axis=1, inplace=True ) + #df.drop(columns=('CSR Legacy Adaptive', 'CPU','speedup'), axis=1, inplace=True ) #df.drop(columns=('CSR Legacy Light', 'CPU','speedup'), axis=1, inplace=True ) #df.drop(columns=('CSR Legacy LightWithoutAtomic', 'CPU','speedup'), axis=1, inplace=True ) @@ -69,6 +70,7 @@ df.drop(columns=('CSR', 'CPU','speedup'), axis=1, inplace=True ) #df.drop(columns=('CSR Legacy Stream', 'CPU','speedup'), axis=1, inplace=True ) #df.drop(columns=('CSR Legacy Vector', 'CPU','speedup'), axis=1, inplace=True ) #df.drop(columns=('CSR Legacy MultiVector', 'CPU','speedup'), axis=1, inplace=True ) + df.drop(columns=('ChunkedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) df.drop(columns=('Ellpack', 'CPU','speedup'), axis=1, inplace=True ) df.drop(columns=('Ellpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) @@ -82,6 +84,7 @@ df.drop(columns=('SlicedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True print( "Computing speed-up of formats...") # Add speedup compared to CSR and cuSparse + df["BiEllpack Legacy", "CPU", "CSR speedup"] = df["BiEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] df["BiEllpack Legacy", "GPU", "cuSparse speedup"] = df["BiEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] df["BiEllpack", "CPU", "CSR speedup"] = df["BiEllpack", "CPU", "time"] / df["CSR", "CPU", "time"] -- GitLab From ab7d18308f4fd187b4775041e911cda431a85bef Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Wed, 10 Jun 2020 13:04:40 +0200 Subject: [PATCH 08/57] Refactoring, added CSR MultiVector(a lot of warps for one row) --- src/TNL/Matrices/Legacy/CSR.h | 10 +- src/TNL/Matrices/Legacy/CSR_impl.h | 687 ++++++++++++++++++++++------- 2 files changed, 530 insertions(+), 167 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index a08f914dd..25ddca7cf 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -31,7 +31,7 @@ class CusparseCSR; template< typename Device > class CSRDeviceDependentCode; -enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream }; +enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream, CSRMultiVector }; template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar > class CSR : public Sparse< Real, Device, Index > @@ -226,14 +226,6 @@ public: void spmvCudaVectorized( const InVector& inVector, OutVector& outVector, const IndexType gridIdx ) const; - - template< typename InVector, - typename OutVector, - int warpSize > - __device__ - void vectorProductCuda( const InVector& inVector, - OutVector& outVector, - int gridIdx ) const; template< typename InVector, typename OutVector, diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index ccfbe6f0f..6af6f8565 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -27,7 +27,8 @@ constexpr size_t WARP_SIZE = 32; constexpr size_t THREADS_PER_BLOCK = 1024; constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; - +constexpr size_t MAX_X_DIM = 2147483647; +constexpr size_t MAX_GRID_SIZE = MAX_X_DIM * THREADS_PER_BLOCK; /* CSR DYNAMIC VECTOR */ constexpr size_t MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic constexpr size_t ELEMENTS_PER_WARP = 1024; // how many elements should process new warp @@ -881,10 +882,9 @@ void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& in template< typename Real, typename Index, - typename InVector, int warpSize > __global__ -void spmvCSRVectorHelper(const InVector& inVector, +void spmvCSRVectorHelper(const Real *inVector, const Index* columnIndexes, const Real *values, const Index getColumns, @@ -914,24 +914,23 @@ void spmvCSRVectorHelper(const InVector& inVector, template< typename Real, typename Index, - typename InVector, - typename OutVector, int warpSize > __global__ -void SpMVCSRAdaptiveGlobal( const InVector& inVector, - OutVector& outVector, +void SpMVCSRAdaptiveGlobal( const Real *inVector, + Real *outVector, const Index* rowPointers, const Index* columnIndexes, const Real* values, int *blocks, size_t blocks_size, - Index getColumns) + Index getColumns, + size_t gridID) { /* Configuration ---------------------------------------------------*/ constexpr size_t SHARED = 49152/sizeof(Real); // number of elements in shared memory for block constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; //-------------------------------------------------------------------- - const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const size_t laneID = index % warpSize; const size_t blockIdx = index / warpSize; __shared__ Real shared_res[SHARED]; @@ -966,7 +965,7 @@ void SpMVCSRAdaptiveGlobal( const InVector& inVector, result += shared_res[i + offset]; } outVector[row] = result; // Write result - } + } else if (elements <= MAX_PER_WARP) { /////////////////////////////////////* CSR VECTOR *////////////// for (size_t i = minID + laneID; i < maxID; i += warpSize) { @@ -997,7 +996,7 @@ void SpMVCSRAdaptiveGlobal( const InVector& inVector, /* Execute a lot of CSR Vector */ if (laneID == 0) { - spmvCSRVectorHelper <<>>( + spmvCSRVectorHelper <<>>( inVector, columnIndexes, values, @@ -1069,75 +1068,424 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& } template< typename Real, - typename Device, typename Index, - CSRKernel KernelType > - template< typename InVector, - typename OutVector, - int warpSize > -__device__ -void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& inVector, - OutVector& outVector, - int gridIdx ) const + int warpSize > +__global__ +void SpMVCSRScalar( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index getColumns, + const size_t gridID) { - switch( KernelType ) - { - case CSRScalar: - // TODO: - /* FIXME */ - spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); - break; - case CSRVector: - spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); - break; - case CSRLight: - spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); - break; - case CSRAdaptive: - // spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); - /* FIXME */ - spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); + const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= rows) + return; + + Real result = 0.0; + const size_t startID = rowPointers[index]; + const size_t endID = rowPointers[index + 1]; + + for (size_t i = startID; i < endID; ++i) { + const size_t column = columnIndexes[i]; + if (column >= getColumns) break; - case CSRStream: - // TODO: - /* FIXME */ - spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); + + result += values[i] * inVector[column]; + } + + outVector[index] = result; +} + +template< typename Real, + typename Index, + int warpSize > +__global__ +void SpMVCSRMultiVector( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index getColumns, + const int perWarp, + const int offset, + const int gridID) +{ + const int index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const int laneID = index % warpSize; + const int rowID = index / offset; + if (rowID >= rows) + return; + const int inRowID = index % offset; + + Real result = 0.0; + // size_t startID = rowPointers[rowID] + inRowID; + int endID = rowPointers[rowID + 1]; + + /* Calculate result */ + for (int i = rowPointers[rowID] + inRowID; i < endID; i += offset) { + // size_t column = columnIndexes[i]; + if (columnIndexes[i] >= getColumns) break; + + result += values[i] * inVector[columnIndexes[i]]; } + /* Reduction */ + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); + /* Write result */ + if (laneID == 0) atomicAdd(&outVector[rowID], result); +} - /*IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const IndexType warpStart = warpSize * ( globalIdx / warpSize ); - const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); - const IndexType inWarpIdx = globalIdx % warpSize; +template< typename Real, + typename Index, + int warpSize > +__global__ +void SpMVCSRVector( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index getColumns, + const size_t gridID) +{ + const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const size_t warpID = index / warpSize; + const size_t laneID = index % warpSize; + if (warpID >= rows) + return; + + Real result = 0.0; + size_t startID = rowPointers[warpID] + laneID; + size_t endID = rowPointers[warpID + 1]; - if( this->getCudaKernelType() == vector ) + /* Calculate result */ + for (size_t i = startID; i < endID; i += warpSize) { + size_t column = columnIndexes[i]; + if (column >= getColumns) + break; + result += values[i] * inVector[column]; + } - ///// - // Hybrid mode - // - const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x; - const IndexType lastRow = min( this->getRows(), firstRow + blockDim. x ); - const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) / - ( lastRow - firstRow ); + /* Reduction */ + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); + /* Write result */ + if (laneID == 0) outVector[warpID] = result; +} - if( nonzerosPerRow < this->getHybridModeSplit() ) - { - ///// - // Use the scalar mode - // - if( globalIdx < this->getRows() ) - outVector[ globalIdx ] = this->rowVectorProduct( globalIdx, inVector ); +template< typename Real, + typename Index, + int warpSize > +__global__ +void SpMVCSRLight( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index getColumns, + const size_t groupSize, + const size_t gridID) { + const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const size_t laneID = index % warpSize; + const size_t groupID = laneID / groupSize; + const size_t inGroupID = laneID % groupSize; + + size_t row, minID, column, maxID, idxMtx; + __shared__ unsigned rowCnt; + + if (index == 0) rowCnt = 0; // Init shared variable + __syncthreads(); + + while (true) { + + /* Get row number */ + if (inGroupID == 0) row = atomicAdd(&rowCnt, 1); + + /* Propagate row number in group */ + row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * groupSize); + if (row >= rows) + return; + + minID = rowPointers[row]; + maxID = rowPointers[row + 1]; + + Real result = 0.0; + + idxMtx = minID + inGroupID; + while (idxMtx < maxID) { + column = columnIndexes[idxMtx]; + if (column >= getColumns) + break; + + result += values[idxMtx] * inVector[column]; + idxMtx += groupSize; + } + + /* Parallel reduction */ + for (size_t i = groupSize / 2; i > 0; i /= 2) + result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); + /* Write result */ + if (inGroupID == 0) + outVector[row] = result; } +} + + +template< typename Real, + typename Index, + int warpSize > +void SpMVCSRScalarPrepare( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index getColumns) { + const size_t threads = 64; + size_t neededThreads = rows; + size_t blocks; + + for (size_t grid = 0; neededThreads != 0; ++grid) { + if (MAX_X_DIM * threads >= neededThreads) { + blocks = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } else { + blocks = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; + } + + SpMVCSRScalar<<>>( + inVector, + outVector, + rowPointers, + columnIndexes, + values, + rows, + getColumns, + grid + ); + } +} + +template< typename Real, + typename Index, + int warpSize > +void SpMVCSRVectorPrepare( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index getColumns) { + const size_t threads = 64; + size_t neededThreads = rows * warpSize; + size_t blocks; + + for (size_t grid = 0; neededThreads != 0; ++grid) { + if (MAX_X_DIM * threads >= neededThreads) { + blocks = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } else { + blocks = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; + } + + SpMVCSRVector<<>>( + inVector, + outVector, + rowPointers, + columnIndexes, + values, + rows, + getColumns, + grid + ); + } +} + +template< typename Real, + typename Index, + int warpSize > +void SpMVCSRLightPrepare( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const size_t valuesSize, + const Index rows, + const Index getColumns) { + const size_t threads = 64; + size_t neededThreads = rows * warpSize; + size_t blocks, groupSize; + + const size_t nnz = roundUpDivision(valuesSize, rows); // non zeroes per row + if (nnz <= 2) + groupSize = 2; + else if (nnz <= 4) + groupSize = 4; + else if (nnz <= 8) + groupSize = 8; + else if (nnz <= 16) + groupSize = 16; else - { - //// - // Use the vector mode - // - spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx ); - }*/ + groupSize = 32; + + neededThreads = groupSize * rows; + + for (size_t grid = 0; neededThreads != 0; ++grid) { + if (MAX_X_DIM * threads >= neededThreads) { + blocks = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } else { + blocks = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; + } + + SpMVCSRLight<<>>( + inVector, + outVector, + rowPointers, + columnIndexes, + values, + rows, + getColumns, + groupSize, + grid + ); + } +} + +template< typename Real, + typename Index, + int warpSize > +void SpMVCSRMultiVectorPrepare( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const size_t valuesSize, + const Index rows, + const Index getColumns) { + const size_t threads = 64; + size_t blocks; + + const size_t nnz = roundUpDivision(valuesSize, rows); // non zeroes per row + const size_t neededWarps = roundUpDivision(nnz, ELEMENTS_PER_WARP); + const size_t offset = neededWarps * ELEMENTS_PER_WARP; + size_t neededThreads = offset * rows; + for (size_t grid = 0; neededThreads != 0; ++grid) { + if (MAX_X_DIM * threads >= neededThreads) { + blocks = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } else { + blocks = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; + } + + SpMVCSRMultiVector<<>>( + inVector, + outVector, + rowPointers, + columnIndexes, + values, + rows, + getColumns, + ELEMENTS_PER_WARP, + offset, + grid + ); + } } + +template< typename Real, + typename Index, + typename Device, + CSRKernel KernelType, + int warpSize > +void SpMVCSRAdaptivePrepare( const Real *inVector, + Real* outVector, + const CSR< Real, Device, Index, KernelType >& matrix, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index getColumns) { + /* Configuration ---------------------------------------------------*/ + constexpr size_t SHARED = 49152/sizeof(Real); + constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; + //-------------------------------------------------------------------- + size_t blocks; + const size_t threads = THREADS_PER_BLOCK; + std::vector inBlock; + inBlock.push_back(0); + size_t sum = 0; + int i, prev_i = 0; + + for (i = 1; i < rows - 1; ++i) { + size_t elements = matrix.getRowPointers().getElement(i) - + matrix.getRowPointers().getElement(i - 1); + sum += elements; + if (sum > SHARED_PER_WARP) { + if (i - prev_i == 1) { + inBlock.push_back(i); + } else { + inBlock.push_back(i - 1); + --i; + } + sum = 0; + prev_i = i; + continue; + } + if (i - prev_i == 32) { + inBlock.push_back(i); + prev_i = i; + sum = 0; + } + } + inBlock.push_back(rows); + + /* blocks to GPU */ + int *blocksAdaptive; + cudaMalloc((void **)&blocksAdaptive, sizeof(int) * inBlock.size()); + cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(int), cudaMemcpyHostToDevice); + + size_t neededThreads = inBlock.size() * 32; + for (size_t grid = 0; neededThreads != 0; ++i) { + if (MAX_X_DIM * threads >= neededThreads) { + blocks = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } else { + blocks = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; + } + SpMVCSRAdaptiveGlobal<<>>( + inVector, + outVector, + rowPointers, + columnIndexes, + values, + blocksAdaptive, + inBlock.size(), + getColumns, + grid + ); + } +} + #endif template<> @@ -1281,7 +1629,6 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& #endif } - #ifdef HAVE_CUSPARSE template<> class tnlCusparseCSRWrapper< float, int > @@ -1400,100 +1747,124 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else - // if (KernelType == CSRAdaptive) { - /* Configuration ---------------------------------------------------*/ - constexpr size_t SHARED = 49152/sizeof(Real); - constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; - //-------------------------------------------------------------------- - /* Fill in blocks */ - std::vector inBlock; - inBlock.push_back(0); - size_t sum = 0; - int i; - int prev_i = 0; - for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { - size_t elements = matrix.getRowPointers().getElement(i) - - matrix.getRowPointers().getElement(i - 1); - sum += elements; - if (sum > SHARED_PER_WARP) { - if (i - prev_i == 1) { - inBlock.push_back(i); - } else { - inBlock.push_back(i - 1); - --i; - } - sum = 0; - prev_i = i; - continue; - } - if (i - prev_i == 32) { - inBlock.push_back(i); - prev_i = i; - sum = 0; - } - } - inBlock.push_back(matrix.getRowPointers().getSize() - 1); - /* Copy memory to GPU */ - const InVector *kernelInVector = Cuda::passToDevice( inVector ); - OutVector *kernelOutVector = Cuda::passToDevice( outVector ); - - /* blocks to GPU */ - int *kernelBlocks; - cudaMalloc((void **)&kernelBlocks, sizeof(int) * inBlock.size()); - cudaMemcpy(kernelBlocks, inBlock.data(), inBlock.size() * sizeof(int), cudaMemcpyHostToDevice); - - /* values to GPU */ - Real *kernelValues; - cudaMalloc((void **)&kernelValues, sizeof(Real) * matrix.getValues().getSize()); - cudaMemcpy(kernelValues, - (Real *)matrix.getValues().getData(), - matrix.getValues().getSize() * sizeof(Real), - cudaMemcpyHostToDevice); - - /* columns to GPU */ - Index *kernelColumns; - cudaMalloc((void **)&kernelColumns, sizeof(Index) * matrix.getColumnIndexes().getSize()); - cudaMemcpy(kernelColumns, - (Index *)matrix.getColumnIndexes().getData(), - matrix.getColumnIndexes().getSize() * sizeof(Index), - cudaMemcpyHostToDevice); - - /* row pointers to GPU */ - Index *kernelRowPointers; - cudaMalloc((void **)&kernelRowPointers, sizeof(Index) * matrix.getRowPointers().getSize()); - cudaMemcpy(kernelRowPointers, - (Index *)matrix.getRowPointers().getData(), - matrix.getRowPointers().getSize() * sizeof(Index), - cudaMemcpyHostToDevice); - - size_t needed_threads = 32 * (inBlock.size() - 1); // number of threads we need - size_t blocks = needed_threads / THREADS_PER_BLOCK; // warp per block - blocks = needed_threads % THREADS_PER_BLOCK ? blocks + 1 : blocks; - SpMVCSRAdaptiveGlobal< Real, Index, InVector, OutVector, 32 ><<>>( - *kernelInVector, - *kernelOutVector, - kernelRowPointers, - kernelColumns, - kernelValues, - kernelBlocks, - inBlock.size(), - matrix.getColumns() - ); - - /* Free memory */ - Cuda::freeFromDevice( kernelInVector ); - Cuda::freeFromDevice( kernelOutVector ); - cudaFree(kernelBlocks); - cudaFree(kernelValues); - cudaFree(kernelColumns); - cudaFree(kernelRowPointers); - - // } else + /* in vector to GPU */ + Real *kernelInVector; + cudaMalloc((void **)&kernelInVector, sizeof(Real) * inVector.getSize()); + cudaMemcpy(kernelInVector, + (Real *)inVector.getData(), + inVector.getSize() * sizeof(Real), + cudaMemcpyHostToDevice); + + /* out vector to GPU */ + Real *kernelOutVector; + cudaMalloc((void **)&kernelOutVector, sizeof(Real) * outVector.getSize()); + cudaMemcpy(kernelOutVector, + (Real *)outVector.getData(), + outVector.getSize() * sizeof(Real), + cudaMemcpyHostToDevice); + + /* values to GPU */ + Real *kernelValues; + cudaMalloc((void **)&kernelValues, sizeof(Real) * matrix.getValues().getSize()); + cudaMemcpy(kernelValues, + (Real *)matrix.getValues().getData(), + matrix.getValues().getSize() * sizeof(Real), + cudaMemcpyHostToDevice); + + /* columns to GPU */ + Index *kernelColumns; + cudaMalloc((void **)&kernelColumns, sizeof(Index) * matrix.getColumnIndexes().getSize()); + cudaMemcpy(kernelColumns, + (Index *)matrix.getColumnIndexes().getData(), + matrix.getColumnIndexes().getSize() * sizeof(Index), + cudaMemcpyHostToDevice); + + /* row pointers to GPU */ + Index *kernelRowPointers; + cudaMalloc((void **)&kernelRowPointers, sizeof(Index) * matrix.getRowPointers().getSize()); + cudaMemcpy(kernelRowPointers, + (Index *)matrix.getRowPointers().getData(), + matrix.getRowPointers().getSize() * sizeof(Index), + cudaMemcpyHostToDevice); + + switch(KernelType) + { + case CSRScalar: + SpMVCSRScalarPrepare( + kernelInVector, + kernelOutVector, + kernelRowPointers, + kernelColumns, + kernelValues, + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRVector: + SpMVCSRVectorPrepare( + kernelInVector, + kernelOutVector, + kernelRowPointers, + kernelColumns, + kernelValues, + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRLight: + SpMVCSRLightPrepare( + kernelInVector, + kernelOutVector, + kernelRowPointers, + kernelColumns, + kernelValues, + matrix.getValues().getSize(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRAdaptive: + SpMVCSRAdaptivePrepare( + kernelInVector, + kernelOutVector, + matrix, + kernelRowPointers, + kernelColumns, + kernelValues, + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRMultiVector: + SpMVCSRMultiVectorPrepare( + kernelInVector, + kernelOutVector, + kernelRowPointers, + kernelColumns, + kernelValues, + matrix.getValues().getSize(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + } + + /* Copy results */ + cudaMemcpy(outVector.getData(), + kernelOutVector, + outVector.getSize() * sizeof(Real), + cudaMemcpyDeviceToHost); + + /* Free memory */ + cudaFree(kernelInVector); + cudaFree(kernelOutVector); + cudaFree(kernelValues); + cudaFree(kernelColumns); + cudaFree(kernelRowPointers); + #endif /* HAVE_CUDA */ - // CSRVectorProductCuda( matrix, inVector, outVector); #endif } - }; } //namespace Legacy -- GitLab From f2833fcaf208f91915ae81eb15fb51e8fa6ded32 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Sun, 21 Jun 2020 00:06:27 +0200 Subject: [PATCH 09/57] Added CSRLightWithoutAtomic, small optimizations --- src/TNL/Matrices/Legacy/CSR.h | 21 +- src/TNL/Matrices/Legacy/CSR_impl.h | 497 ++++++++++++++++------------- 2 files changed, 273 insertions(+), 245 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 25ddca7cf..49ae6da11 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -31,7 +31,8 @@ class CusparseCSR; template< typename Device > class CSRDeviceDependentCode; -enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream, CSRMultiVector }; +enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, + CSRAdaptive, CSRMultiVector, CSRLightWithoutAtomic }; template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar > class CSR : public Sparse< Real, Device, Index > @@ -226,24 +227,6 @@ public: void spmvCudaVectorized( const InVector& inVector, OutVector& outVector, const IndexType gridIdx ) const; - - template< typename InVector, - typename OutVector, - int warpSize > - __device__ - void spmvCudaLightSpmv( const InVector& inVector, - OutVector& outVector, - int gridIdx) const; - - template< typename InVector, - typename OutVector, - int warpSize > - __device__ - void spmvCSRAdaptive( const InVector& inVector, - OutVector& outVector, - int gridIdx, - int *blocks, - size_t blocks_size) const; #endif // The following getters allow us to interface TNL with external C-like diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 6af6f8565..d49e526b8 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -30,11 +30,8 @@ constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; constexpr size_t MAX_X_DIM = 2147483647; constexpr size_t MAX_GRID_SIZE = MAX_X_DIM * THREADS_PER_BLOCK; /* CSR DYNAMIC VECTOR */ -constexpr size_t MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic -constexpr size_t ELEMENTS_PER_WARP = 1024; // how many elements should process new warp - -/* CSR Light SPMV */ -constexpr size_t THREADS_PER_ROW = 4; // how many elements should process new warp +constexpr int MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic +constexpr int ELEMENTS_PER_WARP = 1024; // how many elements should process new warp //------------------------------------- namespace TNL { @@ -742,54 +739,41 @@ template< typename Real, typename OutVector, int warpSize > __device__ -void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector& inVector, - OutVector& outVector, - int gridIdx) const +void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector, + OutVector& outVector, + const IndexType gridIdx ) const { - const IndexType index = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const IndexType laneID = index % warpSize; - const IndexType groupID = laneID / THREADS_PER_ROW; - const IndexType inGroupID = laneID % THREADS_PER_ROW; - - IndexType row, minID, column, maxID, idxMtx; - __shared__ unsigned rowCnt; - - if (index == 0) rowCnt = 0; // Init shared variable - __syncthreads(); - - while (true) { - - /* Get row number */ - if (inGroupID == 0) row = atomicAdd(&rowCnt, 1); - - /* Propagate row number in group */ - row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * THREADS_PER_ROW); - - if (row >= this->rowPointers.getSize() - 1) - return; - - minID = this->rowPointers[row]; - maxID = this->rowPointers[row + 1]; - - Real result = 0.0; - - idxMtx = minID + inGroupID; - while (idxMtx < maxID) { - column = this->columnIndexes[idxMtx]; - if (column >= this->getColumns()) - break; + IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const IndexType warpStart = warpSize * ( globalIdx / warpSize ); + const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); + const IndexType inWarpIdx = globalIdx % warpSize; - result += this->values[idxMtx] * inVector[column]; - idxMtx += THREADS_PER_ROW; - } + volatile Real* aux = Cuda::getSharedMemory< Real >(); + for( IndexType row = warpStart; row < warpEnd; row++ ) + { + aux[ threadIdx.x ] = 0.0; - /* Parallel reduction */ - for (int i = THREADS_PER_ROW / 2; i > 0; i /= 2) - result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); - /* Write result */ - if (inGroupID == 0) { - outVector[row] = result; + IndexType elementPtr = this->rowPointers[ row ] + inWarpIdx; + const IndexType rowEnd = this->rowPointers[ row + 1 ]; + IndexType column; + while( elementPtr < rowEnd && + ( column = this->columnIndexes[ elementPtr ] ) < this->getColumns() ) + { + aux[ threadIdx.x ] += inVector[ column ] * this->values[ elementPtr ]; + elementPtr += warpSize; } + if( warpSize == 32 ) + if( inWarpIdx < 16 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 16 ]; + if( warpSize >= 16 ) + if( inWarpIdx < 8 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 8 ]; + if( warpSize >= 8 ) + if( inWarpIdx < 4 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 4 ]; + if( warpSize >= 4 ) + if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ]; + if( warpSize >= 2 ) + if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ]; + if( inWarpIdx == 0 ) + outVector[ row ] = aux[ threadIdx.x ]; } } @@ -889,26 +873,26 @@ void spmvCSRVectorHelper(const Real *inVector, const Real *values, const Index getColumns, Real *out, - size_t from, - size_t to, - size_t perWarp) + const Index from, + const Index to, + const Index perWarp) { - const size_t index = blockIdx.x * blockDim.x + threadIdx.x; - const size_t warpID = index / warpSize; - const size_t laneID = index % warpSize; - const size_t minID = from + warpID * perWarp; - size_t maxID = from + (warpID + 1) * perWarp; + const Index index = blockIdx.x * blockDim.x + threadIdx.x; + const Index warpID = index / warpSize; + const Index minID = from + warpID * perWarp; + Index maxID = from + (warpID + 1) * perWarp; if (minID >= to) return; if (maxID >= to ) maxID = to; - - Real result = 0; - for (size_t i = minID + laneID; i < maxID; i += warpSize) { - const size_t column = columnIndexes[i]; - if (column >= getColumns) + + const Index laneID = index % warpSize; + + Real result = 0.0; + for (Index i = minID + laneID; i < maxID; i += warpSize) { + if (columnIndexes[i] >= getColumns) break; - result += values[i] * inVector[column]; + result += values[i] * inVector[columnIndexes[i]]; } - + atomicAdd(out, result); } @@ -921,59 +905,59 @@ void SpMVCSRAdaptiveGlobal( const Real *inVector, const Index* rowPointers, const Index* columnIndexes, const Real* values, - int *blocks, - size_t blocks_size, + Index *blocks, + Index blocks_size, Index getColumns, - size_t gridID) + Index gridID) { /* Configuration ---------------------------------------------------*/ - constexpr size_t SHARED = 49152/sizeof(Real); // number of elements in shared memory for block - constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; + constexpr Index SHARED = 49152/sizeof(Real); // number of elements in shared memory for block + constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; //-------------------------------------------------------------------- - const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; - const size_t laneID = index % warpSize; - const size_t blockIdx = index / warpSize; __shared__ Real shared_res[SHARED]; + const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const Index blockIdx = index / warpSize; Real result = 0; - if (blockIdx >= blocks_size - 1) + if (blockIdx >= blocks_size) return; - const size_t minRow = blocks[blockIdx]; - const size_t maxRow = blocks[blockIdx + 1]; - const size_t minID = rowPointers[minRow]; - const size_t maxID = rowPointers[maxRow]; - const size_t elements = maxID - minID; + + const Index laneID = index % warpSize; + const Index minRow = blocks[blockIdx]; + const Index maxRow = blocks[blockIdx + 1]; + const Index minID = rowPointers[minRow]; + const Index maxID = rowPointers[maxRow]; + const Index elements = maxID - minID; + Index i; /* rows per block more than 1 */ if ((maxRow - minRow) > 1) { /////////////////////////////////////* CSR STREAM *////////////// /* Copy and calculate elements from global to shared memory, coalesced */ - const size_t offset = threadIdx.x / warpSize * SHARED_PER_WARP; - for (size_t i = laneID; i < elements; i += warpSize) { - const size_t elementIdx = i + minID; - const size_t column = columnIndexes[elementIdx]; - if (column >= getColumns) + const Index offset = threadIdx.x / warpSize * SHARED_PER_WARP; + for (i = laneID; i < elements; i += warpSize) { + const Index elementIdx = i + minID; + if (columnIndexes[elementIdx] >= getColumns) continue; - - shared_res[i + offset] = values[elementIdx] * inVector[column]; + + shared_res[i + offset] = values[elementIdx] * inVector[columnIndexes[elementIdx]]; } - const size_t row = minRow + laneID; + const Index row = minRow + laneID; if (row >= maxRow) return; /* Calculate result */ - const size_t to = rowPointers[row + 1] - minID; - for (size_t i = rowPointers[row] - minID; i < to; ++i) { + const Index to = rowPointers[row + 1] - minID; + for (i = rowPointers[row] - minID; i < to; ++i) { result += shared_res[i + offset]; } outVector[row] = result; // Write result } else if (elements <= MAX_PER_WARP) { /////////////////////////////////////* CSR VECTOR *////////////// - for (size_t i = minID + laneID; i < maxID; i += warpSize) { - size_t column = columnIndexes[i]; - if (column >= getColumns) + for (i = minID + laneID; i < maxID; i += warpSize) { + if (columnIndexes[i] >= getColumns) break; - result += values[i] * inVector[column]; + result += values[i] * inVector[columnIndexes[i]]; } /* Reduction */ result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); @@ -985,14 +969,11 @@ void SpMVCSRAdaptiveGlobal( const Real *inVector, } else { // too long row /////////////////////////////////////* CSR DYNAMIC VECTOR *////////////// - + /* Number of warps we need. This warp can be used to calculate result too, -1 warp */ - size_t warps = elements / ELEMENTS_PER_WARP; - warps = elements % ELEMENTS_PER_WARP ? warps : warps - 1; - - size_t blocks = warps / WARPS_PER_BLOCK; - blocks = warps % WARPS_PER_BLOCK ? blocks + 1 : blocks; + const Index warps = roundUpDivision(elements, ELEMENTS_PER_WARP) - 1; + const Index blocks = roundUpDivision(warps, WARPS_PER_BLOCK); /* Execute a lot of CSR Vector */ if (laneID == 0) { @@ -1007,66 +988,19 @@ void SpMVCSRAdaptiveGlobal( const Real *inVector, ELEMENTS_PER_WARP ); } + /* CSR Vector */ - for (size_t i = minID + laneID; i < minID + ELEMENTS_PER_WARP; i += warpSize) { - size_t column = columnIndexes[i]; - if (column >= getColumns) + for (i = minID + laneID; i < minID + ELEMENTS_PER_WARP; i += warpSize) { + if (columnIndexes[i] >= getColumns) break; - result += values[i] * inVector[column]; + result += values[i] * inVector[columnIndexes[i]]; } /* Write result */ atomicAdd(&outVector[minRow], result); } } - -template< typename Real, - typename Device, - typename Index, - CSRKernel KernelType > - template< typename InVector, - typename OutVector, - int warpSize > -__device__ -void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector, - OutVector& outVector, - const IndexType gridIdx ) const -{ - IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const IndexType warpStart = warpSize * ( globalIdx / warpSize ); - const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); - const IndexType inWarpIdx = globalIdx % warpSize; - - volatile Real* aux = Cuda::getSharedMemory< Real >(); - for( IndexType row = warpStart; row < warpEnd; row++ ) - { - aux[ threadIdx.x ] = 0.0; - - IndexType elementPtr = this->rowPointers[ row ] + inWarpIdx; - const IndexType rowEnd = this->rowPointers[ row + 1 ]; - IndexType column; - while( elementPtr < rowEnd && - ( column = this->columnIndexes[ elementPtr ] ) < this->getColumns() ) - { - aux[ threadIdx.x ] += inVector[ column ] * this->values[ elementPtr ]; - elementPtr += warpSize; - } - if( warpSize == 32 ) - if( inWarpIdx < 16 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 16 ]; - if( warpSize >= 16 ) - if( inWarpIdx < 8 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 8 ]; - if( warpSize >= 8 ) - if( inWarpIdx < 4 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 4 ]; - if( warpSize >= 4 ) - if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ]; - if( warpSize >= 2 ) - if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ]; - if( inWarpIdx == 0 ) - outVector[ row ] = aux[ threadIdx.x ]; - } -} - template< typename Real, typename Index, int warpSize > @@ -1078,22 +1012,20 @@ void SpMVCSRScalar( const Real *inVector, const Real* values, const Index rows, const Index getColumns, - const size_t gridID) + const Index gridID) { - const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; if (index >= rows) return; Real result = 0.0; - const size_t startID = rowPointers[index]; - const size_t endID = rowPointers[index + 1]; + const Index endID = rowPointers[index + 1]; - for (size_t i = startID; i < endID; ++i) { - const size_t column = columnIndexes[i]; - if (column >= getColumns) + for (Index i = rowPointers[index]; i < endID; ++i) { + if (columnIndexes[i] >= getColumns) break; - - result += values[i] * inVector[column]; + + result += values[i] * inVector[columnIndexes[i]]; } outVector[index] = result; @@ -1110,24 +1042,22 @@ void SpMVCSRMultiVector( const Real *inVector, const Real* values, const Index rows, const Index getColumns, - const int perWarp, - const int offset, - const int gridID) + const Index perWarp, + const Index offset, + const Index gridID) { - const int index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; - const int laneID = index % warpSize; - const int rowID = index / offset; + const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const Index rowID = index / offset; if (rowID >= rows) return; - const int inRowID = index % offset; + + const Index inRowID = index % offset; Real result = 0.0; - // size_t startID = rowPointers[rowID] + inRowID; - int endID = rowPointers[rowID + 1]; + Index endID = rowPointers[rowID + 1]; /* Calculate result */ - for (int i = rowPointers[rowID] + inRowID; i < endID; i += offset) { - // size_t column = columnIndexes[i]; + for (Index i = rowPointers[rowID] + inRowID; i < endID; i += offset) { if (columnIndexes[i] >= getColumns) break; @@ -1141,7 +1071,7 @@ void SpMVCSRMultiVector( const Real *inVector, result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); /* Write result */ - if (laneID == 0) atomicAdd(&outVector[rowID], result); + if (index % warpSize == 0) atomicAdd(&outVector[rowID], result); } template< typename Real, @@ -1155,25 +1085,23 @@ void SpMVCSRVector( const Real *inVector, const Real* values, const Index rows, const Index getColumns, - const size_t gridID) + const Index gridID) { - const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; - const size_t warpID = index / warpSize; - const size_t laneID = index % warpSize; + const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const Index warpID = index / warpSize; if (warpID >= rows) return; + const Index laneID = index % warpSize; Real result = 0.0; - size_t startID = rowPointers[warpID] + laneID; - size_t endID = rowPointers[warpID + 1]; + Index endID = rowPointers[warpID + 1]; /* Calculate result */ - for (size_t i = startID; i < endID; i += warpSize) { - size_t column = columnIndexes[i]; - if (column >= getColumns) + for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) { + if (columnIndexes[i] >= getColumns) break; - - result += values[i] * inVector[column]; + + result += values[i] * inVector[columnIndexes[i]]; } /* Reduction */ @@ -1197,14 +1125,14 @@ void SpMVCSRLight( const Real *inVector, const Real* values, const Index rows, const Index getColumns, - const size_t groupSize, - const size_t gridID) { - const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; - const size_t laneID = index % warpSize; - const size_t groupID = laneID / groupSize; - const size_t inGroupID = laneID % groupSize; - - size_t row, minID, column, maxID, idxMtx; + const Index groupSize, + const Index gridID) { + const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const Index laneID = index % warpSize; + const Index groupID = laneID / groupSize; + const Index inGroupID = laneID % groupSize; + + Index row, minID, column, maxID, idxMtx; __shared__ unsigned rowCnt; if (index == 0) rowCnt = 0; // Init shared variable @@ -1236,7 +1164,7 @@ void SpMVCSRLight( const Real *inVector, } /* Parallel reduction */ - for (size_t i = groupSize / 2; i > 0; i /= 2) + for (Index i = groupSize / 2; i > 0; i /= 2) result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); /* Write result */ if (inGroupID == 0) @@ -1244,6 +1172,46 @@ void SpMVCSRLight( const Real *inVector, } } +template< typename Real, + typename Index, + int warpSize > +__global__ +void SpMVCSRLightWithoutAtomic( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index getColumns, + const Index groupSize, + const Index gridID) { + const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const Index row = index / groupSize; + Index i; + + if (row >= rows) + return; + + const Index inGroupID = index % groupSize; + const Index minID = rowPointers[row]; + const Index maxID = rowPointers[row + 1]; + + Real result = 0.0; + for (i = minID + inGroupID; i < maxID; i += groupSize) { + Index column = columnIndexes[i]; + if (column >= getColumns) + break; + + result += values[i] * inVector[column]; + } + + /* Parallel reduction */ + for (i = groupSize / 2; i > 0; i /= 2) + result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); + + /* Write result */ + if (inGroupID == 0) outVector[row] = result; +} template< typename Real, typename Index, @@ -1255,11 +1223,11 @@ void SpMVCSRScalarPrepare( const Real *inVector, const Real* values, const Index rows, const Index getColumns) { - const size_t threads = 64; + const Index threads = 64; size_t neededThreads = rows; - size_t blocks; + Index blocks; - for (size_t grid = 0; neededThreads != 0; ++grid) { + for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); neededThreads = 0; @@ -1291,11 +1259,11 @@ void SpMVCSRVectorPrepare( const Real *inVector, const Real* values, const Index rows, const Index getColumns) { - const size_t threads = 64; + const Index threads = 64; size_t neededThreads = rows * warpSize; - size_t blocks; + Index blocks; - for (size_t grid = 0; neededThreads != 0; ++grid) { + for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); neededThreads = 0; @@ -1325,14 +1293,14 @@ void SpMVCSRLightPrepare( const Real *inVector, const Index* rowPointers, const Index* columnIndexes, const Real* values, - const size_t valuesSize, + const Index valuesSize, const Index rows, const Index getColumns) { - const size_t threads = 64; + const Index threads = 64; size_t neededThreads = rows * warpSize; - size_t blocks, groupSize; + Index blocks, groupSize; - const size_t nnz = roundUpDivision(valuesSize, rows); // non zeroes per row + const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row if (nnz <= 2) groupSize = 2; else if (nnz <= 4) @@ -1346,7 +1314,7 @@ void SpMVCSRLightPrepare( const Real *inVector, neededThreads = groupSize * rows; - for (size_t grid = 0; neededThreads != 0; ++grid) { + for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); neededThreads = 0; @@ -1369,6 +1337,58 @@ void SpMVCSRLightPrepare( const Real *inVector, } } +template< typename Real, + typename Index, + int warpSize > +void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index valuesSize, + const Index rows, + const Index getColumns) { + const Index threads = 64; + size_t neededThreads = rows * warpSize; + Index blocks, groupSize; + + const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row + if (nnz <= 2) + groupSize = 2; + else if (nnz <= 4) + groupSize = 4; + else if (nnz <= 8) + groupSize = 8; + else if (nnz <= 16) + groupSize = 16; + else + groupSize = 32; + + neededThreads = groupSize * rows; + + for (Index grid = 0; neededThreads != 0; ++grid) { + if (MAX_X_DIM * threads >= neededThreads) { + blocks = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } else { + blocks = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; + } + + SpMVCSRLightWithoutAtomic<<>>( + inVector, + outVector, + rowPointers, + columnIndexes, + values, + rows, + getColumns, + groupSize, + grid + ); + } +} + template< typename Real, typename Index, int warpSize > @@ -1377,17 +1397,17 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, const Index* rowPointers, const Index* columnIndexes, const Real* values, - const size_t valuesSize, + const Index valuesSize, const Index rows, const Index getColumns) { - const size_t threads = 64; - size_t blocks; + const Index threads = 64; + Index blocks; - const size_t nnz = roundUpDivision(valuesSize, rows); // non zeroes per row - const size_t neededWarps = roundUpDivision(nnz, ELEMENTS_PER_WARP); - const size_t offset = neededWarps * ELEMENTS_PER_WARP; + const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row + const size_t neededWarps = roundUpDivision(nnz, ELEMENTS_PER_WARP); // warps per row + const Index offset = neededWarps * ELEMENTS_PER_WARP; size_t neededThreads = offset * rows; - for (size_t grid = 0; neededThreads != 0; ++grid) { + for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); neededThreads = 0; @@ -1396,7 +1416,8 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, neededThreads -= MAX_X_DIM * threads; } - SpMVCSRMultiVector<<>>( + if (neededWarps == 1) { // one warp per warp -> execute CSR Vector + SpMVCSRVector<<>>( inVector, outVector, rowPointers, @@ -1404,10 +1425,22 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, values, rows, getColumns, - ELEMENTS_PER_WARP, - offset, grid - ); + ); + } else { + SpMVCSRMultiVector<<>>( + inVector, + outVector, + rowPointers, + columnIndexes, + values, + rows, + getColumns, + ELEMENTS_PER_WARP, + offset, + grid + ); + } } } @@ -1425,18 +1458,18 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, const Index rows, const Index getColumns) { /* Configuration ---------------------------------------------------*/ - constexpr size_t SHARED = 49152/sizeof(Real); - constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; + constexpr Index SHARED = 49152/sizeof(Real); + constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; //-------------------------------------------------------------------- - size_t blocks; - const size_t threads = THREADS_PER_BLOCK; - std::vector inBlock; + Index blocks; + const Index threads = THREADS_PER_BLOCK; + std::vector inBlock; inBlock.push_back(0); - size_t sum = 0; - int i, prev_i = 0; + Index sum = 0; + Index i, prev_i = 0; for (i = 1; i < rows - 1; ++i) { - size_t elements = matrix.getRowPointers().getElement(i) - + Index elements = matrix.getRowPointers().getElement(i) - matrix.getRowPointers().getElement(i - 1); sum += elements; if (sum > SHARED_PER_WARP) { @@ -1450,7 +1483,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, prev_i = i; continue; } - if (i - prev_i == 32) { + if (i - prev_i == warpSize) { inBlock.push_back(i); prev_i = i; sum = 0; @@ -1459,12 +1492,12 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, inBlock.push_back(rows); /* blocks to GPU */ - int *blocksAdaptive; - cudaMalloc((void **)&blocksAdaptive, sizeof(int) * inBlock.size()); - cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(int), cudaMemcpyHostToDevice); + Index *blocksAdaptive; + cudaMalloc((void **)&blocksAdaptive, sizeof(Index) * inBlock.size()); + cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(Index), cudaMemcpyHostToDevice); size_t neededThreads = inBlock.size() * 32; - for (size_t grid = 0; neededThreads != 0; ++i) { + for (Index grid = 0; neededThreads != 0; ++i) { if (MAX_X_DIM * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); neededThreads = 0; @@ -1479,7 +1512,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, columnIndexes, values, blocksAdaptive, - inBlock.size(), + inBlock.size() - 1, // -1 here is better than -1 in kernel getColumns, grid ); @@ -1847,6 +1880,18 @@ class CSRDeviceDependentCode< Devices::Cuda > matrix.getColumns() ); break; + case CSRLightWithoutAtomic: + SpMVCSRLightPrepare( + kernelInVector, + kernelOutVector, + kernelRowPointers, + kernelColumns, + kernelValues, + matrix.getValues().getSize(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; } /* Copy results */ -- GitLab From 6d63a8b60ecb77052f9020b65e11f9b3bb951efe Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Sun, 21 Jun 2020 00:33:26 +0200 Subject: [PATCH 10/57] Fixed script and benchmark --- src/Benchmarks/SpMV/spmv-legacy.h | 14 +++-- .../scripts/tnl-spmv-benchmark-make-tables.py | 58 +++++++++---------- src/TNL/Matrices/MatrixInfo.h | 12 +++- 3 files changed, 46 insertions(+), 38 deletions(-) diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h index ff1cdacaf..30f702ae1 100644 --- a/src/Benchmarks/SpMV/spmv-legacy.h +++ b/src/Benchmarks/SpMV/spmv-legacy.h @@ -89,7 +89,10 @@ template< typename Real, typename Device, typename Index > using SparseMatrixLegacy_CSR_Adaptive = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRAdaptive >; template< typename Real, typename Device, typename Index > -using SparseMatrixLegacy_CSR_Stream = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRStream >; +using SparseMatrixLegacy_CSR_MultiVector = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRMultiVector >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_LightWithoutAtomic = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLightWithoutAtomic >; // Get the name (with extension) of input matrix file std::string getMatrixFileName( const String& InputFileName ) @@ -292,10 +295,11 @@ benchmarkSpmvSynthetic( Benchmark& benchmark, #endif benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, hostOutVector, inputFileName, verboseMR ); - //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector >( benchmark, hostOutVector, inputFileName, verboseMR ); - //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light >( benchmark, hostOutVector, inputFileName, verboseMR ); - //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive >( benchmark, hostOutVector, inputFileName, verboseMR ); - //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Stream >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_MultiVector>( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic>( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py index c7e733d8e..639093df3 100755 --- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py +++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py @@ -62,15 +62,12 @@ df.sort_index(axis=1, inplace=True) df.drop(columns=('BiEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) df.drop(columns=('BiEllpack', 'CPU','speedup'), axis=1, inplace=True ) df.drop(columns=('CSR', 'CPU','speedup'), axis=1, inplace=True ) - -#df.drop(columns=('CSR Legacy Adaptive', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy Light', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy LightWithoutAtomic', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy Scalar', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy Stream', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy Vector', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy MultiVector', 'CPU','speedup'), axis=1, inplace=True ) - +df.drop(columns=('CSR Legacy Adaptive', 'CPU','speedup'), axis=1, inplace=True ) +df.drop(columns=('CSR Legacy Light', 'CPU','speedup'), axis=1, inplace=True ) +df.drop(columns=('CSR Legacy Scalar', 'CPU','speedup'), axis=1, inplace=True ) +df.drop(columns=('CSR Legacy LightWithoutAtomic', 'CPU','speedup'), axis=1, inplace=True ) +df.drop(columns=('CSR Legacy MultiVector', 'CPU','speedup'), axis=1, inplace=True ) +df.drop(columns=('CSR Legacy Vector', 'CPU','speedup'), axis=1, inplace=True ) df.drop(columns=('ChunkedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) df.drop(columns=('Ellpack', 'CPU','speedup'), axis=1, inplace=True ) df.drop(columns=('Ellpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) @@ -84,28 +81,27 @@ df.drop(columns=('SlicedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True print( "Computing speed-up of formats...") # Add speedup compared to CSR and cuSparse - -df["BiEllpack Legacy", "CPU", "CSR speedup"] = df["BiEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["BiEllpack Legacy", "GPU", "cuSparse speedup"] = df["BiEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["BiEllpack", "CPU", "CSR speedup"] = df["BiEllpack", "CPU", "time"] / df["CSR", "CPU", "time"] -df["BiEllpacky", "GPU", "cuSparse speedup"] = df["BiEllpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR", "GPU", "cuSparse speedup"] = df["CSR", "GPU", "time"] / df["cuSparse", "GPU", "time"] -#df["CSR Legacy Adaptive", "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive", "GPU", "time"] / df["cuSparse", "GPU", "time"] -#df["CSR Legacy Light", "GPU", "cuSparse speedup"] = df["CSR Legacy Light", "GPU", "time"] / df["cuSparse", "GPU", "time"] -#df["CSR Legacy LightWithoutAtomic", "GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic", "GPU", "time"] / df["cuSparse", "GPU", "time"] -#df["CSR Legacy Scalar", "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar", "GPU", "time"] / df["cuSparse", "GPU", "time"] -#df["CSR Legacy Vector", "GPU", "cuSparse speedup"] = df["CSR Legacy Vector", "GPU", "time"] / df["cuSparse", "GPU", "time"] -#df["CSR Legacy MultiVector", "GPU", "cuSparse speedup"] = df["CSR Legacy MultiVector", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["ChunkedEllpack Legacy", "CPU", "CSR speedup"] = df["ChunkedEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["ChunkedEllpack Legacy", "GPU", "cuSparse speedup"] = df["ChunkedEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["Ellpack Legacy", "CPU", "CSR speedup"] = df["Ellpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["Ellpack Legacy", "GPU", "cuSparse speedup"] = df["Ellpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["Ellpack", "CPU", "CSR speedup"] = df["Ellpack", "CPU", "time"] / df["CSR", "CPU", "time"] -df["Ellpack", "GPU", "cuSparse speedup"] = df["Ellpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["SlicedEllpack Legacy", "CPU", "CSR speedup"] = df["SlicedEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["SlicedEllpack Legacy", "GPU", "cuSparse speedup"] = df["SlicedEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["SlicedEllpack", "CPU", "CSR speedup"] = df["SlicedEllpack", "CPU", "time"] / df["CSR", "CPU", "time"] -df["SlicedEllpack", "GPU", "cuSparse speedup"] = df["SlicedEllpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["BiEllpack Legacy", "CPU", "CSR speedup"] = df["BiEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] +df["BiEllpack Legacy", "GPU", "cuSparse speedup"] = df["BiEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["BiEllpack", "CPU", "CSR speedup"] = df["BiEllpack", "CPU", "time"] / df["CSR", "CPU", "time"] +df["BiEllpacky", "GPU", "cuSparse speedup"] = df["BiEllpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR", "GPU", "cuSparse speedup"] = df["CSR", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Adaptive", "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Light", "GPU", "cuSparse speedup"] = df["CSR Legacy Light", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Scalar", "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy LightWithoutAtomic","GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic","GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy MultiVector","GPU", "cuSparse speedup"] = df["CSR Legacy MultiVector","GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Vector", "GPU", "cuSparse speedup"] = df["CSR Legacy Vector", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["ChunkedEllpack Legacy", "CPU", "CSR speedup"] = df["ChunkedEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] +df["ChunkedEllpack Legacy", "GPU", "cuSparse speedup"] = df["ChunkedEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["Ellpack Legacy", "CPU", "CSR speedup"] = df["Ellpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] +df["Ellpack Legacy", "GPU", "cuSparse speedup"] = df["Ellpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["Ellpack", "CPU", "CSR speedup"] = df["Ellpack", "CPU", "time"] / df["CSR", "CPU", "time"] +df["Ellpack", "GPU", "cuSparse speedup"] = df["Ellpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["SlicedEllpack Legacy", "CPU", "CSR speedup"] = df["SlicedEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] +df["SlicedEllpack Legacy", "GPU", "cuSparse speedup"] = df["SlicedEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["SlicedEllpack", "CPU", "CSR speedup"] = df["SlicedEllpack", "CPU", "time"] / df["CSR", "CPU", "time"] +df["SlicedEllpack", "GPU", "cuSparse speedup"] = df["SlicedEllpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] # Add speedup compared to legacy formats df["CSR", "GPU", "Legacy speedup"] = df["CSR", "GPU", "time"] / df["CSR Legacy Scalar", "GPU", "time"] diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h index 8e0870848..fa39bfdda 100644 --- a/src/TNL/Matrices/MatrixInfo.h +++ b/src/TNL/Matrices/MatrixInfo.h @@ -122,11 +122,19 @@ struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRAdaptive > > }; template< typename Real, typename Device, typename Index > -struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRStream > > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRMultiVector > > { static String getDensity() { return String( "sparse" ); }; - static String getFormat() { return "CSR Legacy Stream"; }; + static String getFormat() { return "CSR Legacy MultiVector"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLightWithoutAtomic > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy LightWithoutAtomic"; }; }; template< typename Real, typename Device, typename Index > -- GitLab From 8e5d5b1a869665130ff028ad59ac33e535b10008 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Sun, 21 Jun 2020 15:46:27 +0200 Subject: [PATCH 11/57] Bug fixes --- src/TNL/Matrices/Legacy/CSR_impl.h | 60 ++++++++++++++++-------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index d49e526b8..b84e04d22 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -900,15 +900,15 @@ template< typename Real, typename Index, int warpSize > __global__ -void SpMVCSRAdaptiveGlobal( const Real *inVector, - Real *outVector, - const Index* rowPointers, - const Index* columnIndexes, - const Real* values, - Index *blocks, - Index blocks_size, - Index getColumns, - Index gridID) +void SpMVCSRAdaptive( const Real *inVector, + Real *outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + Index *blocks, + Index blocks_size, + Index getColumns, + Index gridID) { /* Configuration ---------------------------------------------------*/ constexpr Index SHARED = 49152/sizeof(Real); // number of elements in shared memory for block @@ -1126,22 +1126,19 @@ void SpMVCSRLight( const Real *inVector, const Index rows, const Index getColumns, const Index groupSize, - const Index gridID) { + const Index gridID, + unsigned *rowCnt) { const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index laneID = index % warpSize; const Index groupID = laneID / groupSize; const Index inGroupID = laneID % groupSize; - Index row, minID, column, maxID, idxMtx; - __shared__ unsigned rowCnt; - - if (index == 0) rowCnt = 0; // Init shared variable - __syncthreads(); + Index row, minID, maxID, i; while (true) { /* Get row number */ - if (inGroupID == 0) row = atomicAdd(&rowCnt, 1); + if (inGroupID == 0) row = atomicAdd(rowCnt, 1); /* Propagate row number in group */ row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * groupSize); @@ -1153,14 +1150,11 @@ void SpMVCSRLight( const Real *inVector, Real result = 0.0; - idxMtx = minID + inGroupID; - while (idxMtx < maxID) { - column = columnIndexes[idxMtx]; - if (column >= getColumns) + for (i = minID + inGroupID; i < maxID; i += groupSize) { + if (columnIndexes[i] >= getColumns) break; - result += values[idxMtx] * inVector[column]; - idxMtx += groupSize; + result += values[i] * inVector[columnIndexes[i]]; } /* Parallel reduction */ @@ -1198,11 +1192,10 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector, Real result = 0.0; for (i = minID + inGroupID; i < maxID; i += groupSize) { - Index column = columnIndexes[i]; - if (column >= getColumns) + if (columnIndexes[i] >= getColumns) break; - result += values[i] * inVector[column]; + result += values[i] * inVector[columnIndexes[i]]; } /* Parallel reduction */ @@ -1299,6 +1292,12 @@ void SpMVCSRLightPrepare( const Real *inVector, const Index threads = 64; size_t neededThreads = rows * warpSize; Index blocks, groupSize; + /* Copy rowCnt to GPU */ + unsigned rowCnt = 0; + unsigned *kernelRowCnt; + cudaMalloc((void **)&kernelRowCnt, sizeof(*kernelRowCnt)); + cudaMemcpy(kernelRowCnt, &rowCnt, sizeof(*kernelRowCnt), cudaMemcpyHostToDevice); + const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row if (nnz <= 2) @@ -1332,9 +1331,12 @@ void SpMVCSRLightPrepare( const Real *inVector, rows, getColumns, groupSize, - grid + grid, + kernelRowCnt ); } + + cudaFree(kernelRowCnt); } template< typename Real, @@ -1505,7 +1507,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, blocks = MAX_X_DIM; neededThreads -= MAX_X_DIM * threads; } - SpMVCSRAdaptiveGlobal<<>>( + SpMVCSRAdaptive<<>>( inVector, outVector, rowPointers, @@ -1517,6 +1519,8 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, grid ); } + + cudaFree(blocksAdaptive); } #endif @@ -1881,7 +1885,7 @@ class CSRDeviceDependentCode< Devices::Cuda > ); break; case CSRLightWithoutAtomic: - SpMVCSRLightPrepare( + SpMVCSRLightWithoutAtomicPrepare( kernelInVector, kernelOutVector, kernelRowPointers, -- GitLab From 3cbaf4573c4aa1a2b8286d9ec4d6cfba9e008040 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Mon, 22 Jun 2020 22:17:05 +0200 Subject: [PATCH 12/57] CSR Adaptive optimizations --- src/TNL/Matrices/Legacy/CSR_impl.h | 92 ++++++++++++++++-------------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index b84e04d22..c48bb7ced 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -23,16 +23,7 @@ #include #endif -/* CONFIGURATION */ -constexpr size_t WARP_SIZE = 32; -constexpr size_t THREADS_PER_BLOCK = 1024; -constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; constexpr size_t MAX_X_DIM = 2147483647; -constexpr size_t MAX_GRID_SIZE = MAX_X_DIM * THREADS_PER_BLOCK; -/* CSR DYNAMIC VECTOR */ -constexpr int MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic -constexpr int ELEMENTS_PER_WARP = 1024; // how many elements should process new warp -//------------------------------------- namespace TNL { namespace Matrices { @@ -880,8 +871,9 @@ void spmvCSRVectorHelper(const Real *inVector, const Index index = blockIdx.x * blockDim.x + threadIdx.x; const Index warpID = index / warpSize; const Index minID = from + warpID * perWarp; - Index maxID = from + (warpID + 1) * perWarp; if (minID >= to) return; + + Index maxID = from + (warpID + 1) * perWarp; if (maxID >= to ) maxID = to; const Index laneID = index % warpSize; @@ -908,13 +900,11 @@ void SpMVCSRAdaptive( const Real *inVector, Index *blocks, Index blocks_size, Index getColumns, - Index gridID) + Index gridID, + const Index sharedPerWarp, + const Index maxPerWarp) { - /* Configuration ---------------------------------------------------*/ - constexpr Index SHARED = 49152/sizeof(Real); // number of elements in shared memory for block - constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; - //-------------------------------------------------------------------- - __shared__ Real shared_res[SHARED]; + extern __shared__ Real shared_res[]; const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; Real result = 0; @@ -926,32 +916,36 @@ void SpMVCSRAdaptive( const Real *inVector, const Index maxRow = blocks[blockIdx + 1]; const Index minID = rowPointers[minRow]; const Index maxID = rowPointers[maxRow]; - const Index elements = maxID - minID; - Index i; + Index i, to; /* rows per block more than 1 */ if ((maxRow - minRow) > 1) { /////////////////////////////////////* CSR STREAM *////////////// /* Copy and calculate elements from global to shared memory, coalesced */ - const Index offset = threadIdx.x / warpSize * SHARED_PER_WARP; - for (i = laneID; i < elements; i += warpSize) { - const Index elementIdx = i + minID; - if (columnIndexes[elementIdx] >= getColumns) - continue; - - shared_res[i + offset] = values[elementIdx] * inVector[columnIndexes[elementIdx]]; + const Index offset = threadIdx.x / warpSize * sharedPerWarp; + Index elementID = laneID + minID; + Index sharedID = laneID + offset; // index for shared memory + for (; elementID < maxID; elementID += warpSize, sharedID += warpSize) { + if (columnIndexes[elementID] >= getColumns) + continue; // can't be break + shared_res[sharedID] = values[elementID] * inVector[columnIndexes[elementID]]; } const Index row = minRow + laneID; if (row >= maxRow) return; + /* Calculate result */ - const Index to = rowPointers[row + 1] - minID; - for (i = rowPointers[row] - minID; i < to; ++i) { - result += shared_res[i + offset]; - } + sharedID = rowPointers[row] - minID + offset; // start of preprocessed results in shared memory + to = rowPointers[row + 1] - minID + offset; // end of preprocessed data + for (; sharedID < to; ++sharedID) + result += shared_res[sharedID]; + outVector[row] = result; // Write result + return; } - else if (elements <= MAX_PER_WARP) { + + const Index elements = maxID - minID; + if (elements <= maxPerWarp) { /////////////////////////////////////* CSR VECTOR *////////////// for (i = minID + laneID; i < maxID; i += warpSize) { if (columnIndexes[i] >= getColumns) @@ -969,7 +963,9 @@ void SpMVCSRAdaptive( const Real *inVector, } else { // too long row /////////////////////////////////////* CSR DYNAMIC VECTOR *////////////// - + constexpr Index THREADS_PER_BLOCK = 1024; + constexpr Index ELEMENTS_PER_WARP = 1024; + constexpr Index WARPS_PER_BLOCK = ELEMENTS_PER_WARP / warpSize; /* Number of warps we need. This warp can be used to calculate result too, -1 warp */ const Index warps = roundUpDivision(elements, ELEMENTS_PER_WARP) - 1; @@ -990,7 +986,8 @@ void SpMVCSRAdaptive( const Real *inVector, } /* CSR Vector */ - for (i = minID + laneID; i < minID + ELEMENTS_PER_WARP; i += warpSize) { + to = minID + ELEMENTS_PER_WARP; + for (i = minID + laneID; i < to; i += warpSize) { if (columnIndexes[i] >= getColumns) break; @@ -1132,7 +1129,6 @@ void SpMVCSRLight( const Real *inVector, const Index laneID = index % warpSize; const Index groupID = laneID / groupSize; const Index inGroupID = laneID % groupSize; - Index row, minID, maxID, i; while (true) { @@ -1216,7 +1212,7 @@ void SpMVCSRScalarPrepare( const Real *inVector, const Real* values, const Index rows, const Index getColumns) { - const Index threads = 64; + const Index threads = 256; size_t neededThreads = rows; Index blocks; @@ -1252,7 +1248,7 @@ void SpMVCSRVectorPrepare( const Real *inVector, const Real* values, const Index rows, const Index getColumns) { - const Index threads = 64; + const Index threads = 256; size_t neededThreads = rows * warpSize; Index blocks; @@ -1289,8 +1285,7 @@ void SpMVCSRLightPrepare( const Real *inVector, const Index valuesSize, const Index rows, const Index getColumns) { - const Index threads = 64; - size_t neededThreads = rows * warpSize; + const Index threads = 256; Index blocks, groupSize; /* Copy rowCnt to GPU */ unsigned rowCnt = 0; @@ -1311,7 +1306,7 @@ void SpMVCSRLightPrepare( const Real *inVector, else groupSize = 32; - neededThreads = groupSize * rows; + size_t neededThreads = groupSize * rows; for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { @@ -1350,7 +1345,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, const Index valuesSize, const Index rows, const Index getColumns) { - const Index threads = 64; + const Index threads = 256; size_t neededThreads = rows * warpSize; Index blocks, groupSize; @@ -1402,7 +1397,10 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, const Index valuesSize, const Index rows, const Index getColumns) { - const Index threads = 64; + /* Configuration */ + constexpr int ELEMENTS_PER_WARP = 1024; // how many elements should process every warp + //---------------------------------------------------------------------------------- + const Index threads = 256; Index blocks; const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row @@ -1418,7 +1416,7 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, neededThreads -= MAX_X_DIM * threads; } - if (neededWarps == 1) { // one warp per warp -> execute CSR Vector + if (neededWarps == 1) { // one warp per row -> execute CSR Vector SpMVCSRVector<<>>( inVector, outVector, @@ -1460,9 +1458,12 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, const Index rows, const Index getColumns) { /* Configuration ---------------------------------------------------*/ + constexpr size_t THREADS_PER_BLOCK = 1024; + constexpr Index WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32; constexpr Index SHARED = 49152/sizeof(Real); constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; - //-------------------------------------------------------------------- + constexpr Index MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic + //-------------------------------------------------------------------- Index blocks; const Index threads = THREADS_PER_BLOCK; std::vector inBlock; @@ -1507,7 +1508,8 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, blocks = MAX_X_DIM; neededThreads -= MAX_X_DIM * threads; } - SpMVCSRAdaptive<<>>( + + SpMVCSRAdaptive<<>>( inVector, outVector, rowPointers, @@ -1516,7 +1518,9 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, blocksAdaptive, inBlock.size() - 1, // -1 here is better than -1 in kernel getColumns, - grid + grid, + SHARED_PER_WARP, + MAX_PER_WARP ); } -- GitLab From 229e10b8e2ddf05405d660bd4844cd2b5979aed5 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Mon, 22 Jun 2020 23:05:33 +0200 Subject: [PATCH 13/57] Temporary deleted using of shared memory --- src/TNL/Matrices/Legacy/CSR_impl.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index c48bb7ced..619c7928b 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -904,7 +904,9 @@ void SpMVCSRAdaptive( const Real *inVector, const Index sharedPerWarp, const Index maxPerWarp) { - extern __shared__ Real shared_res[]; + // extern __shared__ Real shared_res[]; + constexpr Index SHARED = 49152/sizeof(Real); + __shared__ Real shared_res[SHARED]; const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; Real result = 0; @@ -1509,7 +1511,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, neededThreads -= MAX_X_DIM * threads; } - SpMVCSRAdaptive<<>>( + SpMVCSRAdaptive<<>>( inVector, outVector, rowPointers, -- GitLab From 62ce765eab44ce9f55d494dc6dd5b23663ca0f05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 30 Jun 2020 14:06:19 +0200 Subject: [PATCH 14/57] Fixing includes of std headers in ConfigEntryType. --- src/TNL/Config/ConfigEntryType.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/TNL/Config/ConfigEntryType.h b/src/TNL/Config/ConfigEntryType.h index 28f57a582..4e6544639 100644 --- a/src/TNL/Config/ConfigEntryType.h +++ b/src/TNL/Config/ConfigEntryType.h @@ -12,6 +12,8 @@ #pragma once +#include +#include #include #include #include -- GitLab From cc5dd60068e62e679fe4ef5d6693db5f97f91092 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 30 Jun 2020 22:16:45 +0200 Subject: [PATCH 15/57] Finishing rebase. --- src/TNL/Matrices/Legacy/CSR_impl.h | 198 ----------------------------- 1 file changed, 198 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 619c7928b..0cc3c312e 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -768,93 +768,6 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& } } -template< typename Real, - typename Device, - typename Index, - CSRKernel KernelType > - template< typename InVector, - typename OutVector, - int warpSize > -__device__ -void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& inVector, - OutVector& outVector, - int gridIdx, - int *blocks, - size_t blocks_size) const -{ - /* Configuration ---------------------------------------------------*/ - constexpr size_t SHARED = 49152/sizeof(Real); - constexpr size_t SHARED_PER_WARP = SHARED / warpSize; - constexpr size_t MAX_PER_WARP = 65536; - //constexpr size_t ELEMENTS_PER_WARP = 1024; - //constexpr size_t THREADS_PER_BLOCK = 1024; - //constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize; - //-------------------------------------------------------------------- - const size_t index = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const size_t laneID = index % warpSize; - size_t blockIdx = index / warpSize; - __shared__ Real shared_res[SHARED]; - Real result = 0.0; - if (blockIdx >= blocks_size - 1) - return; - const size_t minRow = blocks[blockIdx]; - const size_t maxRow = blocks[blockIdx + 1]; - const size_t minID = this->rowPointers[minRow]; - const size_t maxID = this->rowPointers[maxRow]; - const size_t elements = maxID - minID; - /* rows per block more than 1 */ - if ((maxRow - minRow) > 1) { - /////////////////////////////////////* CSR STREAM *////////////// - /* Copy and calculate elements from global to shared memory, coalesced */ - const size_t offset = threadIdx.x / warpSize * SHARED_PER_WARP; - for (size_t i = laneID; i < elements; i += warpSize) { - const size_t elementIdx = i + minID; - const size_t column = this->columnIndexes[elementIdx]; - if (column >= this->getColumns()) - continue; - shared_res[i + offset] = this->values[elementIdx] * inVector[column]; - } - - const size_t row = minRow + laneID; - if (row >= maxRow) - return; - /* Calculate result */ - const size_t to = this->rowPointers[row + 1] - minID; - for (size_t i = this->rowPointers[row] - minID; i < to; ++i) { - result += shared_res[i + offset]; - } - outVector[row] = result; // Write result - } else { - /////////////////////////////////////* CSR VECTOR *////////////// - for (size_t i = minID + laneID; i < maxID; i += warpSize) { - size_t column = this->columnIndexes[i]; - if (column >= this->getColumns()) - break; - - result += this->values[i] * inVector[column]; - } - /* Reduction */ - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); - if (laneID == 0) outVector[minRow] = result; // Write result - } else { - /////////////////////////////////////* CSR VECTOR LONG *////////////// - //const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1; - //const size_t blocks = warps <= WARPS_PER_BLOCK ? 1 : warps / WARPS_PER_BLOCK + 1; - //const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize; - // spmvCSRVectorHelper <<>>( - // inVector, - // &outVector[minRow], - // (size_t)(minID + ELEMENTS_PER_WARP), - // (size_t)maxID, - // (size_t)ELEMENTS_PER_WARP - // ); - } -} - template< typename Real, typename Index, int warpSize > @@ -1560,117 +1473,6 @@ class CSRDeviceDependentCode< Devices::Host > }; -#ifdef HAVE_CUDA - -template< typename Real, - typename Index, - CSRKernel KernelType, - typename InVector, - typename OutVector, - int warpSize > -__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix, - const InVector* inVector, - OutVector* outVector, - int gridIdx) -{ - typedef CSR< Real, Devices::Cuda, Index > Matrix; - static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); - const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( KernelType == CSRScalar ) - { - if( rowIdx < matrix->getRows() ) - ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); - } - else - { - matrix->template vectorProductCuda< InVector, OutVector, warpSize > - ( *inVector, *outVector, gridIdx ); - } -} -#endif - -template< typename Real, - typename Index, - CSRKernel KernelType, - typename InVector, - typename OutVector > -void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix, - const InVector& inVector, - OutVector& outVector) -{ -#ifdef HAVE_CUDA - typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix; - typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - TNL_CHECK_CUDA_DEVICE; - dim3 cudaBlockSize( 256 ); - //dim3 cudaGridSize( Cuda::getMaxGridSize() ); - const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - //if( gridIdx == cudaGrids - 1 ) - // cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - //const int sharedMemory = cudaBlockSize.x * sizeof( Real ); - //const int threads = cudaBlockSize.x; - if( matrix.getCudaWarpSize() == 32 ) { - // CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 > - // <<< 2, 1024 >>> - // ( kernel_this, - // kernel_inVector, - // kernel_outVector, - // gridIdx, kernelBlocks, size ); - CSRScalarGlobal< Real, Index, KernelType, InVector, OutVector, 32 > - <<< 2, 1024 >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx ); - if( matrix.getCudaWarpSize() == 16 ) - CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 16 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx); - if( matrix.getCudaWarpSize() == 8 ) - CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 8 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx); - if( matrix.getCudaWarpSize() == 4 ) - CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 4 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx); - if( matrix.getCudaWarpSize() == 2 ) - CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 2 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx); - if( matrix.getCudaWarpSize() == 1 ) - CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 1 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx); - } - TNL_CHECK_CUDA_DEVICE; - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_inVector ); - Cuda::freeFromDevice( kernel_outVector ); - TNL_CHECK_CUDA_DEVICE; -#endif -} #ifdef HAVE_CUSPARSE template<> -- GitLab From 1679b8d37291bb944c12b5bbec2b73bcfffbdedc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 1 Jul 2020 09:19:05 +0200 Subject: [PATCH 16/57] Fixed linking of libcudadevrt. --- src/Benchmarks/BLAS/CMakeLists.txt | 7 ++++--- src/Benchmarks/SpMV/CMakeLists.txt | 2 +- src/UnitTests/Matrices/Legacy/CMakeLists.txt | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Benchmarks/BLAS/CMakeLists.txt b/src/Benchmarks/BLAS/CMakeLists.txt index 9017a14fb..9743b3eae 100644 --- a/src/Benchmarks/BLAS/CMakeLists.txt +++ b/src/Benchmarks/BLAS/CMakeLists.txt @@ -1,7 +1,8 @@ if( BUILD_CUDA ) - cuda_add_executable( tnl-benchmark-blas tnl-benchmark-blas.cu ) - cuda_add_cublas_to_target( tnl-benchmark-blas ) - target_link_libraries( tnl-benchmark-blas ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a ) + #find_library( CUDADEVRT NAMES cudadevrt ) + cuda_add_executable( tnl-benchmark-blas tnl-benchmark-blas.cu ) + cuda_add_cublas_to_target( tnl-benchmark-blas ) + #target_link_libraries( tnl-benchmark-blas ${CUDADEVRT} )#${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a ) else() add_executable( tnl-benchmark-blas tnl-benchmark-blas.cpp ) endif() diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt index 7357a3492..7adbd8ffd 100644 --- a/src/Benchmarks/SpMV/CMakeLists.txt +++ b/src/Benchmarks/SpMV/CMakeLists.txt @@ -1,6 +1,6 @@ if( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu ) - TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a ) + TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} -lcudadevrt ) else() ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp ) endif() diff --git a/src/UnitTests/Matrices/Legacy/CMakeLists.txt b/src/UnitTests/Matrices/Legacy/CMakeLists.txt index d47b07e19..004971c13 100644 --- a/src/UnitTests/Matrices/Legacy/CMakeLists.txt +++ b/src/UnitTests/Matrices/Legacy/CMakeLists.txt @@ -15,7 +15,7 @@ IF( BUILD_CUDA ) TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_ChunkedEllpack ${GTEST_BOTH_LIBRARIES} ) CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_CSR SparseMatrixTest_CSR.cu OPTIONS ${CXX_TESTS_FLAGS} ) - TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a ) + TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} -lcudadevrt ) CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_Ellpack SparseMatrixTest_Ellpack.cu OPTIONS ${CXX_TESTS_FLAGS} ) TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_Ellpack ${GTEST_BOTH_LIBRARIES} ) -- GitLab From d5d832a48e9948cb7062386bbe5bb5d01a36f2da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 1 Jul 2020 11:42:39 +0200 Subject: [PATCH 17/57] Fixed script for processing results of SpMV benchmark. --- .../scripts/tnl-spmv-benchmark-make-tables.py | 173 +++++++++--------- 1 file changed, 86 insertions(+), 87 deletions(-) diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py index 639093df3..a11a40a08 100755 --- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py +++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py @@ -62,12 +62,11 @@ df.sort_index(axis=1, inplace=True) df.drop(columns=('BiEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) df.drop(columns=('BiEllpack', 'CPU','speedup'), axis=1, inplace=True ) df.drop(columns=('CSR', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('CSR Legacy Adaptive', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('CSR Legacy Light', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('CSR Legacy Scalar', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('CSR Legacy LightWithoutAtomic', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('CSR Legacy MultiVector', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('CSR Legacy Vector', 'CPU','speedup'), axis=1, inplace=True ) +#df.drop(columns=('CSR Legacy Adaptive', 'CPU','speedup'), axis=1, inplace=True ) +#df.drop(columns=('CSR Legacy Light', 'CPU','speedup'), axis=1, inplace=True ) +#df.drop(columns=('CSR Legacy Scalar', 'CPU','speedup'), axis=1, inplace=True ) +#df.drop(columns=('CSR Legacy Stream', 'CPU','speedup'), axis=1, inplace=True ) +#df.drop(columns=('CSR Legacy Vector', 'CPU','speedup'), axis=1, inplace=True ) df.drop(columns=('ChunkedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) df.drop(columns=('Ellpack', 'CPU','speedup'), axis=1, inplace=True ) df.drop(columns=('Ellpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) @@ -81,27 +80,27 @@ df.drop(columns=('SlicedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True print( "Computing speed-up of formats...") # Add speedup compared to CSR and cuSparse -df["BiEllpack Legacy", "CPU", "CSR speedup"] = df["BiEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["BiEllpack Legacy", "GPU", "cuSparse speedup"] = df["BiEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["BiEllpack", "CPU", "CSR speedup"] = df["BiEllpack", "CPU", "time"] / df["CSR", "CPU", "time"] -df["BiEllpacky", "GPU", "cuSparse speedup"] = df["BiEllpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR", "GPU", "cuSparse speedup"] = df["CSR", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Adaptive", "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Light", "GPU", "cuSparse speedup"] = df["CSR Legacy Light", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Scalar", "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy LightWithoutAtomic","GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic","GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy MultiVector","GPU", "cuSparse speedup"] = df["CSR Legacy MultiVector","GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Vector", "GPU", "cuSparse speedup"] = df["CSR Legacy Vector", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["ChunkedEllpack Legacy", "CPU", "CSR speedup"] = df["ChunkedEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["ChunkedEllpack Legacy", "GPU", "cuSparse speedup"] = df["ChunkedEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["Ellpack Legacy", "CPU", "CSR speedup"] = df["Ellpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["Ellpack Legacy", "GPU", "cuSparse speedup"] = df["Ellpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["Ellpack", "CPU", "CSR speedup"] = df["Ellpack", "CPU", "time"] / df["CSR", "CPU", "time"] -df["Ellpack", "GPU", "cuSparse speedup"] = df["Ellpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["SlicedEllpack Legacy", "CPU", "CSR speedup"] = df["SlicedEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["SlicedEllpack Legacy", "GPU", "cuSparse speedup"] = df["SlicedEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["SlicedEllpack", "CPU", "CSR speedup"] = df["SlicedEllpack", "CPU", "time"] / df["CSR", "CPU", "time"] -df["SlicedEllpack", "GPU", "cuSparse speedup"] = df["SlicedEllpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["BiEllpack Legacy", "CPU", "CSR speedup"] = df["BiEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] +df["BiEllpack Legacy", "GPU", "cuSparse speedup"] = df["BiEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["BiEllpack", "CPU", "CSR speedup"] = df["BiEllpack", "CPU", "time"] / df["CSR", "CPU", "time"] +df["BiEllpacky", "GPU", "cuSparse speedup"] = df["BiEllpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR", "GPU", "cuSparse speedup"] = df["CSR", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Adaptive", "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Light", "GPU", "cuSparse speedup"] = df["CSR Legacy Light", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy LightWithoutAtomic", "GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Scalar", "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Vector", "GPU", "cuSparse speedup"] = df["CSR Legacy Vector", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy MultiVector", "GPU", "cuSparse speedup"] = df["CSR Legacy MultiVector", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["ChunkedEllpack Legacy", "CPU", "CSR speedup"] = df["ChunkedEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] +df["ChunkedEllpack Legacy", "GPU", "cuSparse speedup"] = df["ChunkedEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["Ellpack Legacy", "CPU", "CSR speedup"] = df["Ellpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] +df["Ellpack Legacy", "GPU", "cuSparse speedup"] = df["Ellpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["Ellpack", "CPU", "CSR speedup"] = df["Ellpack", "CPU", "time"] / df["CSR", "CPU", "time"] +df["Ellpack", "GPU", "cuSparse speedup"] = df["Ellpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["SlicedEllpack Legacy", "CPU", "CSR speedup"] = df["SlicedEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] +df["SlicedEllpack Legacy", "GPU", "cuSparse speedup"] = df["SlicedEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["SlicedEllpack", "CPU", "CSR speedup"] = df["SlicedEllpack", "CPU", "time"] / df["CSR", "CPU", "time"] +df["SlicedEllpack", "GPU", "cuSparse speedup"] = df["SlicedEllpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] # Add speedup compared to legacy formats df["CSR", "GPU", "Legacy speedup"] = df["CSR", "GPU", "time"] / df["CSR Legacy Scalar", "GPU", "time"] @@ -120,12 +119,12 @@ df.to_html("log.html") # extract columns of reference formats on GPU print( "Preparing data for graph analysis..." ) df['cuSparse-bandwidth' ] = df[ 'cuSparse','GPU','bandwidth'] -#df['csr-legacy-adaptive-bandwidth' ] = df[ 'CSR Legacy Adaptive','GPU','bandwidth'] -#df['csr-legacy-light-bandwidth' ] = df[ 'CSR Legacy Light','GPU','bandwidth'] -#df['csr-legacy-light-without-atomic-bandwidth' ] = df[ 'CSR Legacy LightWithoutAtomic','GPU','bandwidth'] -#df['csr-legacy-scalar-bandwidth' ] = df[ 'CSR Legacy Scalar','GPU','bandwidth'] -#df['csr-legacy-vector-bandwidth' ] = df[ 'CSR Legacy Vector','GPU','bandwidth'] -#df['csr-legacy-multi-vector-bandwidth' ] = df[ 'CSR Legacy MultiVector','GPU','bandwidth'] +df['csr-legacy-adaptive-bandwidth' ] = df[ 'CSR Legacy Adaptive','GPU','bandwidth'] +df['csr-legacy-light-bandwidth' ] = df[ 'CSR Legacy Light','GPU','bandwidth'] +df['csr-legacy-light-without-atomic-bandwidth' ] = df[ 'CSR Legacy LightWithoutAtomic','GPU','bandwidth'] +df['csr-legacy-scalar-bandwidth' ] = df[ 'CSR Legacy Scalar','GPU','bandwidth'] +df['csr-legacy-vector-bandwidth' ] = df[ 'CSR Legacy Vector','GPU','bandwidth'] +df['csr-legacy-multi-vector-bandwidth' ] = df[ 'CSR Legacy MultiVector','GPU','bandwidth'] df['ellpack-bandwidth' ] = df[ 'Ellpack','GPU','bandwidth'] df['sliced-ellpack-bandwidth' ] = df[ 'SlicedEllpack','GPU','bandwidth'] df['chunked-ellpack-bandwidth' ] = df[ 'ChunkedEllpack','GPU','bandwidth'] @@ -134,12 +133,12 @@ df['bi-ellpack-bandwidth' ] = df[ 'BiEllpack','GPU','bandwi # sort by cuSparse df.sort_values(by=["cuSparse-bandwidth"],inplace=True,ascending=False) cuSparse_list = df['cuSparse-bandwidth'].tolist() -#cuSparse_csr_legacy_adaptive_gpu_list = df[ "CSR Legacy Adaptive", "GPU", "bandwidth"].tolist(); -#cuSparse_csr_legacy_light_gpu_list = df[ "CSR Legacy Light", "GPU", "bandwidth"].tolist(); -#cuSparse_csr_legacy_light_without_atomic_gpu_list = df[ "CSR Legacy LightWithoutAtomic", "GPU", "bandwidth"].tolist(); -#cuSparse_csr_legacy_scalar_gpu_list = df[ "CSR Legacy Scalar", "GPU", "bandwidth"].tolist(); -#cuSparse_csr_legacy_vector_gpu_list = df[ "CSR Legacy Vector", "GPU", "bandwidth"].tolist(); -#cuSparse_csr_legacy_multivector_gpu_list = df[ "CSR Legacy MultiVector", "GPU", "bandwidth"].tolist(); +cuSparse_csr_legacy_adaptive_gpu_list = df[ "CSR Legacy Adaptive", "GPU", "bandwidth"].tolist(); +cuSparse_csr_legacy_light_gpu_list = df[ "CSR Legacy Light", "GPU", "bandwidth"].tolist(); +cuSparse_csr_legacy_light_without_atomic_gpu_list = df[ "CSR Legacy LightWithoutAtomic", "GPU", "bandwidth"].tolist(); +cuSparse_csr_legacy_scalar_gpu_list = df[ "CSR Legacy Scalar", "GPU", "bandwidth"].tolist(); +cuSparse_csr_legacy_vector_gpu_list = df[ "CSR Legacy Vector", "GPU", "bandwidth"].tolist(); +cuSparse_csr_legacy_multivector_gpu_list = df[ "CSR Legacy MultiVector", "GPU", "bandwidth"].tolist(); cuSparse_ellpack_gpu_list = df[ "Ellpack", "GPU", "bandwidth"].tolist(); cuSparse_ellpack_legacy_gpu_list = df[ "Ellpack Legacy", "GPU", "bandwidth"].tolist(); cuSparse_sliced_ellpack_gpu_list = df[ "SlicedEllpack", "GPU", "bandwidth"].tolist(); @@ -178,12 +177,12 @@ cuSparse_file = open( "cusparse.gplt", "w" ) i = 0 for x in cuSparse_list: if str( x ) != "nan": - if ( #str( cuSparse_csr_legacy_adaptive_gpu_list[ i ] ) != "nan" and - #str( cuSparse_csr_legacy_light_gpu_list[ i ] ) != "nan" and - #str( cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ] ) != "nan" and - #str( cuSparse_csr_legacy_scalar_gpu_list[ i ] ) != "nan" and - #str( cuSparse_csr_legacy_vector_gpu_list[ i ] ) != "nan" and - #str( cuSparse_csr_legacy_multivector_gpu_list[ i ] ) != "nan" and + if ( str( cuSparse_csr_legacy_adaptive_gpu_list[ i ] ) != "nan" and + str( cuSparse_csr_legacy_light_gpu_list[ i ] ) != "nan" and + str( cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ] ) != "nan" and + str( cuSparse_csr_legacy_scalar_gpu_list[ i ] ) != "nan" and + str( cuSparse_csr_legacy_vector_gpu_list[ i ] ) != "nan" and + str( cuSparse_csr_legacy_multivector_gpu_list[ i ] ) != "nan" and str( cuSparse_ellpack_gpu_list[ i ] ) != "nan" and str( cuSparse_ellpack_legacy_gpu_list[ i ] ) != "nan" and str( cuSparse_sliced_ellpack_gpu_list[ i ] ) != "nan" and @@ -193,12 +192,12 @@ for x in cuSparse_list: str( cuSparse_bi_ellpack_gpu_list[ i ] ) != "nan" and str( cuSparse_bi_ellpack_legacy_gpu_list[ i ] ) != "nan" ): cuSparse_file.write( f"{i+1} {x} " ) # 1 2 - cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_adaptive_gpu_list[ i ]} " ) # 3 - cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_light_gpu_list[ i ]} " ) # 4 - cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ]} " ) # 5 - cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_scalar_gpu_list[ i ]} " ) # 6 - cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_vector_gpu_list[ i ]} " ) # 7 - cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_multivector_gpu_list[ i ]} " ) # 8 + cuSparse_file.write( f"{cuSparse_csr_legacy_adaptive_gpu_list[ i ]} " ) # 3 + cuSparse_file.write( f"{cuSparse_csr_legacy_light_gpu_list[ i ]} " ) # 4 + cuSparse_file.write( f"{cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ]} " ) # 5 + cuSparse_file.write( f"{cuSparse_csr_legacy_scalar_gpu_list[ i ]} " ) # 6 + cuSparse_file.write( f"{cuSparse_csr_legacy_vector_gpu_list[ i ]} " ) # 7 + cuSparse_file.write( f"{cuSparse_csr_legacy_multivector_gpu_list[ i ]} " ) # 8 cuSparse_file.write( f"{cuSparse_ellpack_gpu_list[ i ]} {cuSparse_ellpack_legacy_gpu_list[ i ]} " ) # 9 10 cuSparse_file.write( f"{cuSparse_sliced_ellpack_gpu_list[ i ]} {cuSparse_sliced_ellpack_legacy_gpu_list[ i ]} " ) # 11 12 cuSparse_file.write( f"{cuSparse_chunked_ellpack_gpu_list[ i ]} {cuSparse_chunked_ellpack_legacy_gpu_list[ i ]} " ) # 13 14 @@ -252,36 +251,36 @@ set grid set xlabel 'Matrix' set xtics 250 set ylabel 'Bandwidth GB/sec' -#set output 'csr-legacy-adaptive-vs-cusparse.eps' -#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ -# 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ -# 'cusparse.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'green', \ -# 'cusparse.gplt' using 1:3 title 'CSR Legacy Adaptive' with lines linewidth 0.5 lt rgb 'green', -#set output 'csr-legacy-light-vs-cusparse.eps' -#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ -# 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ -# 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ -# 'cusparse.gplt' using 1:4 title 'CSR Legacy Light' with lines linewidth 0.5 lt rgb 'green', -#set output 'csr-legacy-light-without-atomic-vs-cusparse.eps' -#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ -# 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ -# 'cusparse.gplt' using 1:5 title '' with dots linewidth 2 lt rgb 'green', \ -# 'cusparse.gplt' using 1:5 title 'CSR Legacy LightWithoutAtomic' with lines linewidth 0.5 lt rgb 'green', -#set output 'csr-legacy-scalar-vs-cusparse.eps' -#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ -# 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ -# 'cusparse.gplt' using 1:6 title '' with dots linewidth 2 lt rgb 'green', \ -# 'cusparse.gplt' using 1:6 title 'CSR Legacy Scalar' with lines linewidth 0.5 lt rgb 'green', -#set output 'csr-legacy-vector-vs-cusparse.eps' -#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ -# 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ -# 'cusparse.gplt' using 1:7 title '' with dots linewidth 2 lt rgb 'green', \ -# 'cusparse.gplt' using 1:7 title 'CSR Legacy Vector' with lines linewidth 0.5 lt rgb 'green', -#set output 'csr-legacy-multivector-vs-cusparse.eps' -#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ -# 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ -# 'cusparse.gplt' using 1:8 title '' with dots linewidth 2 lt rgb 'green', \ -# 'cusparse.gplt' using 1:8 title 'CSR Legacy MultiVector' with lines linewidth 0.5 lt rgb 'green', +set output 'csr-legacy-adaptive-vs-cusparse.eps' +plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ + 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ + 'cusparse.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:3 title 'CSR Legacy Adaptive' with lines linewidth 0.5 lt rgb 'green', +set output 'csr-legacy-light-vs-cusparse.eps' +plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ + 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ + 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:4 title 'CSR Legacy Light' with lines linewidth 0.5 lt rgb 'green', +set output 'csr-legacy-light-without-atomic-vs-cusparse.eps' +plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ + 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ + 'cusparse.gplt' using 1:5 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:5 title 'CSR Legacy LightWithoutAtomic' with lines linewidth 0.5 lt rgb 'green', +set output 'csr-legacy-scalar-vs-cusparse.eps' +plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ + 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ + 'cusparse.gplt' using 1:6 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:6 title 'CSR Legacy Scalar' with lines linewidth 0.5 lt rgb 'green', +set output 'csr-legacy-vector-vs-cusparse.eps' +plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ + 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ + 'cusparse.gplt' using 1:7 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:7 title 'CSR Legacy Vector' with lines linewidth 0.5 lt rgb 'green', +set output 'csr-legacy-multivector-vs-cusparse.eps' +plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ + 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ + 'cusparse.gplt' using 1:8 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:8 title 'CSR Legacy MultiVector' with lines linewidth 0.5 lt rgb 'green', set output 'ellpack-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ @@ -337,12 +336,12 @@ print( "Executing Gnuplot ..." ) os.system( "gnuplot gnuplot.gplt" ) print( "Converting files to PDF ..." ) -#os.system( "epstopdf --autorotate All csr-legacy-adaptive-vs-cusparse.eps" ) -#os.system( "epstopdf --autorotate All csr-legacy-light-vs-cusparse.eps" ) -#os.system( "epstopdf --autorotate All csr-legacy-light-without-atomic-vs-cusparse.eps" ) -#os.system( "epstopdf --autorotate All csr-legacy-scalar-vs-cusparse.eps" ) -#os.system( "epstopdf --autorotate All csr-legacy-vector-vs-cusparse.eps" ) -#os.system( "epstopdf --autorotate All csr-legacy-multivector-vs-cusparse.eps" ) +os.system( "epstopdf --autorotate All csr-legacy-adaptive-vs-cusparse.eps" ) +os.system( "epstopdf --autorotate All csr-legacy-light-vs-cusparse.eps" ) +os.system( "epstopdf --autorotate All csr-legacy-light-without-atomic-vs-cusparse.eps" ) +os.system( "epstopdf --autorotate All csr-legacy-scalar-vs-cusparse.eps" ) +os.system( "epstopdf --autorotate All csr-legacy-vector-vs-cusparse.eps" ) +os.system( "epstopdf --autorotate All csr-legacy-multivector-vs-cusparse.eps" ) os.system( "epstopdf --autorotate All ellpack-vs-cusparse.eps" ) os.system( "epstopdf --autorotate All sliced-ellpack-vs-cusparse.eps" ) os.system( "epstopdf --autorotate All chunked-ellpack-vs-cusparse.eps" ) -- GitLab From 088d44daaaa7846c17c391fb755c09e8c9fafc27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 1 Jul 2020 14:16:58 +0200 Subject: [PATCH 18/57] One more fix of linking -lcudadevrt. --- src/Benchmarks/SpMV/CMakeLists.txt | 2 +- src/UnitTests/Matrices/Legacy/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt index 7adbd8ffd..6af696534 100644 --- a/src/Benchmarks/SpMV/CMakeLists.txt +++ b/src/Benchmarks/SpMV/CMakeLists.txt @@ -1,6 +1,6 @@ if( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu ) - TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} -lcudadevrt ) + TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} ) else() ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp ) endif() diff --git a/src/UnitTests/Matrices/Legacy/CMakeLists.txt b/src/UnitTests/Matrices/Legacy/CMakeLists.txt index 004971c13..2e7297cce 100644 --- a/src/UnitTests/Matrices/Legacy/CMakeLists.txt +++ b/src/UnitTests/Matrices/Legacy/CMakeLists.txt @@ -15,7 +15,7 @@ IF( BUILD_CUDA ) TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_ChunkedEllpack ${GTEST_BOTH_LIBRARIES} ) CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_CSR SparseMatrixTest_CSR.cu OPTIONS ${CXX_TESTS_FLAGS} ) - TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} -lcudadevrt ) + TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} ${CUDA_cudadevrt_LIBRARY} ) CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_Ellpack SparseMatrixTest_Ellpack.cu OPTIONS ${CXX_TESTS_FLAGS} ) TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_Ellpack ${GTEST_BOTH_LIBRARIES} ) -- GitLab From 91bd4f88d8d996fb9b5fb59599713dc02a516591 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 7 Jul 2020 14:18:56 +0200 Subject: [PATCH 19/57] Fixed linking of DistributedMatrixTest. --- src/UnitTests/Matrices/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt index c88f565eb..778ab29bd 100644 --- a/src/UnitTests/Matrices/CMakeLists.txt +++ b/src/UnitTests/Matrices/CMakeLists.txt @@ -137,7 +137,7 @@ if( ${BUILD_MPI} ) if( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( DistributedMatrixTest DistributedMatrixTest.cu OPTIONS ${CXX_TESTS_FLAGS} ) - TARGET_LINK_LIBRARIES( DistributedMatrixTest ${GTEST_BOTH_LIBRARIES} ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a ) + TARGET_LINK_LIBRARIES( DistributedMatrixTest ${GTEST_BOTH_LIBRARIES} ${CUDA_cudadevrt_LIBRARY} ) else() ADD_EXECUTABLE( DistributedMatrixTest DistributedMatrixTest.cpp ) TARGET_COMPILE_OPTIONS( DistributedMatrixTest PRIVATE ${CXX_TESTS_FLAGS} ) -- GitLab From 8c3add461623f7417fa8f538ee65195ab04a6ce6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 7 Jul 2020 14:43:03 +0200 Subject: [PATCH 20/57] Added exceptions handling to tnl-benchmark-spmv. --- src/Benchmarks/SpMV/tnl-benchmark-spmv.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h index d8e2003fb..82e1f12cd 100644 --- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h +++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h @@ -25,6 +25,7 @@ #include using namespace TNL::Matrices; +#include #include // Used for file naming, so logs don't get overwritten. using namespace TNL; @@ -44,7 +45,12 @@ runSpMVBenchmarks( Benchmark & benchmark, benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")", metadata ); // Start the actual benchmark in spmv.h - SpMVLegacy::benchmarkSpmvSynthetic< Real >( benchmark, inputFileName, verboseMR ); + try { + SpMVLegacy::benchmarkSpmvSynthetic< Real >( benchmark, inputFileName, verboseMR ); + } + catch( const std::exception& ex ) { + std::cerr << ex.what() << std::endl; + } } // Get current date time to have different log files names and avoid overwriting. -- GitLab From 49590408168302ddf8d867b722542a109a1b33d5 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Wed, 1 Jul 2020 23:49:47 +0200 Subject: [PATCH 21/57] Increased block sizes, optimizations for CSR Light, new logic for CSR Dynamic --- src/TNL/Matrices/Legacy/CSR_impl.h | 81 +++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 24 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 0cc3c312e..deb9483aa 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -817,9 +817,7 @@ void SpMVCSRAdaptive( const Real *inVector, const Index sharedPerWarp, const Index maxPerWarp) { - // extern __shared__ Real shared_res[]; - constexpr Index SHARED = 49152/sizeof(Real); - __shared__ Real shared_res[SHARED]; + __shared__ Real shared_res[49152/sizeof(Real)]; const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; Real result = 0; @@ -878,7 +876,6 @@ void SpMVCSRAdaptive( const Real *inVector, } else { // too long row /////////////////////////////////////* CSR DYNAMIC VECTOR *////////////// - constexpr Index THREADS_PER_BLOCK = 1024; constexpr Index ELEMENTS_PER_WARP = 1024; constexpr Index WARPS_PER_BLOCK = ELEMENTS_PER_WARP / warpSize; /* Number of warps we need. @@ -888,7 +885,7 @@ void SpMVCSRAdaptive( const Real *inVector, /* Execute a lot of CSR Vector */ if (laneID == 0) { - spmvCSRVectorHelper <<>>( + spmvCSRVectorHelper <<>>( inVector, columnIndexes, values, @@ -1127,10 +1124,10 @@ void SpMVCSRScalarPrepare( const Real *inVector, const Real* values, const Index rows, const Index getColumns) { - const Index threads = 256; + const Index threads = 1024; // block size size_t neededThreads = rows; Index blocks; - + /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); @@ -1163,10 +1160,10 @@ void SpMVCSRVectorPrepare( const Real *inVector, const Real* values, const Index rows, const Index getColumns) { - const Index threads = 256; + const Index threads = 1024; // block size size_t neededThreads = rows * warpSize; Index blocks; - + /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); @@ -1200,7 +1197,7 @@ void SpMVCSRLightPrepare( const Real *inVector, const Index valuesSize, const Index rows, const Index getColumns) { - const Index threads = 256; + const Index threads = 1024; // block size Index blocks, groupSize; /* Copy rowCnt to GPU */ unsigned rowCnt = 0; @@ -1222,7 +1219,7 @@ void SpMVCSRLightPrepare( const Real *inVector, groupSize = 32; size_t neededThreads = groupSize * rows; - + /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); @@ -1232,7 +1229,19 @@ void SpMVCSRLightPrepare( const Real *inVector, neededThreads -= MAX_X_DIM * threads; } - SpMVCSRLight<<>>( + if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector + SpMVCSRVector<<>>( + inVector, + outVector, + rowPointers, + columnIndexes, + values, + rows, + getColumns, + grid + ); + } else { + SpMVCSRLight<<>>( inVector, outVector, rowPointers, @@ -1243,7 +1252,8 @@ void SpMVCSRLightPrepare( const Real *inVector, groupSize, grid, kernelRowCnt - ); + ); + } } cudaFree(kernelRowCnt); @@ -1260,7 +1270,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, const Index valuesSize, const Index rows, const Index getColumns) { - const Index threads = 256; + const Index threads = 1024; // block size size_t neededThreads = rows * warpSize; Index blocks, groupSize; @@ -1277,7 +1287,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, groupSize = 32; neededThreads = groupSize * rows; - + /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); @@ -1287,7 +1297,8 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, neededThreads -= MAX_X_DIM * threads; } - SpMVCSRLightWithoutAtomic<<>>( + if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector + SpMVCSRVector<<>>( inVector, outVector, rowPointers, @@ -1295,9 +1306,21 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, values, rows, getColumns, - groupSize, grid - ); + ); + } else { + SpMVCSRLightWithoutAtomic<<>>( + inVector, + outVector, + rowPointers, + columnIndexes, + values, + rows, + getColumns, + groupSize, + grid + ); + } } } @@ -1315,13 +1338,14 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, /* Configuration */ constexpr int ELEMENTS_PER_WARP = 1024; // how many elements should process every warp //---------------------------------------------------------------------------------- - const Index threads = 256; + const Index threads = 1024; // block size Index blocks; const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row const size_t neededWarps = roundUpDivision(nnz, ELEMENTS_PER_WARP); // warps per row const Index offset = neededWarps * ELEMENTS_PER_WARP; size_t neededThreads = offset * rows; + /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); @@ -1370,15 +1394,22 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, const Index* rowPointers, const Index* columnIndexes, const Real* values, + const Index valuesSize, const Index rows, const Index getColumns) { /* Configuration ---------------------------------------------------*/ - constexpr size_t THREADS_PER_BLOCK = 1024; + /* Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache + 512 threads per block for double (12 elements per thread) */ + constexpr size_t THREADS_PER_BLOCK = sizeof(Real) == 4 ? 1024 : 512; constexpr Index WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32; - constexpr Index SHARED = 49152/sizeof(Real); + constexpr Index SHARED = 49152/sizeof(Real); constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; - constexpr Index MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic //-------------------------------------------------------------------- + /* max elements per warp to start CSR Vector Dynamic (using of dynamic parallelism) */ + Index maxPerWarp = roundUpDivision(valuesSize, rows); + if (maxPerWarp < 4096) + maxPerWarp = 4096; + Index blocks; const Index threads = THREADS_PER_BLOCK; std::vector inBlock; @@ -1414,7 +1445,8 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, cudaMalloc((void **)&blocksAdaptive, sizeof(Index) * inBlock.size()); cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(Index), cudaMemcpyHostToDevice); - size_t neededThreads = inBlock.size() * 32; + size_t neededThreads = inBlock.size() * 32; // one warp per block + /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++i) { if (MAX_X_DIM * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); @@ -1435,7 +1467,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, getColumns, grid, SHARED_PER_WARP, - MAX_PER_WARP + maxPerWarp ); } @@ -1676,6 +1708,7 @@ class CSRDeviceDependentCode< Devices::Cuda > kernelRowPointers, kernelColumns, kernelValues, + matrix.getValues().getSize(), matrix.getRowPointers().getSize() - 1, matrix.getColumns() ); -- GitLab From 91711b6aab8bb9e1b4cb480c7553a3612a449f24 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Thu, 2 Jul 2020 23:11:35 +0200 Subject: [PATCH 22/57] Changed CSR Adaptive --- src/TNL/Matrices/Legacy/CSR_impl.h | 129 +++++++++-------------------- 1 file changed, 40 insertions(+), 89 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index deb9483aa..84e1a799a 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -814,9 +814,7 @@ void SpMVCSRAdaptive( const Real *inVector, Index blocks_size, Index getColumns, Index gridID, - const Index sharedPerWarp, - const Index maxPerWarp) -{ + const Index sharedPerWarp) { __shared__ Real shared_res[49152/sizeof(Real)]; const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; @@ -829,7 +827,7 @@ void SpMVCSRAdaptive( const Real *inVector, const Index maxRow = blocks[blockIdx + 1]; const Index minID = rowPointers[minRow]; const Index maxID = rowPointers[maxRow]; - Index i, to; + Index i, to, column; /* rows per block more than 1 */ if ((maxRow - minRow) > 1) { /////////////////////////////////////* CSR STREAM *////////////// @@ -838,35 +836,31 @@ void SpMVCSRAdaptive( const Real *inVector, Index elementID = laneID + minID; Index sharedID = laneID + offset; // index for shared memory for (; elementID < maxID; elementID += warpSize, sharedID += warpSize) { - if (columnIndexes[elementID] >= getColumns) + column = columnIndexes[elementID]; + if (column >= getColumns) continue; // can't be break - shared_res[sharedID] = values[elementID] * inVector[columnIndexes[elementID]]; + shared_res[sharedID] = values[elementID] * inVector[column]; } - const Index row = minRow + laneID; - if (row >= maxRow) - return; - /* Calculate result */ - sharedID = rowPointers[row] - minID + offset; // start of preprocessed results in shared memory - to = rowPointers[row + 1] - minID + offset; // end of preprocessed data - for (; sharedID < to; ++sharedID) - result += shared_res[sharedID]; + for (Index row = minRow + laneID; row < maxRow; row += warpSize) { + to = rowPointers[row + 1] - minID + offset; // end of preprocessed data + /* Scalar reduction */ + for (sharedID = rowPointers[row] - minID + offset; sharedID < to; ++sharedID) + result += shared_res[sharedID]; - outVector[row] = result; // Write result - return; - } - - const Index elements = maxID - minID; - if (elements <= maxPerWarp) { + outVector[row] = result; // Write result + } + } else { /////////////////////////////////////* CSR VECTOR *////////////// for (i = minID + laneID; i < maxID; i += warpSize) { - if (columnIndexes[i] >= getColumns) + column = columnIndexes[i]; + if (column >= getColumns) break; - result += values[i] * inVector[columnIndexes[i]]; + result += values[i] * inVector[column]; } - /* Reduction */ + /* Parallel reduction */ result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); @@ -874,40 +868,6 @@ void SpMVCSRAdaptive( const Real *inVector, result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); if (laneID == 0) outVector[minRow] = result; // Write result } - else { // too long row - /////////////////////////////////////* CSR DYNAMIC VECTOR *////////////// - constexpr Index ELEMENTS_PER_WARP = 1024; - constexpr Index WARPS_PER_BLOCK = ELEMENTS_PER_WARP / warpSize; - /* Number of warps we need. - This warp can be used to calculate result too, -1 warp */ - const Index warps = roundUpDivision(elements, ELEMENTS_PER_WARP) - 1; - const Index blocks = roundUpDivision(warps, WARPS_PER_BLOCK); - - /* Execute a lot of CSR Vector */ - if (laneID == 0) { - spmvCSRVectorHelper <<>>( - inVector, - columnIndexes, - values, - getColumns, - &outVector[minRow], - minID + ELEMENTS_PER_WARP, - maxID, - ELEMENTS_PER_WARP - ); - } - - /* CSR Vector */ - to = minID + ELEMENTS_PER_WARP; - for (i = minID + laneID; i < to; i += warpSize) { - if (columnIndexes[i] >= getColumns) - break; - - result += values[i] * inVector[columnIndexes[i]]; - } - /* Write result */ - atomicAdd(&outVector[minRow], result); - } } template< typename Real, @@ -921,8 +881,7 @@ void SpMVCSRScalar( const Real *inVector, const Real* values, const Index rows, const Index getColumns, - const Index gridID) -{ + const Index gridID) { const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; if (index >= rows) return; @@ -931,10 +890,11 @@ void SpMVCSRScalar( const Real *inVector, const Index endID = rowPointers[index + 1]; for (Index i = rowPointers[index]; i < endID; ++i) { - if (columnIndexes[i] >= getColumns) + const Index column = columnIndexes[i]; + if (column >= getColumns) break; - result += values[i] * inVector[columnIndexes[i]]; + result += values[i] * inVector[column]; } outVector[index] = result; @@ -967,10 +927,11 @@ void SpMVCSRMultiVector( const Real *inVector, /* Calculate result */ for (Index i = rowPointers[rowID] + inRowID; i < endID; i += offset) { - if (columnIndexes[i] >= getColumns) + Index column = columnIndexes[i]; + if (column >= getColumns) break; - result += values[i] * inVector[columnIndexes[i]]; + result += values[i] * inVector[column]; } /* Reduction */ @@ -1007,10 +968,11 @@ void SpMVCSRVector( const Real *inVector, /* Calculate result */ for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) { - if (columnIndexes[i] >= getColumns) + Index column = columnIndexes[i]; + if (column >= getColumns) break; - result += values[i] * inVector[columnIndexes[i]]; + result += values[i] * inVector[column]; } /* Reduction */ @@ -1059,10 +1021,11 @@ void SpMVCSRLight( const Real *inVector, Real result = 0.0; for (i = minID + inGroupID; i < maxID; i += groupSize) { - if (columnIndexes[i] >= getColumns) + const Index column = columnIndexes[i]; + if (column >= getColumns) break; - result += values[i] * inVector[columnIndexes[i]]; + result += values[i] * inVector[column]; } /* Parallel reduction */ @@ -1100,10 +1063,11 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector, Real result = 0.0; for (i = minID + inGroupID; i < maxID; i += groupSize) { - if (columnIndexes[i] >= getColumns) + const Index column = columnIndexes[i]; + if (column >= getColumns) break; - result += values[i] * inVector[columnIndexes[i]]; + result += values[i] * inVector[column]; } /* Parallel reduction */ @@ -1405,37 +1369,25 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, constexpr Index SHARED = 49152/sizeof(Real); constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; //-------------------------------------------------------------------- - /* max elements per warp to start CSR Vector Dynamic (using of dynamic parallelism) */ - Index maxPerWarp = roundUpDivision(valuesSize, rows); - if (maxPerWarp < 4096) - maxPerWarp = 4096; - Index blocks; const Index threads = THREADS_PER_BLOCK; + + /* Fill blocks */ std::vector inBlock; inBlock.push_back(0); Index sum = 0; Index i, prev_i = 0; - for (i = 1; i < rows - 1; ++i) { Index elements = matrix.getRowPointers().getElement(i) - matrix.getRowPointers().getElement(i - 1); sum += elements; if (sum > SHARED_PER_WARP) { - if (i - prev_i == 1) { - inBlock.push_back(i); - } else { - inBlock.push_back(i - 1); - --i; - } - sum = 0; - prev_i = i; - continue; - } - if (i - prev_i == warpSize) { + if (i - prev_i > 1) // this is extra row + --i; + inBlock.push_back(i); - prev_i = i; sum = 0; + prev_i = i; } } inBlock.push_back(rows); @@ -1466,8 +1418,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, inBlock.size() - 1, // -1 here is better than -1 in kernel getColumns, grid, - SHARED_PER_WARP, - maxPerWarp + SHARED_PER_WARP ); } -- GitLab From 1e665db08f0f2938447d36f36a81fcc78c7c8487 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Sun, 5 Jul 2020 16:06:32 +0200 Subject: [PATCH 23/57] Memory optimizations, changes to CSR Adaptive and unit tests --- src/TNL/Matrices/Legacy/CSR_impl.h | 354 ++++++++---------- .../Matrices/Legacy/SparseMatrixTest.hpp | 18 +- 2 files changed, 172 insertions(+), 200 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 84e1a799a..7f627e7b9 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -23,7 +23,21 @@ #include #endif +template +struct Block { + Block(Index row, Index index = 0) noexcept { + this->index = index; + this->row = row; + } + + Index index; + Index row; +}; + +/* Configuration */ constexpr size_t MAX_X_DIM = 2147483647; +constexpr int ELEMENTS_PER_WARP = 1024; +//----------------------------------------------------------------- namespace TNL { namespace Matrices { @@ -768,39 +782,6 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& } } -template< typename Real, - typename Index, - int warpSize > -__global__ -void spmvCSRVectorHelper(const Real *inVector, - const Index* columnIndexes, - const Real *values, - const Index getColumns, - Real *out, - const Index from, - const Index to, - const Index perWarp) -{ - const Index index = blockIdx.x * blockDim.x + threadIdx.x; - const Index warpID = index / warpSize; - const Index minID = from + warpID * perWarp; - if (minID >= to) return; - - Index maxID = from + (warpID + 1) * perWarp; - if (maxID >= to ) maxID = to; - - const Index laneID = index % warpSize; - - Real result = 0.0; - for (Index i = minID + laneID; i < maxID; i += warpSize) { - if (columnIndexes[i] >= getColumns) - break; - result += values[i] * inVector[columnIndexes[i]]; - } - - atomicAdd(out, result); -} - template< typename Real, typename Index, int warpSize > @@ -810,7 +791,7 @@ void SpMVCSRAdaptive( const Real *inVector, const Index* rowPointers, const Index* columnIndexes, const Real* values, - Index *blocks, + Block *blocks, Index blocks_size, Index getColumns, Index gridID, @@ -818,21 +799,46 @@ void SpMVCSRAdaptive( const Real *inVector, __shared__ Real shared_res[49152/sizeof(Real)]; const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; - Real result = 0; if (blockIdx >= blocks_size) return; + Real result = 0.0; const Index laneID = index % warpSize; - const Index minRow = blocks[blockIdx]; - const Index maxRow = blocks[blockIdx + 1]; + const Index minRow = blocks[blockIdx].row; + const Index maxRow = blocks[blockIdx + 1].row; const Index minID = rowPointers[minRow]; - const Index maxID = rowPointers[maxRow]; - Index i, to, column; + Index maxID = rowPointers[maxRow]; + Index i, to, column, offset; + Index elements = maxID - minID; /* rows per block more than 1 */ - if ((maxRow - minRow) > 1) { + if (elements == 0 || elements > ELEMENTS_PER_WARP) { + /////////////////////////////////////* CSR VECTOR L *///////////// + const Index warpInRow = blocks[blockIdx].index; + if (elements == 0) maxID = rowPointers[minRow + 1]; + + offset = warpInRow * ELEMENTS_PER_WARP; + to = minID + (warpInRow + 1) * ELEMENTS_PER_WARP; + if (to > maxID) to = maxID; + + for (i = minID + offset + laneID; i < to; i += warpSize) { + column = columnIndexes[i]; + if (column >= getColumns) + break; + + result += values[i] * inVector[column]; + } + + /* Parallel reduction */ + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); + if (laneID == 0) atomicAdd(&outVector[minRow], result); + } else if (elements <= sharedPerWarp) { /////////////////////////////////////* CSR STREAM *////////////// /* Copy and calculate elements from global to shared memory, coalesced */ - const Index offset = threadIdx.x / warpSize * sharedPerWarp; + offset = threadIdx.x / warpSize * sharedPerWarp; Index elementID = laneID + minID; Index sharedID = laneID + offset; // index for shared memory for (; elementID < maxID; elementID += warpSize, sharedID += warpSize) { @@ -845,6 +851,7 @@ void SpMVCSRAdaptive( const Real *inVector, /* Calculate result */ for (Index row = minRow + laneID; row < maxRow; row += warpSize) { to = rowPointers[row + 1] - minID + offset; // end of preprocessed data + result = 0; /* Scalar reduction */ for (sharedID = rowPointers[row] - minID + offset; sharedID < to; ++sharedID) result += shared_res[sharedID]; @@ -1019,7 +1026,6 @@ void SpMVCSRLight( const Real *inVector, maxID = rowPointers[row + 1]; Real result = 0.0; - for (i = minID + inGroupID; i < maxID; i += groupSize) { const Index column = columnIndexes[i]; if (column >= getColumns) @@ -1029,7 +1035,7 @@ void SpMVCSRLight( const Real *inVector, } /* Parallel reduction */ - for (Index i = groupSize / 2; i > 0; i /= 2) + for (i = groupSize / 2; i > 0; i /= 2) result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); /* Write result */ if (inGroupID == 0) @@ -1052,7 +1058,6 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector, const Index gridID) { const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index row = index / groupSize; - Index i; if (row >= rows) return; @@ -1062,7 +1067,7 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector, const Index maxID = rowPointers[row + 1]; Real result = 0.0; - for (i = minID + inGroupID; i < maxID; i += groupSize) { + for (Index i = minID + inGroupID; i < maxID; i += groupSize) { const Index column = columnIndexes[i]; if (column >= getColumns) break; @@ -1300,7 +1305,6 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, const Index rows, const Index getColumns) { /* Configuration */ - constexpr int ELEMENTS_PER_WARP = 1024; // how many elements should process every warp //---------------------------------------------------------------------------------- const Index threads = 1024; // block size Index blocks; @@ -1347,6 +1351,30 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, } } +/* Find limit of block */ +template< typename Real, + typename Index, + typename Device, + CSRKernel KernelType> +Index findLimit(const Index start, const Index max, + const CSR< Real, Device, Index, KernelType >& matrix, + const Index size) { + Index sum = 0; + for (Index current = start; current < size - 1; ++current) { + Index elements = matrix.getRowPointers().getElement(current + 1) - + matrix.getRowPointers().getElement(current); + sum += elements; + if (sum > max) { + if (current - start > 1) // extra row + return current; + else // one long row + return current + 1; + } + } + + return size - 1; // return last row pointer +} + template< typename Real, typename Index, typename Device, @@ -1373,33 +1401,31 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, const Index threads = THREADS_PER_BLOCK; /* Fill blocks */ - std::vector inBlock; - inBlock.push_back(0); - Index sum = 0; - Index i, prev_i = 0; - for (i = 1; i < rows - 1; ++i) { - Index elements = matrix.getRowPointers().getElement(i) - - matrix.getRowPointers().getElement(i - 1); - sum += elements; - if (sum > SHARED_PER_WARP) { - if (i - prev_i > 1) // this is extra row - --i; - - inBlock.push_back(i); - sum = 0; - prev_i = i; - } + std::vector> inBlock; + Index start = 0; + inBlock.emplace_back(0); // push start + while (start != rows - 1) { + Index startNext = findLimit(start, SHARED_PER_WARP, matrix, rows); + Index sum = matrix.getRowPointers().getElement(startNext) - + matrix.getRowPointers().getElement(start); + + /* block start is already inserted, +1 */ + Index parts = roundUpDivision(sum, ELEMENTS_PER_WARP); + for (Index warpIndex = 1; warpIndex < parts; ++warpIndex) + inBlock.emplace_back(start, warpIndex); + + inBlock.emplace_back(startNext); + start = startNext; } - inBlock.push_back(rows); /* blocks to GPU */ - Index *blocksAdaptive; - cudaMalloc((void **)&blocksAdaptive, sizeof(Index) * inBlock.size()); - cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(Index), cudaMemcpyHostToDevice); + Block *blocksAdaptive = nullptr; + cudaMalloc((void **)&blocksAdaptive, sizeof(*blocksAdaptive) * inBlock.size()); + cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(*blocksAdaptive), cudaMemcpyHostToDevice); size_t neededThreads = inBlock.size() * 32; // one warp per block /* Execute kernels on device */ - for (Index grid = 0; neededThreads != 0; ++i) { + for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); neededThreads = 0; @@ -1415,7 +1441,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, columnIndexes, values, blocksAdaptive, - inBlock.size() - 1, // -1 here is better than -1 in kernel + inBlock.size() - 1, // last block shouldn't be used getColumns, grid, SHARED_PER_WARP @@ -1575,134 +1601,80 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else - /* in vector to GPU */ - Real *kernelInVector; - cudaMalloc((void **)&kernelInVector, sizeof(Real) * inVector.getSize()); - cudaMemcpy(kernelInVector, - (Real *)inVector.getData(), - inVector.getSize() * sizeof(Real), - cudaMemcpyHostToDevice); - - /* out vector to GPU */ - Real *kernelOutVector; - cudaMalloc((void **)&kernelOutVector, sizeof(Real) * outVector.getSize()); - cudaMemcpy(kernelOutVector, - (Real *)outVector.getData(), - outVector.getSize() * sizeof(Real), - cudaMemcpyHostToDevice); - - /* values to GPU */ - Real *kernelValues; - cudaMalloc((void **)&kernelValues, sizeof(Real) * matrix.getValues().getSize()); - cudaMemcpy(kernelValues, - (Real *)matrix.getValues().getData(), - matrix.getValues().getSize() * sizeof(Real), - cudaMemcpyHostToDevice); - - /* columns to GPU */ - Index *kernelColumns; - cudaMalloc((void **)&kernelColumns, sizeof(Index) * matrix.getColumnIndexes().getSize()); - cudaMemcpy(kernelColumns, - (Index *)matrix.getColumnIndexes().getData(), - matrix.getColumnIndexes().getSize() * sizeof(Index), - cudaMemcpyHostToDevice); - - /* row pointers to GPU */ - Index *kernelRowPointers; - cudaMalloc((void **)&kernelRowPointers, sizeof(Index) * matrix.getRowPointers().getSize()); - cudaMemcpy(kernelRowPointers, - (Index *)matrix.getRowPointers().getData(), - matrix.getRowPointers().getSize() * sizeof(Index), - cudaMemcpyHostToDevice); - - switch(KernelType) - { - case CSRScalar: - SpMVCSRScalarPrepare( - kernelInVector, - kernelOutVector, - kernelRowPointers, - kernelColumns, - kernelValues, - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - case CSRVector: - SpMVCSRVectorPrepare( - kernelInVector, - kernelOutVector, - kernelRowPointers, - kernelColumns, - kernelValues, - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - case CSRLight: - SpMVCSRLightPrepare( - kernelInVector, - kernelOutVector, - kernelRowPointers, - kernelColumns, - kernelValues, - matrix.getValues().getSize(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - case CSRAdaptive: + // switch(KernelType) + // { + // case CSRScalar: + // SpMVCSRScalarPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // case CSRVector: + // SpMVCSRVectorPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // case CSRLight: + // SpMVCSRLightPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getValues().getSize(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // case CSRAdaptive: SpMVCSRAdaptivePrepare( - kernelInVector, - kernelOutVector, + inVector.getData(), + outVector.getData(), matrix, - kernelRowPointers, - kernelColumns, - kernelValues, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), matrix.getValues().getSize(), - matrix.getRowPointers().getSize() - 1, + matrix.getRowPointers().getSize(), matrix.getColumns() ); - break; - case CSRMultiVector: - SpMVCSRMultiVectorPrepare( - kernelInVector, - kernelOutVector, - kernelRowPointers, - kernelColumns, - kernelValues, - matrix.getValues().getSize(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - case CSRLightWithoutAtomic: - SpMVCSRLightWithoutAtomicPrepare( - kernelInVector, - kernelOutVector, - kernelRowPointers, - kernelColumns, - kernelValues, - matrix.getValues().getSize(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - } - - /* Copy results */ - cudaMemcpy(outVector.getData(), - kernelOutVector, - outVector.getSize() * sizeof(Real), - cudaMemcpyDeviceToHost); - - /* Free memory */ - cudaFree(kernelInVector); - cudaFree(kernelOutVector); - cudaFree(kernelValues); - cudaFree(kernelColumns); - cudaFree(kernelRowPointers); - + // break; + // case CSRMultiVector: + // SpMVCSRMultiVectorPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getValues().getSize(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // case CSRLightWithoutAtomic: + // SpMVCSRLightWithoutAtomicPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getValues().getSize(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // } #endif /* HAVE_CUDA */ #endif } diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp index 09368b969..333b97371 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp @@ -1391,13 +1391,14 @@ void test_VectorProductCSRAdaptive() using RealType = typename Matrix::RealType; using DeviceType = typename Matrix::DeviceType; using IndexType = typename Matrix::IndexType; + using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; - //----------------- Test CSR Stream part ------------------ - IndexType m_rows = 100; - IndexType m_cols = 100; Matrix m; m.reset(); + IndexType m_rows = 100; + IndexType m_cols = 100; + //----------------- Test CSR Stream part ------------------ m.setDimensions( m_rows, m_cols ); typename Matrix::CompressedRowLengthsVector rowLengths( { @@ -1420,7 +1421,6 @@ void test_VectorProductCSRAdaptive() for (int j = 0; j < m_cols; ++j) m.setElement( i, j, i + 1 ); - using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; VectorType inVector; inVector.setSize( m_rows ); @@ -1437,7 +1437,7 @@ void test_VectorProductCSRAdaptive() for (int i = 0; i < m_rows; ++i) EXPECT_EQ( outVector.getElement( i ), (i + 1) * 100 ); - //----------------- Test CSR Dynamic Vector part ------------------ + //----------------- Test CSR Vector L part ------------------ m_rows = 1; // if less than 'max elements per block to start CSR Dynamic Vector' tests CSR Vector part @@ -1450,20 +1450,20 @@ void test_VectorProductCSRAdaptive() m.setCompressedRowLengths( rowLengths2 ); for (int i = 0; i < m_cols; ++i) - m.setElement( 0, i, 2 ); + m.setElement( 0, i, i ); VectorType inVector2; inVector2.setSize( m_cols ); for( IndexType i = 0; i < inVector2.getSize(); i++ ) - inVector2.setElement( i, 1 ); + inVector2.setElement( i, 2 ); - VectorType outVector2; + VectorType outVector2; outVector2.setSize( m_rows ); for( IndexType i = 0; i < outVector2.getSize(); ++i ) outVector2.setElement( i, 0 ); m.vectorProduct(inVector2, outVector2); - EXPECT_EQ( outVector2.getElement( 0 ), 6000 ); + EXPECT_EQ( outVector2.getElement( 0 ), 8997000 ); } template< typename Matrix > -- GitLab From d6ee7cc18c9c50fb0ec84da40b86d9e2caf9a675 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Sun, 5 Jul 2020 16:44:01 +0200 Subject: [PATCH 24/57] Compilation error fix --- src/TNL/Matrices/Legacy/CSR_impl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 7f627e7b9..b22fe9663 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -1058,6 +1058,7 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector, const Index gridID) { const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index row = index / groupSize; + Index i; if (row >= rows) return; @@ -1067,7 +1068,7 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector, const Index maxID = rowPointers[row + 1]; Real result = 0.0; - for (Index i = minID + inGroupID; i < maxID; i += groupSize) { + for (i = minID + inGroupID; i < maxID; i += groupSize) { const Index column = columnIndexes[i]; if (column >= getColumns) break; -- GitLab From 09bd0a0b8c927e7e56943f40e8d47888f6df593e Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Sun, 5 Jul 2020 20:17:46 +0200 Subject: [PATCH 25/57] Uncommented kernels --- src/TNL/Matrices/Legacy/CSR_impl.h | 128 ++++++++++++++--------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index b22fe9663..33c84de40 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -815,7 +815,7 @@ void SpMVCSRAdaptive( const Real *inVector, /////////////////////////////////////* CSR VECTOR L *///////////// const Index warpInRow = blocks[blockIdx].index; if (elements == 0) maxID = rowPointers[minRow + 1]; - + offset = warpInRow * ELEMENTS_PER_WARP; to = minID + (warpInRow + 1) * ELEMENTS_PER_WARP; if (to > maxID) to = maxID; @@ -1602,43 +1602,43 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else - // switch(KernelType) - // { - // case CSRScalar: - // SpMVCSRScalarPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // case CSRVector: - // SpMVCSRVectorPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // case CSRLight: - // SpMVCSRLightPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getValues().getSize(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // case CSRAdaptive: + switch(KernelType) + { + case CSRScalar: + SpMVCSRScalarPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRVector: + SpMVCSRVectorPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRLight: + SpMVCSRLightPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getValues().getSize(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRAdaptive: SpMVCSRAdaptivePrepare( inVector.getData(), outVector.getData(), @@ -1650,32 +1650,32 @@ class CSRDeviceDependentCode< Devices::Cuda > matrix.getRowPointers().getSize(), matrix.getColumns() ); - // break; - // case CSRMultiVector: - // SpMVCSRMultiVectorPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getValues().getSize(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // case CSRLightWithoutAtomic: - // SpMVCSRLightWithoutAtomicPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getValues().getSize(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // } + break; + case CSRMultiVector: + SpMVCSRMultiVectorPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getValues().getSize(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRLightWithoutAtomic: + SpMVCSRLightWithoutAtomicPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getValues().getSize(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + } #endif /* HAVE_CUDA */ #endif } -- GitLab From 96a571b303fd5b5b9a85d49b3822d26acd92b5ba Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Mon, 6 Jul 2020 00:55:58 +0200 Subject: [PATCH 26/57] Optimizations for CSR Adaptive --- src/TNL/Matrices/Legacy/CSR_impl.h | 307 ++++++++++++++++------------- 1 file changed, 166 insertions(+), 141 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 33c84de40..4466008a1 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -23,17 +23,34 @@ #include #endif -template -struct Block { - Block(Index row, Index index = 0) noexcept { - this->index = index; - this->row = row; +enum Type { + STREAM = 0, + VECTOR = 1, + LONG = 2 +}; + +union Block { + Block(uint32_t row, Type type = VECTOR, uint32_t index = 0) noexcept { + this->index[0] = row; + this->index[1] = index; + this->byte[7] = (uint8_t)type; } - Index index; - Index row; + uint32_t index[2]; // index[0] is row pointer, index[1] is index in warp + uint8_t byte[8]; // byte[7] is type specificator }; +// template +// struct Block_old { +// Block(Index row, Index index = 0) noexcept { +// this->index = index; +// this->row = row; +// } + +// Index index; +// Index row; +// }; + /* Configuration */ constexpr size_t MAX_X_DIM = 2147483647; constexpr int ELEMENTS_PER_WARP = 1024; @@ -784,96 +801,94 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& template< typename Real, typename Index, - int warpSize > + int warpSize, + int sharedPerWarp > __global__ void SpMVCSRAdaptive( const Real *inVector, Real *outVector, const Index* rowPointers, const Index* columnIndexes, const Real* values, - Block *blocks, + const Block *blocks, Index blocks_size, Index getColumns, - Index gridID, - const Index sharedPerWarp) { + Index gridID) { __shared__ Real shared_res[49152/sizeof(Real)]; const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; if (blockIdx >= blocks_size) return; + Block block = blocks[blockIdx]; Real result = 0.0; const Index laneID = index % warpSize; - const Index minRow = blocks[blockIdx].row; - const Index maxRow = blocks[blockIdx + 1].row; - const Index minID = rowPointers[minRow]; - Index maxID = rowPointers[maxRow]; - Index i, to, column, offset; - Index elements = maxID - minID; - /* rows per block more than 1 */ - if (elements == 0 || elements > ELEMENTS_PER_WARP) { - /////////////////////////////////////* CSR VECTOR L *///////////// - const Index warpInRow = blocks[blockIdx].index; - if (elements == 0) maxID = rowPointers[minRow + 1]; + const Index minID = rowPointers[block.index[0]/* minRow */]; + Index i, to, column, offset, maxID; + if (block.byte[7] == 0) { + /////////////////////////////////////* CSR STREAM *////////////// + const Index maxRow = blocks[blockIdx + 1].index[0]; + maxID = rowPointers[maxRow]; + /* offset between shared and global addresses */ + offset = minID - (threadIdx.x / warpSize * sharedPerWarp); + /* Copy and calculate elements from global to shared memory, coalesced */ + for (i = laneID + minID; i < maxID; i += warpSize) { + column = columnIndexes[i]; + if (column >= getColumns) + continue; // can't be break + shared_res[i - offset] = values[i] * inVector[column]; + } - offset = warpInRow * ELEMENTS_PER_WARP; - to = minID + (warpInRow + 1) * ELEMENTS_PER_WARP; - if (to > maxID) to = maxID; - - for (i = minID + offset + laneID; i < to; i += warpSize) { + /* Calculate result */ + for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) { + to = rowPointers[i + 1] - offset; // end of preprocessed data + result = 0; + /* Scalar reduction */ + for (Index sharedID = rowPointers[i] - offset; sharedID < to; ++sharedID) + result += shared_res[sharedID]; + + outVector[i] = result; // Write result + } + } else if (block.byte[7] == 1) { + /////////////////////////////////////* CSR VECTOR *////////////// + maxID = rowPointers[block.index[0]/* minRow */ + 1]; + + for (i = minID + laneID; i < maxID; i += warpSize) { column = columnIndexes[i]; if (column >= getColumns) break; result += values[i] * inVector[column]; } - /* Parallel reduction */ result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); - if (laneID == 0) atomicAdd(&outVector[minRow], result); - } else if (elements <= sharedPerWarp) { - /////////////////////////////////////* CSR STREAM *////////////// - /* Copy and calculate elements from global to shared memory, coalesced */ - offset = threadIdx.x / warpSize * sharedPerWarp; - Index elementID = laneID + minID; - Index sharedID = laneID + offset; // index for shared memory - for (; elementID < maxID; elementID += warpSize, sharedID += warpSize) { - column = columnIndexes[elementID]; - if (column >= getColumns) - continue; // can't be break - shared_res[sharedID] = values[elementID] * inVector[column]; - } + if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result + } else { + /////////////////////////////////////* CSR VECTOR L *///////////// + maxID = rowPointers[block.index[0]/* minRow */ + 1]; - /* Calculate result */ - for (Index row = minRow + laneID; row < maxRow; row += warpSize) { - to = rowPointers[row + 1] - minID + offset; // end of preprocessed data - result = 0; - /* Scalar reduction */ - for (sharedID = rowPointers[row] - minID + offset; sharedID < to; ++sharedID) - result += shared_res[sharedID]; + offset = block.index[1]/* warpInRow */ * ELEMENTS_PER_WARP; + to = minID + (block.index[1]/* warpInRow */ + 1) * ELEMENTS_PER_WARP; + if (to > maxID) to = maxID; - outVector[row] = result; // Write result - } - } else { - /////////////////////////////////////* CSR VECTOR *////////////// - for (i = minID + laneID; i < maxID; i += warpSize) { + for (i = minID + offset + laneID; i < to; i += warpSize) { column = columnIndexes[i]; if (column >= getColumns) break; result += values[i] * inVector[column]; } + /* Parallel reduction */ result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); - if (laneID == 0) outVector[minRow] = result; // Write result + if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result); } } @@ -1359,20 +1374,29 @@ template< typename Real, CSRKernel KernelType> Index findLimit(const Index start, const Index max, const CSR< Real, Device, Index, KernelType >& matrix, - const Index size) { - Index sum = 0; + const Index size, + Type &type, + Index &sum) { + sum = 0; for (Index current = start; current < size - 1; ++current) { Index elements = matrix.getRowPointers().getElement(current + 1) - matrix.getRowPointers().getElement(current); sum += elements; if (sum > max) { - if (current - start > 1) // extra row + if (current - start > 1) { // extra row + type = STREAM; return current; - else // one long row + } else { // one long row + if (sum <= ELEMENTS_PER_WARP) + type = VECTOR; + else + type = LONG; return current + 1; + } } } + type = STREAM; return size - 1; // return last row pointer } @@ -1398,29 +1422,31 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, constexpr Index SHARED = 49152/sizeof(Real); constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; //-------------------------------------------------------------------- - Index blocks; + Index blocks, sum, start = 0, nextStart = 0; const Index threads = THREADS_PER_BLOCK; /* Fill blocks */ - std::vector> inBlock; - Index start = 0; - inBlock.emplace_back(0); // push start - while (start != rows - 1) { - Index startNext = findLimit(start, SHARED_PER_WARP, matrix, rows); - Index sum = matrix.getRowPointers().getElement(startNext) - - matrix.getRowPointers().getElement(start); - - /* block start is already inserted, +1 */ - Index parts = roundUpDivision(sum, ELEMENTS_PER_WARP); - for (Index warpIndex = 1; warpIndex < parts; ++warpIndex) - inBlock.emplace_back(start, warpIndex); - - inBlock.emplace_back(startNext); - start = startNext; + std::vector inBlock; + inBlock.reserve(rows); // resere space to avoid reallocation + + while (nextStart != rows - 1) { + Type type; + nextStart = findLimit(start, SHARED_PER_WARP, matrix, rows, type, sum); + if (type == LONG) { + uint32_t parts = roundUpDivision(sum, ELEMENTS_PER_WARP); + for (uint32_t index = 0; index < parts; ++index) { + inBlock.emplace_back(start, LONG, index); + } + } else { + inBlock.emplace_back(start, type); + } + + start = nextStart; } + inBlock.emplace_back(nextStart); /* blocks to GPU */ - Block *blocksAdaptive = nullptr; + Block *blocksAdaptive = nullptr; cudaMalloc((void **)&blocksAdaptive, sizeof(*blocksAdaptive) * inBlock.size()); cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(*blocksAdaptive), cudaMemcpyHostToDevice); @@ -1435,7 +1461,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, neededThreads -= MAX_X_DIM * threads; } - SpMVCSRAdaptive<<>>( + SpMVCSRAdaptive<<>>( inVector, outVector, rowPointers, @@ -1444,8 +1470,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, blocksAdaptive, inBlock.size() - 1, // last block shouldn't be used getColumns, - grid, - SHARED_PER_WARP + grid ); } @@ -1602,43 +1627,43 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else - switch(KernelType) - { - case CSRScalar: - SpMVCSRScalarPrepare( - inVector.getData(), - outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - case CSRVector: - SpMVCSRVectorPrepare( - inVector.getData(), - outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - case CSRLight: - SpMVCSRLightPrepare( - inVector.getData(), - outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getValues().getSize(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - case CSRAdaptive: + // switch(KernelType) + // { + // case CSRScalar: + // SpMVCSRScalarPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // case CSRVector: + // SpMVCSRVectorPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // case CSRLight: + // SpMVCSRLightPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getValues().getSize(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // case CSRAdaptive: SpMVCSRAdaptivePrepare( inVector.getData(), outVector.getData(), @@ -1650,32 +1675,32 @@ class CSRDeviceDependentCode< Devices::Cuda > matrix.getRowPointers().getSize(), matrix.getColumns() ); - break; - case CSRMultiVector: - SpMVCSRMultiVectorPrepare( - inVector.getData(), - outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getValues().getSize(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - case CSRLightWithoutAtomic: - SpMVCSRLightWithoutAtomicPrepare( - inVector.getData(), - outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getValues().getSize(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - } + // break; + // case CSRMultiVector: + // SpMVCSRMultiVectorPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getValues().getSize(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // case CSRLightWithoutAtomic: + // SpMVCSRLightWithoutAtomicPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getValues().getSize(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // } #endif /* HAVE_CUDA */ #endif } -- GitLab From 91ed0ebe057e1cba5431d5950a4697c4c8f79ded Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Mon, 6 Jul 2020 01:00:14 +0200 Subject: [PATCH 27/57] Uncommented kernels --- src/TNL/Matrices/Legacy/CSR_impl.h | 126 ++++++++++++++--------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 4466008a1..e4fe9c083 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -1627,43 +1627,43 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else - // switch(KernelType) - // { - // case CSRScalar: - // SpMVCSRScalarPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // case CSRVector: - // SpMVCSRVectorPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // case CSRLight: - // SpMVCSRLightPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getValues().getSize(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // case CSRAdaptive: + switch(KernelType) + { + case CSRScalar: + SpMVCSRScalarPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRVector: + SpMVCSRVectorPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRLight: + SpMVCSRLightPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getValues().getSize(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRAdaptive: SpMVCSRAdaptivePrepare( inVector.getData(), outVector.getData(), @@ -1675,32 +1675,32 @@ class CSRDeviceDependentCode< Devices::Cuda > matrix.getRowPointers().getSize(), matrix.getColumns() ); - // break; - // case CSRMultiVector: - // SpMVCSRMultiVectorPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getValues().getSize(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // case CSRLightWithoutAtomic: - // SpMVCSRLightWithoutAtomicPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getValues().getSize(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // } + break; + case CSRMultiVector: + SpMVCSRMultiVectorPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getValues().getSize(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRLightWithoutAtomic: + SpMVCSRLightWithoutAtomicPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getValues().getSize(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + } #endif /* HAVE_CUDA */ #endif } -- GitLab From c8ee6a280302c741827e7f2cdd6d731668ef5e2a Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Mon, 6 Jul 2020 18:50:04 +0200 Subject: [PATCH 28/57] Optimizations for all kernels --- src/TNL/Matrices/Legacy/CSR_impl.h | 172 +++++++++++------------------ 1 file changed, 65 insertions(+), 107 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index e4fe9c083..5b821f03c 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -40,17 +40,6 @@ union Block { uint8_t byte[8]; // byte[7] is type specificator }; -// template -// struct Block_old { -// Block(Index row, Index index = 0) noexcept { -// this->index = index; -// this->row = row; -// } - -// Index index; -// Index row; -// }; - /* Configuration */ constexpr size_t MAX_X_DIM = 2147483647; constexpr int ELEMENTS_PER_WARP = 1024; @@ -860,11 +849,11 @@ void SpMVCSRAdaptive( const Real *inVector, result += values[i] * inVector[column]; } /* Parallel reduction */ - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); + result += __shfl_down_sync(0xFFFFFFFF, result, 16); + result += __shfl_down_sync(0xFFFFFFFF, result, 8); + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result } else { /////////////////////////////////////* CSR VECTOR L *///////////// @@ -883,18 +872,17 @@ void SpMVCSRAdaptive( const Real *inVector, } /* Parallel reduction */ - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); + result += __shfl_down_sync(0xFFFFFFFF, result, 16); + result += __shfl_down_sync(0xFFFFFFFF, result, 8); + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result); } } template< typename Real, - typename Index, - int warpSize > + typename Index> __global__ void SpMVCSRScalar( const Real *inVector, Real* outVector, @@ -908,11 +896,12 @@ void SpMVCSRScalar( const Real *inVector, if (index >= rows) return; + Index column; Real result = 0.0; const Index endID = rowPointers[index + 1]; for (Index i = rowPointers[index]; i < endID; ++i) { - const Index column = columnIndexes[i]; + column = columnIndexes[i]; if (column >= getColumns) break; @@ -933,7 +922,6 @@ void SpMVCSRMultiVector( const Real *inVector, const Real* values, const Index rows, const Index getColumns, - const Index perWarp, const Index offset, const Index gridID) { @@ -945,11 +933,12 @@ void SpMVCSRMultiVector( const Real *inVector, const Index inRowID = index % offset; Real result = 0.0; + Index column; Index endID = rowPointers[rowID + 1]; /* Calculate result */ for (Index i = rowPointers[rowID] + inRowID; i < endID; i += offset) { - Index column = columnIndexes[i]; + column = columnIndexes[i]; if (column >= getColumns) break; @@ -957,11 +946,11 @@ void SpMVCSRMultiVector( const Real *inVector, } /* Reduction */ - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); + result += __shfl_down_sync(0xFFFFFFFF, result, 16); + result += __shfl_down_sync(0xFFFFFFFF, result, 8); + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); /* Write result */ if (index % warpSize == 0) atomicAdd(&outVector[rowID], result); } @@ -984,13 +973,14 @@ void SpMVCSRVector( const Real *inVector, if (warpID >= rows) return; - const Index laneID = index % warpSize; Real result = 0.0; + Index column; + const Index laneID = index % warpSize; Index endID = rowPointers[warpID + 1]; /* Calculate result */ for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) { - Index column = columnIndexes[i]; + column = columnIndexes[i]; if (column >= getColumns) break; @@ -998,18 +988,17 @@ void SpMVCSRVector( const Real *inVector, } /* Reduction */ - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); + result += __shfl_down_sync(0xFFFFFFFF, result, 16); + result += __shfl_down_sync(0xFFFFFFFF, result, 8); + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); /* Write result */ if (laneID == 0) outVector[warpID] = result; } template< typename Real, - typename Index, - int warpSize > + typename Index > __global__ void SpMVCSRLight( const Real *inVector, Real* outVector, @@ -1019,13 +1008,11 @@ void SpMVCSRLight( const Real *inVector, const Index rows, const Index getColumns, const Index groupSize, - const Index gridID, unsigned *rowCnt) { - const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; - const Index laneID = index % warpSize; - const Index groupID = laneID / groupSize; - const Index inGroupID = laneID % groupSize; - Index row, minID, maxID, i; + const Index groupID = threadIdx.x / groupSize; + const Index inGroupID = threadIdx.x % groupSize; + Index row, maxID, i; + Real result; while (true) { @@ -1033,15 +1020,14 @@ void SpMVCSRLight( const Real *inVector, if (inGroupID == 0) row = atomicAdd(rowCnt, 1); /* Propagate row number in group */ - row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * groupSize); + row = __shfl_sync(0xFFFFFFFF, row, groupID * groupSize); if (row >= rows) return; - minID = rowPointers[row]; maxID = rowPointers[row + 1]; - Real result = 0.0; - for (i = minID + inGroupID; i < maxID; i += groupSize) { + result = 0.0; + for (i = rowPointers[row] + inGroupID; i < maxID; i += groupSize) { const Index column = columnIndexes[i]; if (column >= getColumns) break; @@ -1050,8 +1036,8 @@ void SpMVCSRLight( const Real *inVector, } /* Parallel reduction */ - for (i = groupSize / 2; i > 0; i /= 2) - result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); + for (i = groupSize >> 1; i > 0; i >>= 1) + result += __shfl_down_sync(0xFFFFFFFF, result, i); /* Write result */ if (inGroupID == 0) outVector[row] = result; @@ -1059,8 +1045,7 @@ void SpMVCSRLight( const Real *inVector, } template< typename Real, - typename Index, - int warpSize > + typename Index> __global__ void SpMVCSRLightWithoutAtomic( const Real *inVector, Real* outVector, @@ -1073,18 +1058,17 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector, const Index gridID) { const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index row = index / groupSize; - Index i; + Index i, column; if (row >= rows) return; const Index inGroupID = index % groupSize; - const Index minID = rowPointers[row]; const Index maxID = rowPointers[row + 1]; Real result = 0.0; - for (i = minID + inGroupID; i < maxID; i += groupSize) { - const Index column = columnIndexes[i]; + for (i = rowPointers[row] + inGroupID; i < maxID; i += groupSize) { + column = columnIndexes[i]; if (column >= getColumns) break; @@ -1092,8 +1076,8 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector, } /* Parallel reduction */ - for (i = groupSize / 2; i > 0; i /= 2) - result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); + for (i = groupSize >> 1; i > 0; i >>= 1) + result += __shfl_down_sync(0xFFFFFFFF, result, i); /* Write result */ if (inGroupID == 0) outVector[row] = result; @@ -1122,7 +1106,7 @@ void SpMVCSRScalarPrepare( const Real *inVector, neededThreads -= MAX_X_DIM * threads; } - SpMVCSRScalar<<>>( + SpMVCSRScalar<<>>( inVector, outVector, rowPointers, @@ -1182,64 +1166,39 @@ void SpMVCSRLightPrepare( const Real *inVector, const Index valuesSize, const Index rows, const Index getColumns) { - const Index threads = 1024; // block size + const Index threads = 1024; // max block size Index blocks, groupSize; /* Copy rowCnt to GPU */ unsigned rowCnt = 0; - unsigned *kernelRowCnt; + unsigned *kernelRowCnt = nullptr; cudaMalloc((void **)&kernelRowCnt, sizeof(*kernelRowCnt)); cudaMemcpy(kernelRowCnt, &rowCnt, sizeof(*kernelRowCnt), cudaMemcpyHostToDevice); - + cudaDeviceProp properties; + cudaGetDeviceProperties( &properties, Cuda::DeviceInfo::getActiveDevice() ); + blocks = properties.multiProcessorCount * properties.maxThreadsPerMultiProcessor / threads; + const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row if (nnz <= 2) groupSize = 2; else if (nnz <= 4) groupSize = 4; - else if (nnz <= 8) + else if (nnz <= 64) groupSize = 8; - else if (nnz <= 16) - groupSize = 16; else groupSize = 32; - size_t neededThreads = groupSize * rows; - /* Execute kernels on device */ - for (Index grid = 0; neededThreads != 0; ++grid) { - if (MAX_X_DIM * threads >= neededThreads) { - blocks = roundUpDivision(neededThreads, threads); - neededThreads = 0; - } else { - blocks = MAX_X_DIM; - neededThreads -= MAX_X_DIM * threads; - } - - if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector - SpMVCSRVector<<>>( - inVector, - outVector, - rowPointers, - columnIndexes, - values, - rows, - getColumns, - grid - ); - } else { - SpMVCSRLight<<>>( - inVector, - outVector, - rowPointers, - columnIndexes, - values, - rows, - getColumns, - groupSize, - grid, - kernelRowCnt - ); - } - } + SpMVCSRLight<<>>( + inVector, + outVector, + rowPointers, + columnIndexes, + values, + rows, + getColumns, + groupSize, + kernelRowCnt + ); cudaFree(kernelRowCnt); } @@ -1294,7 +1253,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, grid ); } else { - SpMVCSRLightWithoutAtomic<<>>( + SpMVCSRLightWithoutAtomic<<>>( inVector, outVector, rowPointers, @@ -1359,7 +1318,6 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, values, rows, getColumns, - ELEMENTS_PER_WARP, offset, grid ); @@ -1427,7 +1385,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, /* Fill blocks */ std::vector inBlock; - inBlock.reserve(rows); // resere space to avoid reallocation + inBlock.reserve(rows); // reserve space to avoid reallocation while (nextStart != rows - 1) { Type type; -- GitLab From 466e013637a7f426ff6bfbb114857ad111a8383b Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Tue, 7 Jul 2020 20:42:57 +0200 Subject: [PATCH 29/57] Divided CSR LightWithoutAtomic by 4 kernels --- src/TNL/Matrices/Legacy/CSR_impl.h | 193 +++++++++++++++++++++++------ 1 file changed, 153 insertions(+), 40 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 5b821f03c..2de0becef 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -933,12 +933,11 @@ void SpMVCSRMultiVector( const Real *inVector, const Index inRowID = index % offset; Real result = 0.0; - Index column; Index endID = rowPointers[rowID + 1]; /* Calculate result */ for (Index i = rowPointers[rowID] + inRowID; i < endID; i += offset) { - column = columnIndexes[i]; + Index column = columnIndexes[i]; if (column >= getColumns) break; @@ -974,13 +973,12 @@ void SpMVCSRVector( const Real *inVector, return; Real result = 0.0; - Index column; const Index laneID = index % warpSize; Index endID = rowPointers[warpID + 1]; /* Calculate result */ for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) { - column = columnIndexes[i]; + Index column = columnIndexes[i]; if (column >= getColumns) break; @@ -1047,27 +1045,138 @@ void SpMVCSRLight( const Real *inVector, template< typename Real, typename Index> __global__ -void SpMVCSRLightWithoutAtomic( const Real *inVector, - Real* outVector, - const Index* rowPointers, - const Index* columnIndexes, - const Real* values, - const Index rows, - const Index getColumns, - const Index groupSize, - const Index gridID) { +void SpMVCSRLightWithoutAtomic2( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index getColumns, + const Index gridID) { const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; - const Index row = index / groupSize; + const Index row = index / 2; + + if (row >= rows) + return; + + const Index inGroupID = index % 2; + const Index maxID = rowPointers[row + 1]; + + Real result = 0.0; + for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 2) { + Index column = columnIndexes[i]; + if (column >= getColumns) + break; + + result += values[i] * inVector[column]; + } + + /* Parallel reduction */ + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + + /* Write result */ + if (inGroupID == 0) outVector[row] = result; +} + +template< typename Real, + typename Index> +__global__ +void SpMVCSRLightWithoutAtomic4( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index getColumns, + const Index gridID) { + const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const Index row = index / 4; + + if (row >= rows) + return; + + const Index inGroupID = index % 4; + const Index maxID = rowPointers[row + 1]; + + Real result = 0.0; + for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 4) { + Index column = columnIndexes[i]; + if (column >= getColumns) + break; + + result += values[i] * inVector[column]; + } + + /* Parallel reduction */ + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + + /* Write result */ + if (inGroupID == 0) outVector[row] = result; +} + +template< typename Real, + typename Index> +__global__ +void SpMVCSRLightWithoutAtomic8( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index getColumns, + const Index gridID) { + const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const Index row = index / 8; + Index i, column; + + if (row >= rows) + return; + + const Index inGroupID = index % 8; + const Index maxID = rowPointers[row + 1]; + + Real result = 0.0; + for (i = rowPointers[row] + inGroupID; i < maxID; i += 8) { + column = columnIndexes[i]; + if (column >= getColumns) + break; + + result += values[i] * inVector[column]; + } + + /* Parallel reduction */ + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + + /* Write result */ + if (inGroupID == 0) outVector[row] = result; +} + +template< typename Real, + typename Index> +__global__ +void SpMVCSRLightWithoutAtomic16( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index getColumns, + const Index gridID) { + const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const Index row = index / 16; Index i, column; if (row >= rows) return; - const Index inGroupID = index % groupSize; + const Index inGroupID = index % 16; const Index maxID = rowPointers[row + 1]; Real result = 0.0; - for (i = rowPointers[row] + inGroupID; i < maxID; i += groupSize) { + for (i = rowPointers[row] + inGroupID; i < maxID; i += 16) { column = columnIndexes[i]; if (column >= getColumns) break; @@ -1076,8 +1185,10 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector, } /* Parallel reduction */ - for (i = groupSize >> 1; i > 0; i >>= 1) - result += __shfl_down_sync(0xFFFFFFFF, result, i); + result += __shfl_down_sync(0xFFFFFFFF, result, 8); + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); /* Write result */ if (inGroupID == 0) outVector[row] = result; @@ -1241,28 +1352,30 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, neededThreads -= MAX_X_DIM * threads; } - if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector - SpMVCSRVector<<>>( - inVector, - outVector, - rowPointers, - columnIndexes, - values, - rows, - getColumns, - grid + if (groupSize == 2) { + SpMVCSRLightWithoutAtomic2<<>>( + inVector, outVector, rowPointers, columnIndexes, values, + rows, getColumns, grid ); - } else { - SpMVCSRLightWithoutAtomic<<>>( - inVector, - outVector, - rowPointers, - columnIndexes, - values, - rows, - getColumns, - groupSize, - grid + } else if (groupSize == 4) { + SpMVCSRLightWithoutAtomic4<<>>( + inVector, outVector, rowPointers, columnIndexes, values, + rows, getColumns, grid + ); + } else if (groupSize == 8) { + SpMVCSRLightWithoutAtomic8<<>>( + inVector, outVector, rowPointers, columnIndexes, values, + rows, getColumns, grid + ); + } else if (groupSize == 16) { + SpMVCSRLightWithoutAtomic16<<>>( + inVector, outVector, rowPointers, columnIndexes, values, + rows, getColumns, grid + ); + } else { // CSR SpMV Light with groupsize = 32 is CSR Vector + SpMVCSRVector<<>>( + inVector, outVector, rowPointers, columnIndexes, values, + rows, getColumns, grid ); } } @@ -1630,7 +1743,7 @@ class CSRDeviceDependentCode< Devices::Cuda > matrix.getColumnIndexes().getData(), matrix.getValues().getData(), matrix.getValues().getSize(), - matrix.getRowPointers().getSize(), + matrix.getRowPointers().getSize(), // don't add -1 ! matrix.getColumns() ); break; -- GitLab From e7b4ef8b4263fd11852a898f812ef977cff5dafa Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Wed, 8 Jul 2020 21:16:10 +0200 Subject: [PATCH 30/57] Bug fix for CSR MultiVector, optimizations for CSR LightWithoutAtomic --- src/TNL/Matrices/Legacy/CSR_impl.h | 146 +++++++++++++++-------------- 1 file changed, 77 insertions(+), 69 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 2de0becef..bdd0fa406 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -42,7 +42,6 @@ union Block { /* Configuration */ constexpr size_t MAX_X_DIM = 2147483647; -constexpr int ELEMENTS_PER_WARP = 1024; //----------------------------------------------------------------- namespace TNL { @@ -791,7 +790,8 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& template< typename Real, typename Index, int warpSize, - int sharedPerWarp > + int sharedPerWarp, + int maxElemPerWarp > __global__ void SpMVCSRAdaptive( const Real *inVector, Real *outVector, @@ -799,18 +799,18 @@ void SpMVCSRAdaptive( const Real *inVector, const Index* columnIndexes, const Real* values, const Block *blocks, - Index blocks_size, + Index blocksSize, Index getColumns, Index gridID) { __shared__ Real shared_res[49152/sizeof(Real)]; const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; - if (blockIdx >= blocks_size) + if (blockIdx >= blocksSize) return; Block block = blocks[blockIdx]; Real result = 0.0; - const Index laneID = index % warpSize; + const Index laneID = threadIdx.x % warpSize; const Index minID = rowPointers[block.index[0]/* minRow */]; Index i, to, column, offset, maxID; if (block.byte[7] == 0) { @@ -859,8 +859,8 @@ void SpMVCSRAdaptive( const Real *inVector, /////////////////////////////////////* CSR VECTOR L *///////////// maxID = rowPointers[block.index[0]/* minRow */ + 1]; - offset = block.index[1]/* warpInRow */ * ELEMENTS_PER_WARP; - to = minID + (block.index[1]/* warpInRow */ + 1) * ELEMENTS_PER_WARP; + offset = block.index[1]/* warpInRow */ * maxElemPerWarp; + to = minID + (block.index[1]/* warpInRow */ + 1) * maxElemPerWarp; if (to > maxID) to = maxID; for (i = minID + offset + laneID; i < to; i += warpSize) { @@ -892,15 +892,15 @@ void SpMVCSRScalar( const Real *inVector, const Index rows, const Index getColumns, const Index gridID) { - const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; - if (index >= rows) + const Index row = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + if (row >= rows) return; Index column; Real result = 0.0; - const Index endID = rowPointers[index + 1]; + const Index endID = rowPointers[row + 1]; - for (Index i = rowPointers[index]; i < endID; ++i) { + for (Index i = rowPointers[row]; i < endID; ++i) { column = columnIndexes[i]; if (column >= getColumns) break; @@ -908,7 +908,7 @@ void SpMVCSRScalar( const Real *inVector, result += values[i] * inVector[column]; } - outVector[index] = result; + outVector[row] = result; } template< typename Real, @@ -922,21 +922,23 @@ void SpMVCSRMultiVector( const Real *inVector, const Real* values, const Index rows, const Index getColumns, - const Index offset, + const Index warps, // warps per row const Index gridID) { - const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; - const Index rowID = index / offset; + const Index warpID = + ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; + const Index rowID = warpID / warps; if (rowID >= rows) return; - const Index inRowID = index % offset; + const Index laneID = threadIdx.x % warpSize; + const Index offset = warps * warpSize; Real result = 0.0; Index endID = rowPointers[rowID + 1]; - /* Calculate result */ - for (Index i = rowPointers[rowID] + inRowID; i < endID; i += offset) { + for (Index i = rowPointers[rowID] + (warpID % warps) * warpSize + laneID; + i < endID; i += offset) { Index column = columnIndexes[i]; if (column >= getColumns) break; @@ -951,7 +953,7 @@ void SpMVCSRMultiVector( const Real *inVector, result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); /* Write result */ - if (index % warpSize == 0) atomicAdd(&outVector[rowID], result); + if (laneID == 0) atomicAdd(&outVector[rowID], result); } template< typename Real, @@ -967,13 +969,12 @@ void SpMVCSRVector( const Real *inVector, const Index getColumns, const Index gridID) { - const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; - const Index warpID = index / warpSize; + const Index warpID = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; if (warpID >= rows) return; Real result = 0.0; - const Index laneID = index % warpSize; + const Index laneID = threadIdx.x % warpSize; Index endID = rowPointers[warpID + 1]; /* Calculate result */ @@ -1017,7 +1018,7 @@ void SpMVCSRLight( const Real *inVector, /* Get row number */ if (inGroupID == 0) row = atomicAdd(rowCnt, 1); - /* Propagate row number in group */ + /* share row number in group */ row = __shfl_sync(0xFFFFFFFF, row, groupID * groupSize); if (row >= rows) return; @@ -1053,13 +1054,12 @@ void SpMVCSRLightWithoutAtomic2( const Real *inVector, const Index rows, const Index getColumns, const Index gridID) { - const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; - const Index row = index / 2; - + const Index row = + ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 2; if (row >= rows) return; - const Index inGroupID = index % 2; + const Index inGroupID = threadIdx.x % 2; const Index maxID = rowPointers[row + 1]; Real result = 0.0; @@ -1089,13 +1089,12 @@ void SpMVCSRLightWithoutAtomic4( const Real *inVector, const Index rows, const Index getColumns, const Index gridID) { - const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; - const Index row = index / 4; - + const Index row = + ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 4; if (row >= rows) return; - const Index inGroupID = index % 4; + const Index inGroupID = threadIdx.x % 4; const Index maxID = rowPointers[row + 1]; Real result = 0.0; @@ -1126,14 +1125,13 @@ void SpMVCSRLightWithoutAtomic8( const Real *inVector, const Index rows, const Index getColumns, const Index gridID) { - const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; - const Index row = index / 8; - Index i, column; - + const Index row = + ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 8; if (row >= rows) return; - const Index inGroupID = index % 8; + Index i, column; + const Index inGroupID = threadIdx.x % 8; const Index maxID = rowPointers[row + 1]; Real result = 0.0; @@ -1165,14 +1163,14 @@ void SpMVCSRLightWithoutAtomic16( const Real *inVector, const Index rows, const Index getColumns, const Index gridID) { - const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; - const Index row = index / 16; - Index i, column; - + const Index row = + ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 16; if (row >= rows) return; - const Index inGroupID = index % 16; + + Index i, column; + const Index inGroupID = threadIdx.x % 16; const Index maxID = rowPointers[row + 1]; Real result = 0.0; @@ -1195,8 +1193,7 @@ void SpMVCSRLightWithoutAtomic16( const Real *inVector, } template< typename Real, - typename Index, - int warpSize > + typename Index > void SpMVCSRScalarPrepare( const Real *inVector, Real* outVector, const Index* rowPointers, @@ -1267,8 +1264,7 @@ void SpMVCSRVectorPrepare( const Real *inVector, } template< typename Real, - typename Index, - int warpSize > + typename Index > void SpMVCSRLightPrepare( const Real *inVector, Real* outVector, const Index* rowPointers, @@ -1278,7 +1274,7 @@ void SpMVCSRLightPrepare( const Real *inVector, const Index rows, const Index getColumns) { const Index threads = 1024; // max block size - Index blocks, groupSize; + Index groupSize; /* Copy rowCnt to GPU */ unsigned rowCnt = 0; unsigned *kernelRowCnt = nullptr; @@ -1287,7 +1283,8 @@ void SpMVCSRLightPrepare( const Real *inVector, cudaDeviceProp properties; cudaGetDeviceProperties( &properties, Cuda::DeviceInfo::getActiveDevice() ); - blocks = properties.multiProcessorCount * properties.maxThreadsPerMultiProcessor / threads; + Index blocks = + properties.multiProcessorCount * properties.maxThreadsPerMultiProcessor / threads; const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row if (nnz <= 2) @@ -1316,7 +1313,8 @@ void SpMVCSRLightPrepare( const Real *inVector, template< typename Real, typename Index, - int warpSize > + int warpSize, + int maxElemPerWarp > void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, Real* outVector, const Index* rowPointers, @@ -1338,8 +1336,10 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, groupSize = 8; else if (nnz <= 16) groupSize = 16; + else if (nnz <= maxElemPerWarp) + groupSize = 32; // CSR Vector else - groupSize = 32; + groupSize = roundUpDivision(nnz, maxElemPerWarp) * 32; // CSR MultiVector neededThreads = groupSize * rows; /* Execute kernels on device */ @@ -1372,18 +1372,24 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, inVector, outVector, rowPointers, columnIndexes, values, rows, getColumns, grid ); - } else { // CSR SpMV Light with groupsize = 32 is CSR Vector + } else if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector SpMVCSRVector<<>>( inVector, outVector, rowPointers, columnIndexes, values, rows, getColumns, grid ); + } else { // Execute CSR MultiVector + SpMVCSRMultiVector<<>>( + inVector, outVector, rowPointers, columnIndexes, values, + rows, getColumns, groupSize / 32, grid + ); } } } template< typename Real, typename Index, - int warpSize > + int warpSize, + int maxElemPerWarp> void SpMVCSRMultiVectorPrepare( const Real *inVector, Real* outVector, const Index* rowPointers, @@ -1398,9 +1404,8 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, Index blocks; const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row - const size_t neededWarps = roundUpDivision(nnz, ELEMENTS_PER_WARP); // warps per row - const Index offset = neededWarps * ELEMENTS_PER_WARP; - size_t neededThreads = offset * rows; + const Index neededWarps = roundUpDivision(nnz, maxElemPerWarp); // warps per row + size_t neededThreads = warpSize * neededWarps * rows; /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { @@ -1431,7 +1436,7 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, values, rows, getColumns, - offset, + neededWarps, grid ); } @@ -1442,7 +1447,8 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, template< typename Real, typename Index, typename Device, - CSRKernel KernelType> + CSRKernel KernelType, + int maxElemPerWarp> Index findLimit(const Index start, const Index max, const CSR< Real, Device, Index, KernelType >& matrix, const Index size, @@ -1458,7 +1464,7 @@ Index findLimit(const Index start, const Index max, type = STREAM; return current; } else { // one long row - if (sum <= ELEMENTS_PER_WARP) + if (sum <= maxElemPerWarp) type = VECTOR; else type = LONG; @@ -1475,7 +1481,8 @@ template< typename Real, typename Index, typename Device, CSRKernel KernelType, - int warpSize > + int warpSize, + int maxElemPerWarp > void SpMVCSRAdaptivePrepare( const Real *inVector, Real* outVector, const CSR< Real, Device, Index, KernelType >& matrix, @@ -1488,10 +1495,9 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, /* Configuration ---------------------------------------------------*/ /* Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache 512 threads per block for double (12 elements per thread) */ - constexpr size_t THREADS_PER_BLOCK = sizeof(Real) == 4 ? 1024 : 512; + constexpr Index THREADS_PER_BLOCK = sizeof(Real) == 4 ? 1024 : 512; constexpr Index WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32; - constexpr Index SHARED = 49152/sizeof(Real); - constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK; + constexpr Index SHARED_PER_WARP = 49152/sizeof(Real) / WARPS_PER_BLOCK; //-------------------------------------------------------------------- Index blocks, sum, start = 0, nextStart = 0; const Index threads = THREADS_PER_BLOCK; @@ -1502,9 +1508,11 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, while (nextStart != rows - 1) { Type type; - nextStart = findLimit(start, SHARED_PER_WARP, matrix, rows, type, sum); + nextStart = findLimit( + start, SHARED_PER_WARP, matrix, rows, type, sum + ); if (type == LONG) { - uint32_t parts = roundUpDivision(sum, ELEMENTS_PER_WARP); + uint32_t parts = roundUpDivision(sum, maxElemPerWarp); for (uint32_t index = 0; index < parts; ++index) { inBlock.emplace_back(start, LONG, index); } @@ -1532,7 +1540,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, neededThreads -= MAX_X_DIM * threads; } - SpMVCSRAdaptive<<>>( + SpMVCSRAdaptive<<>>( inVector, outVector, rowPointers, @@ -1701,7 +1709,7 @@ class CSRDeviceDependentCode< Devices::Cuda > switch(KernelType) { case CSRScalar: - SpMVCSRScalarPrepare( + SpMVCSRScalarPrepare( inVector.getData(), outVector.getData(), matrix.getRowPointers().getData(), @@ -1723,7 +1731,7 @@ class CSRDeviceDependentCode< Devices::Cuda > ); break; case CSRLight: - SpMVCSRLightPrepare( + SpMVCSRLightPrepare( inVector.getData(), outVector.getData(), matrix.getRowPointers().getData(), @@ -1735,7 +1743,7 @@ class CSRDeviceDependentCode< Devices::Cuda > ); break; case CSRAdaptive: - SpMVCSRAdaptivePrepare( + SpMVCSRAdaptivePrepare( inVector.getData(), outVector.getData(), matrix, @@ -1748,7 +1756,7 @@ class CSRDeviceDependentCode< Devices::Cuda > ); break; case CSRMultiVector: - SpMVCSRMultiVectorPrepare( + SpMVCSRMultiVectorPrepare( inVector.getData(), outVector.getData(), matrix.getRowPointers().getData(), @@ -1760,7 +1768,7 @@ class CSRDeviceDependentCode< Devices::Cuda > ); break; case CSRLightWithoutAtomic: - SpMVCSRLightWithoutAtomicPrepare( + SpMVCSRLightWithoutAtomicPrepare( inVector.getData(), outVector.getData(), matrix.getRowPointers().getData(), -- GitLab From 5c9c5d81734b94c6bea42d8a1d3d40e962716615 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Thu, 9 Jul 2020 21:48:08 +0200 Subject: [PATCH 31/57] Added setBlocks method, commented getColumns in for cycles --- src/TNL/Matrices/Legacy/CSR.h | 26 ++ src/TNL/Matrices/Legacy/CSR_impl.h | 446 ++++++++++++++++------------- 2 files changed, 276 insertions(+), 196 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 49ae6da11..bd7c5fade 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -15,11 +15,30 @@ #include #include +#include // vector for blocks namespace TNL { namespace Matrices { namespace Legacy { +enum class Type { + /* LONG = 0!!! Non zero value rewrites index[1] */ + LONG = 0, + STREAM = 1, + VECTOR = 2 +}; + +union Block { + void set(uint32_t row, Type type = Type::VECTOR, uint32_t index = 0) noexcept { + this->index[0] = row; + this->index[1] = index; + this->byte[7] = (uint8_t)type; + } + + unsigned index[2]; // index[0] is row pointer, index[1] is index in warp + uint8_t byte[8]; // byte[7] is type specificator +}; + #ifdef HAVE_UMFPACK template< typename Matrix, typename Preconditioner > class UmfpackWrapper; @@ -66,6 +85,10 @@ public: constexpr CSRKernel getSpMVKernelType() { return KernelType; }; //enum SPMVCudaKernel { scalar, vector, hybrid }; + + Containers::Vector< Block, Device, Index > blocks; + Index maxElementsPerWarp = 1024; + using Sparse< Real, Device, Index >::getAllocatedElementsCount; CSR(); @@ -229,6 +252,9 @@ public: const IndexType gridIdx ) const; #endif + /* Analyze rowPointers, columnIndecies and values to create block for CSR Adaptive */ + void setBlocks(); + // The following getters allow us to interface TNL with external C-like // libraries such as UMFPACK or SuperLU, which need the raw data. const Containers::Vector< Index, Device, Index >& diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index bdd0fa406..3841515b8 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -23,22 +23,6 @@ #include #endif -enum Type { - STREAM = 0, - VECTOR = 1, - LONG = 2 -}; - -union Block { - Block(uint32_t row, Type type = VECTOR, uint32_t index = 0) noexcept { - this->index[0] = row; - this->index[1] = index; - this->byte[7] = (uint8_t)type; - } - - uint32_t index[2]; // index[0] is row pointer, index[1] is index in warp - uint8_t byte[8]; // byte[7] is type specificator -}; /* Configuration */ constexpr size_t MAX_X_DIM = 2147483647; @@ -128,6 +112,79 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompr this->values.setSize( this->rowPointers.getElement( this->rows ) ); this->columnIndexes.setSize( this->rowPointers.getElement( this->rows ) ); this->columnIndexes.setValue( this->columns ); + + // if (KernelType == CSRAdaptive) + this->setBlocks(); +} + +/* Find limit of block */ +template< typename Real, + typename Index, + typename Device, + CSRKernel KernelType, + int maxElemPerWarp> +Index findLimit(const Index start, const Index max, + const CSR< Real, Device, Index, KernelType >& matrix, + const Index size, + Type &type, + Index &sum) { + sum = 0; + for (Index current = start; current < size - 1; ++current) { + Index elements = matrix.getRowPointers().getElement(current + 1) - + matrix.getRowPointers().getElement(current); + sum += elements; + if (sum > max) { + if (current - start > 1) { // extra row + type = Type::STREAM; + return current; + } else { // one long row + if (sum <= maxElemPerWarp) + type = Type::VECTOR; + else + type = Type::LONG; + return current + 1; + } + } + } + + type = Type::STREAM; + return size - 1; // return last row pointer +} + +template< typename Real, + typename Device, + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::setBlocks() +{ + const Index rows = this->getRowPointers().getSize(); + Block *tmpBlocks = new Block[rows]; + Index nextStart = 0, start = 0, cnt = 0, sum = 0; + + while (nextStart != rows - 1) { + Type type; + nextStart = findLimit( + start, this->maxElementsPerWarp, *this, rows, type, sum + ); + if (type == Type::LONG) { + uint32_t parts = roundUpDivision(sum, this->maxElementsPerWarp); + for (uint32_t index = 0; index < parts; ++index) { + tmpBlocks[cnt++].set(start, Type::LONG, index); + } + } else { + tmpBlocks[cnt++].set(start, type); + } + + start = nextStart; + } + tmpBlocks[cnt++].set(nextStart); + + /* Copy to TNL Vector */ + this->blocks.setSize(cnt); + for (Index i = 0; i < cnt; ++i) + this->blocks.setElement(i, tmpBlocks[i]); + + delete [] tmpBlocks; } template< typename Real, @@ -812,8 +869,8 @@ void SpMVCSRAdaptive( const Real *inVector, Real result = 0.0; const Index laneID = threadIdx.x % warpSize; const Index minID = rowPointers[block.index[0]/* minRow */]; - Index i, to, column, offset, maxID; - if (block.byte[7] == 0) { + Index i, to, offset, maxID; + if (block.byte[7] == 1) { /////////////////////////////////////* CSR STREAM *////////////// const Index maxRow = blocks[blockIdx + 1].index[0]; maxID = rowPointers[maxRow]; @@ -821,10 +878,10 @@ void SpMVCSRAdaptive( const Real *inVector, offset = minID - (threadIdx.x / warpSize * sharedPerWarp); /* Copy and calculate elements from global to shared memory, coalesced */ for (i = laneID + minID; i < maxID; i += warpSize) { - column = columnIndexes[i]; - if (column >= getColumns) - continue; // can't be break - shared_res[i - offset] = values[i] * inVector[column]; + // column = columnIndexes[i]; + // if (column >= getColumns) + // continue; // can't be break + shared_res[i - offset] = values[i] * inVector[columnIndexes[i]]; } /* Calculate result */ @@ -837,16 +894,16 @@ void SpMVCSRAdaptive( const Real *inVector, outVector[i] = result; // Write result } - } else if (block.byte[7] == 1) { + } else if (block.byte[7] == 2) { /////////////////////////////////////* CSR VECTOR *////////////// maxID = rowPointers[block.index[0]/* minRow */ + 1]; for (i = minID + laneID; i < maxID; i += warpSize) { - column = columnIndexes[i]; - if (column >= getColumns) - break; + // column = columnIndexes[i]; + // if (column >= getColumns) + // break; - result += values[i] * inVector[column]; + result += values[i] * inVector[columnIndexes[i]]; } /* Parallel reduction */ result += __shfl_down_sync(0xFFFFFFFF, result, 16); @@ -862,13 +919,13 @@ void SpMVCSRAdaptive( const Real *inVector, offset = block.index[1]/* warpInRow */ * maxElemPerWarp; to = minID + (block.index[1]/* warpInRow */ + 1) * maxElemPerWarp; if (to > maxID) to = maxID; - + // if (laneID == 0) printf("BLOCK %d WARP %d\n", (int)block.index[0], (int)block.index[1]); for (i = minID + offset + laneID; i < to; i += warpSize) { - column = columnIndexes[i]; - if (column >= getColumns) - break; + // column = columnIndexes[i]; + // if (column >= getColumns) + // break; - result += values[i] * inVector[column]; + result += values[i] * inVector[columnIndexes[i]]; } /* Parallel reduction */ @@ -896,16 +953,16 @@ void SpMVCSRScalar( const Real *inVector, if (row >= rows) return; - Index column; + // Index column; Real result = 0.0; const Index endID = rowPointers[row + 1]; for (Index i = rowPointers[row]; i < endID; ++i) { - column = columnIndexes[i]; - if (column >= getColumns) - break; + // column = columnIndexes[i]; + // if (column >= getColumns) + // break; - result += values[i] * inVector[column]; + result += values[i] * inVector[columnIndexes[i]]; } outVector[row] = result; @@ -939,11 +996,11 @@ void SpMVCSRMultiVector( const Real *inVector, /* Calculate result */ for (Index i = rowPointers[rowID] + (warpID % warps) * warpSize + laneID; i < endID; i += offset) { - Index column = columnIndexes[i]; - if (column >= getColumns) - break; + // Index column = columnIndexes[i]; + // if (column >= getColumns) + // break; - result += values[i] * inVector[column]; + result += values[i] * inVector[columnIndexes[i]]; } /* Reduction */ @@ -978,13 +1035,8 @@ void SpMVCSRVector( const Real *inVector, Index endID = rowPointers[warpID + 1]; /* Calculate result */ - for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) { - Index column = columnIndexes[i]; - if (column >= getColumns) - break; - - result += values[i] * inVector[column]; - } + for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) + result += values[i] * inVector[columnIndexes[i]]; /* Reduction */ result += __shfl_down_sync(0xFFFFFFFF, result, 16); @@ -1027,11 +1079,11 @@ void SpMVCSRLight( const Real *inVector, result = 0.0; for (i = rowPointers[row] + inGroupID; i < maxID; i += groupSize) { - const Index column = columnIndexes[i]; - if (column >= getColumns) - break; + // const Index column = columnIndexes[i]; + // if (column >= getColumns) + // break; - result += values[i] * inVector[column]; + result += values[i] * inVector[columnIndexes[i]]; } /* Parallel reduction */ @@ -1064,11 +1116,11 @@ void SpMVCSRLightWithoutAtomic2( const Real *inVector, Real result = 0.0; for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 2) { - Index column = columnIndexes[i]; - if (column >= getColumns) - break; + // Index column = columnIndexes[i]; + // if (column >= getColumns) + // break; - result += values[i] * inVector[column]; + result += values[i] * inVector[columnIndexes[i]]; } /* Parallel reduction */ @@ -1099,11 +1151,11 @@ void SpMVCSRLightWithoutAtomic4( const Real *inVector, Real result = 0.0; for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 4) { - Index column = columnIndexes[i]; - if (column >= getColumns) - break; + // Index column = columnIndexes[i]; + // if (column >= getColumns) + // break; - result += values[i] * inVector[column]; + result += values[i] * inVector[columnIndexes[i]]; } /* Parallel reduction */ @@ -1130,17 +1182,17 @@ void SpMVCSRLightWithoutAtomic8( const Real *inVector, if (row >= rows) return; - Index i, column; + Index i; const Index inGroupID = threadIdx.x % 8; const Index maxID = rowPointers[row + 1]; Real result = 0.0; for (i = rowPointers[row] + inGroupID; i < maxID; i += 8) { - column = columnIndexes[i]; - if (column >= getColumns) - break; + // column = columnIndexes[i]; + // if (column >= getColumns) + // break; - result += values[i] * inVector[column]; + result += values[i] * inVector[columnIndexes[i]]; } /* Parallel reduction */ @@ -1169,17 +1221,17 @@ void SpMVCSRLightWithoutAtomic16( const Real *inVector, return; - Index i, column; + Index i; const Index inGroupID = threadIdx.x % 16; const Index maxID = rowPointers[row + 1]; Real result = 0.0; for (i = rowPointers[row] + inGroupID; i < maxID; i += 16) { - column = columnIndexes[i]; - if (column >= getColumns) - break; + // column = columnIndexes[i]; + // if (column >= getColumns) + // break; - result += values[i] * inVector[column]; + result += values[i] * inVector[columnIndexes[i]]; } /* Parallel reduction */ @@ -1444,38 +1496,38 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, } /* Find limit of block */ -template< typename Real, - typename Index, - typename Device, - CSRKernel KernelType, - int maxElemPerWarp> -Index findLimit(const Index start, const Index max, - const CSR< Real, Device, Index, KernelType >& matrix, - const Index size, - Type &type, - Index &sum) { - sum = 0; - for (Index current = start; current < size - 1; ++current) { - Index elements = matrix.getRowPointers().getElement(current + 1) - - matrix.getRowPointers().getElement(current); - sum += elements; - if (sum > max) { - if (current - start > 1) { // extra row - type = STREAM; - return current; - } else { // one long row - if (sum <= maxElemPerWarp) - type = VECTOR; - else - type = LONG; - return current + 1; - } - } - } - - type = STREAM; - return size - 1; // return last row pointer -} +// template< typename Real, +// typename Index, +// typename Device, +// CSRKernel KernelType, +// int maxElemPerWarp> +// Index findLimit(const Index start, const Index max, +// const CSR< Real, Device, Index, KernelType >& matrix, +// const Index size, +// Type &type, +// Index &sum) { +// sum = 0; +// for (Index current = start; current < size - 1; ++current) { +// Index elements = matrix.getRowPointers().getElement(current + 1) - +// matrix.getRowPointers().getElement(current); +// sum += elements; +// if (sum > max) { +// if (current - start > 1) { // extra row +// type = STREAM; +// return current; +// } else { // one long row +// if (sum <= maxElemPerWarp) +// type = VECTOR; +// else +// type = LONG; +// return current + 1; +// } +// } +// } + +// type = STREAM; +// return size - 1; // return last row pointer +// } template< typename Real, typename Index, @@ -1499,37 +1551,39 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, constexpr Index WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32; constexpr Index SHARED_PER_WARP = 49152/sizeof(Real) / WARPS_PER_BLOCK; //-------------------------------------------------------------------- - Index blocks, sum, start = 0, nextStart = 0; + // Index blocks, sum, start = 0, nextStart = 0; + Index blocks; const Index threads = THREADS_PER_BLOCK; /* Fill blocks */ - std::vector inBlock; - inBlock.reserve(rows); // reserve space to avoid reallocation - - while (nextStart != rows - 1) { - Type type; - nextStart = findLimit( - start, SHARED_PER_WARP, matrix, rows, type, sum - ); - if (type == LONG) { - uint32_t parts = roundUpDivision(sum, maxElemPerWarp); - for (uint32_t index = 0; index < parts; ++index) { - inBlock.emplace_back(start, LONG, index); - } - } else { - inBlock.emplace_back(start, type); - } - - start = nextStart; - } - inBlock.emplace_back(nextStart); - - /* blocks to GPU */ - Block *blocksAdaptive = nullptr; - cudaMalloc((void **)&blocksAdaptive, sizeof(*blocksAdaptive) * inBlock.size()); - cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(*blocksAdaptive), cudaMemcpyHostToDevice); - - size_t neededThreads = inBlock.size() * 32; // one warp per block + // std::vector inBlock; + // inBlock.reserve(rows); // reserve space to avoid reallocation + + // while (nextStart != rows - 1) { + // Type type; + // nextStart = findLimit( + // start, SHARED_PER_WARP, matrix, rows, type, sum + // ); + // if (type == LONG) { + // uint32_t parts = roundUpDivision(sum, maxElemPerWarp); + // for (uint32_t index = 0; index < parts; ++index) { + // inBlock.emplace_back(start, LONG, index); + // } + // } else { + // inBlock.emplace_back(start, type); + // } + + // start = nextStart; + // } + // inBlock.emplace_back(nextStart); + + // /* blocks to GPU */ + // Block *blocksAdaptive = nullptr; + // cudaMalloc((void **)&blocksAdaptive, sizeof(*blocksAdaptive) * inBlock.size()); + // cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(*blocksAdaptive), cudaMemcpyHostToDevice); + + // size_t neededThreads = inBlock.size() * 32; // one warp per block + size_t neededThreads = matrix.blocks.getSize() * 32; // one warp per block /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { @@ -1546,14 +1600,14 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, rowPointers, columnIndexes, values, - blocksAdaptive, - inBlock.size() - 1, // last block shouldn't be used + matrix.blocks.getData(), + matrix.blocks.getSize() - 1, // last block shouldn't be used getColumns, grid ); } - cudaFree(blocksAdaptive); + // cudaFree(blocksAdaptive); } #endif @@ -1706,43 +1760,43 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else - switch(KernelType) - { - case CSRScalar: - SpMVCSRScalarPrepare( - inVector.getData(), - outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - case CSRVector: - SpMVCSRVectorPrepare( - inVector.getData(), - outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - case CSRLight: - SpMVCSRLightPrepare( - inVector.getData(), - outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getValues().getSize(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - case CSRAdaptive: + // switch(KernelType) + // { + // case CSRScalar: + // SpMVCSRScalarPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // case CSRVector: + // SpMVCSRVectorPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // case CSRLight: + // SpMVCSRLightPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getValues().getSize(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // case CSRAdaptive: SpMVCSRAdaptivePrepare( inVector.getData(), outVector.getData(), @@ -1754,32 +1808,32 @@ class CSRDeviceDependentCode< Devices::Cuda > matrix.getRowPointers().getSize(), // don't add -1 ! matrix.getColumns() ); - break; - case CSRMultiVector: - SpMVCSRMultiVectorPrepare( - inVector.getData(), - outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getValues().getSize(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - case CSRLightWithoutAtomic: - SpMVCSRLightWithoutAtomicPrepare( - inVector.getData(), - outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getValues().getSize(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() - ); - break; - } + // break; + // case CSRMultiVector: + // SpMVCSRMultiVectorPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getValues().getSize(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // case CSRLightWithoutAtomic: + // SpMVCSRLightWithoutAtomicPrepare( + // inVector.getData(), + // outVector.getData(), + // matrix.getRowPointers().getData(), + // matrix.getColumnIndexes().getData(), + // matrix.getValues().getData(), + // matrix.getValues().getSize(), + // matrix.getRowPointers().getSize() - 1, + // matrix.getColumns() + // ); + // break; + // } #endif /* HAVE_CUDA */ #endif } -- GitLab From e3b27a6102f6a80cc237d6f2bb722437b3586fe3 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Fri, 10 Jul 2020 16:12:34 +0200 Subject: [PATCH 32/57] Fixed blocks filling --- src/TNL/Matrices/Legacy/CSR.h | 3 +- src/TNL/Matrices/Legacy/CSR_impl.h | 227 +++++++++++------------------ 2 files changed, 84 insertions(+), 146 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index bd7c5fade..4c46d9bb0 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -29,7 +29,7 @@ enum class Type { }; union Block { - void set(uint32_t row, Type type = Type::VECTOR, uint32_t index = 0) noexcept { + Block(uint32_t row, Type type = Type::VECTOR, uint32_t index = 0) noexcept { this->index[0] = row; this->index[1] = index; this->byte[7] = (uint8_t)type; @@ -87,6 +87,7 @@ public: Containers::Vector< Block, Device, Index > blocks; + Index maxElementsPerWarp = 1024; using Sparse< Real, Device, Index >::getAllocatedElementsCount; diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 3841515b8..106a9f2c4 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include // for blocks in CSR Adaptive #ifdef HAVE_CUSPARSE #include @@ -113,7 +113,7 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompr this->columnIndexes.setSize( this->rowPointers.getElement( this->rows ) ); this->columnIndexes.setValue( this->columns ); - // if (KernelType == CSRAdaptive) + if (KernelType == CSRAdaptive) this->setBlocks(); } @@ -121,11 +121,11 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompr template< typename Real, typename Index, typename Device, - CSRKernel KernelType, - int maxElemPerWarp> + CSRKernel KernelType> Index findLimit(const Index start, const Index max, const CSR< Real, Device, Index, KernelType >& matrix, const Index size, + const Index maxElemPerWarp, Type &type, Index &sum) { sum = 0; @@ -158,33 +158,34 @@ template< typename Real, void CSR< Real, Device, Index, KernelType >::setBlocks() { const Index rows = this->getRowPointers().getSize(); - Block *tmpBlocks = new Block[rows]; - Index nextStart = 0, start = 0, cnt = 0, sum = 0; + Index sum, start = 0, nextStart = 0; + + /* Fill blocks */ + std::vector inBlock; + inBlock.reserve(rows); // reserve space to avoid reallocation while (nextStart != rows - 1) { Type type; - nextStart = findLimit( - start, this->maxElementsPerWarp, *this, rows, type, sum + nextStart = findLimit( + start, 384, *this, rows, this->maxElementsPerWarp, type, sum ); if (type == Type::LONG) { - uint32_t parts = roundUpDivision(sum, this->maxElementsPerWarp); + uint32_t parts = roundUpDivision(sum, 384); for (uint32_t index = 0; index < parts; ++index) { - tmpBlocks[cnt++].set(start, Type::LONG, index); + inBlock.emplace_back(start, Type::LONG, index); } } else { - tmpBlocks[cnt++].set(start, type); + inBlock.emplace_back(start, type); } start = nextStart; } - tmpBlocks[cnt++].set(nextStart); - - /* Copy to TNL Vector */ - this->blocks.setSize(cnt); - for (Index i = 0; i < cnt; ++i) - this->blocks.setElement(i, tmpBlocks[i]); + inBlock.emplace_back(nextStart); - delete [] tmpBlocks; + /* Copy values */ + this->blocks.setSize(inBlock.size()); + for (size_t i = 0; i < inBlock.size(); ++i) + this->blocks.setElement(i, inBlock[i]); } template< typename Real, @@ -1495,40 +1496,6 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, } } -/* Find limit of block */ -// template< typename Real, -// typename Index, -// typename Device, -// CSRKernel KernelType, -// int maxElemPerWarp> -// Index findLimit(const Index start, const Index max, -// const CSR< Real, Device, Index, KernelType >& matrix, -// const Index size, -// Type &type, -// Index &sum) { -// sum = 0; -// for (Index current = start; current < size - 1; ++current) { -// Index elements = matrix.getRowPointers().getElement(current + 1) - -// matrix.getRowPointers().getElement(current); -// sum += elements; -// if (sum > max) { -// if (current - start > 1) { // extra row -// type = STREAM; -// return current; -// } else { // one long row -// if (sum <= maxElemPerWarp) -// type = VECTOR; -// else -// type = LONG; -// return current + 1; -// } -// } -// } - -// type = STREAM; -// return size - 1; // return last row pointer -// } - template< typename Real, typename Index, typename Device, @@ -1551,38 +1518,10 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, constexpr Index WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32; constexpr Index SHARED_PER_WARP = 49152/sizeof(Real) / WARPS_PER_BLOCK; //-------------------------------------------------------------------- - // Index blocks, sum, start = 0, nextStart = 0; Index blocks; const Index threads = THREADS_PER_BLOCK; /* Fill blocks */ - // std::vector inBlock; - // inBlock.reserve(rows); // reserve space to avoid reallocation - - // while (nextStart != rows - 1) { - // Type type; - // nextStart = findLimit( - // start, SHARED_PER_WARP, matrix, rows, type, sum - // ); - // if (type == LONG) { - // uint32_t parts = roundUpDivision(sum, maxElemPerWarp); - // for (uint32_t index = 0; index < parts; ++index) { - // inBlock.emplace_back(start, LONG, index); - // } - // } else { - // inBlock.emplace_back(start, type); - // } - - // start = nextStart; - // } - // inBlock.emplace_back(nextStart); - - // /* blocks to GPU */ - // Block *blocksAdaptive = nullptr; - // cudaMalloc((void **)&blocksAdaptive, sizeof(*blocksAdaptive) * inBlock.size()); - // cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(*blocksAdaptive), cudaMemcpyHostToDevice); - - // size_t neededThreads = inBlock.size() * 32; // one warp per block size_t neededThreads = matrix.blocks.getSize() * 32; // one warp per block /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { @@ -1606,8 +1545,6 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, grid ); } - - // cudaFree(blocksAdaptive); } #endif @@ -1760,43 +1697,43 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else - // switch(KernelType) - // { - // case CSRScalar: - // SpMVCSRScalarPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // case CSRVector: - // SpMVCSRVectorPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // case CSRLight: - // SpMVCSRLightPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getValues().getSize(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // case CSRAdaptive: + switch(KernelType) + { + case CSRScalar: + SpMVCSRScalarPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRVector: + SpMVCSRVectorPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRLight: + SpMVCSRLightPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getValues().getSize(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRAdaptive: SpMVCSRAdaptivePrepare( inVector.getData(), outVector.getData(), @@ -1808,32 +1745,32 @@ class CSRDeviceDependentCode< Devices::Cuda > matrix.getRowPointers().getSize(), // don't add -1 ! matrix.getColumns() ); - // break; - // case CSRMultiVector: - // SpMVCSRMultiVectorPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getValues().getSize(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // case CSRLightWithoutAtomic: - // SpMVCSRLightWithoutAtomicPrepare( - // inVector.getData(), - // outVector.getData(), - // matrix.getRowPointers().getData(), - // matrix.getColumnIndexes().getData(), - // matrix.getValues().getData(), - // matrix.getValues().getSize(), - // matrix.getRowPointers().getSize() - 1, - // matrix.getColumns() - // ); - // break; - // } + break; + case CSRMultiVector: + SpMVCSRMultiVectorPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getValues().getSize(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + case CSRLightWithoutAtomic: + SpMVCSRLightWithoutAtomicPrepare( + inVector.getData(), + outVector.getData(), + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getValues().getSize(), + matrix.getRowPointers().getSize() - 1, + matrix.getColumns() + ); + break; + } #endif /* HAVE_CUDA */ #endif } -- GitLab From 6b2330d63613a612b52cc975a4d3c023f7695a24 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Fri, 10 Jul 2020 16:32:29 +0200 Subject: [PATCH 33/57] Fixed compilation error --- src/TNL/Matrices/Legacy/CSR_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 106a9f2c4..4ee332388 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -166,7 +166,7 @@ void CSR< Real, Device, Index, KernelType >::setBlocks() while (nextStart != rows - 1) { Type type; - nextStart = findLimit( + nextStart = findLimit( start, 384, *this, rows, this->maxElementsPerWarp, type, sum ); if (type == Type::LONG) { -- GitLab From c5c1cd7c09d0231b927048e13e1098bcced83a93 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Fri, 10 Jul 2020 18:24:46 +0200 Subject: [PATCH 34/57] Added original CSR Light --- src/TNL/Matrices/Legacy/CSR.h | 11 -- src/TNL/Matrices/Legacy/CSR_impl.h | 260 +++++++++++++---------------- 2 files changed, 116 insertions(+), 155 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 4c46d9bb0..439139e3e 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -242,17 +242,6 @@ public: __cuda_callable__ IndexType getHybridModeSplit() const; -#ifdef HAVE_CUDA - - template< typename InVector, - typename OutVector, - int warpSize > - __device__ - void spmvCudaVectorized( const InVector& inVector, - OutVector& outVector, - const IndexType gridIdx ) const; -#endif - /* Analyze rowPointers, columnIndecies and values to create block for CSR Adaptive */ void setBlocks(); diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 4ee332388..100fdad18 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -799,52 +799,6 @@ Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const #ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - CSRKernel KernelType > - template< typename InVector, - typename OutVector, - int warpSize > -__device__ -void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector, - OutVector& outVector, - const IndexType gridIdx ) const -{ - IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const IndexType warpStart = warpSize * ( globalIdx / warpSize ); - const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); - const IndexType inWarpIdx = globalIdx % warpSize; - - volatile Real* aux = Cuda::getSharedMemory< Real >(); - for( IndexType row = warpStart; row < warpEnd; row++ ) - { - aux[ threadIdx.x ] = 0.0; - - IndexType elementPtr = this->rowPointers[ row ] + inWarpIdx; - const IndexType rowEnd = this->rowPointers[ row + 1 ]; - IndexType column; - while( elementPtr < rowEnd && - ( column = this->columnIndexes[ elementPtr ] ) < this->getColumns() ) - { - aux[ threadIdx.x ] += inVector[ column ] * this->values[ elementPtr ]; - elementPtr += warpSize; - } - if( warpSize == 32 ) - if( inWarpIdx < 16 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 16 ]; - if( warpSize >= 16 ) - if( inWarpIdx < 8 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 8 ]; - if( warpSize >= 8 ) - if( inWarpIdx < 4 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 4 ]; - if( warpSize >= 4 ) - if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ]; - if( warpSize >= 2 ) - if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ]; - if( inWarpIdx == 0 ) - outVector[ row ] = aux[ threadIdx.x ]; - } -} - template< typename Real, typename Index, int warpSize, @@ -868,7 +822,7 @@ void SpMVCSRAdaptive( const Real *inVector, Block block = blocks[blockIdx]; Real result = 0.0; - const Index laneID = threadIdx.x % warpSize; + const Index laneID = threadIdx.x & 31; // & is cheaper than % const Index minID = rowPointers[block.index[0]/* minRow */]; Index i, to, offset, maxID; if (block.byte[7] == 1) { @@ -878,12 +832,8 @@ void SpMVCSRAdaptive( const Real *inVector, /* offset between shared and global addresses */ offset = minID - (threadIdx.x / warpSize * sharedPerWarp); /* Copy and calculate elements from global to shared memory, coalesced */ - for (i = laneID + minID; i < maxID; i += warpSize) { - // column = columnIndexes[i]; - // if (column >= getColumns) - // continue; // can't be break + for (i = laneID + minID; i < maxID; i += warpSize) shared_res[i - offset] = values[i] * inVector[columnIndexes[i]]; - } /* Calculate result */ for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) { @@ -899,13 +849,9 @@ void SpMVCSRAdaptive( const Real *inVector, /////////////////////////////////////* CSR VECTOR *////////////// maxID = rowPointers[block.index[0]/* minRow */ + 1]; - for (i = minID + laneID; i < maxID; i += warpSize) { - // column = columnIndexes[i]; - // if (column >= getColumns) - // break; - + for (i = minID + laneID; i < maxID; i += warpSize) result += values[i] * inVector[columnIndexes[i]]; - } + /* Parallel reduction */ result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); @@ -920,14 +866,8 @@ void SpMVCSRAdaptive( const Real *inVector, offset = block.index[1]/* warpInRow */ * maxElemPerWarp; to = minID + (block.index[1]/* warpInRow */ + 1) * maxElemPerWarp; if (to > maxID) to = maxID; - // if (laneID == 0) printf("BLOCK %d WARP %d\n", (int)block.index[0], (int)block.index[1]); - for (i = minID + offset + laneID; i < to; i += warpSize) { - // column = columnIndexes[i]; - // if (column >= getColumns) - // break; - + for (i = minID + offset + laneID; i < to; i += warpSize) result += values[i] * inVector[columnIndexes[i]]; - } /* Parallel reduction */ result += __shfl_down_sync(0xFFFFFFFF, result, 16); @@ -954,17 +894,11 @@ void SpMVCSRScalar( const Real *inVector, if (row >= rows) return; - // Index column; Real result = 0.0; const Index endID = rowPointers[row + 1]; - for (Index i = rowPointers[row]; i < endID; ++i) { - // column = columnIndexes[i]; - // if (column >= getColumns) - // break; - + for (Index i = rowPointers[row]; i < endID; ++i) result += values[i] * inVector[columnIndexes[i]]; - } outVector[row] = result; } @@ -989,7 +923,7 @@ void SpMVCSRMultiVector( const Real *inVector, if (rowID >= rows) return; - const Index laneID = threadIdx.x % warpSize; + const Index laneID = threadIdx.x & 31; // & is cheaper than % const Index offset = warps * warpSize; Real result = 0.0; @@ -997,10 +931,6 @@ void SpMVCSRMultiVector( const Real *inVector, /* Calculate result */ for (Index i = rowPointers[rowID] + (warpID % warps) * warpSize + laneID; i < endID; i += offset) { - // Index column = columnIndexes[i]; - // if (column >= getColumns) - // break; - result += values[i] * inVector[columnIndexes[i]]; } @@ -1032,7 +962,7 @@ void SpMVCSRVector( const Real *inVector, return; Real result = 0.0; - const Index laneID = threadIdx.x % warpSize; + const Index laneID = threadIdx.x & 31; // & is cheaper than % Index endID = rowPointers[warpID + 1]; /* Calculate result */ @@ -1050,7 +980,9 @@ void SpMVCSRVector( const Real *inVector, } template< typename Real, - typename Index > + typename Index, + int groupSize, + int MAX_NUM_VECTORS_PER_BLOCK > __global__ void SpMVCSRLight( const Real *inVector, Real* outVector, @@ -1059,41 +991,78 @@ void SpMVCSRLight( const Real *inVector, const Real* values, const Index rows, const Index getColumns, - const Index groupSize, unsigned *rowCnt) { - const Index groupID = threadIdx.x / groupSize; - const Index inGroupID = threadIdx.x % groupSize; - Index row, maxID, i; - Real result; + Index i; + Real sum; + Index row; + Index rowStart, rowEnd; + const Index laneId = threadIdx.x % groupSize; /*lane index in the vector*/ + const Index vectorId = threadIdx.x / groupSize; /*vector index in the thread block*/ + const Index warpLaneId = threadIdx.x & 31; /*lane index in the warp*/ + const Index warpVectorId = warpLaneId / groupSize; /*vector index in the warp*/ + + __shared__ volatile Index space[MAX_NUM_VECTORS_PER_BLOCK][2]; + + /*get the row index*/ + if (warpLaneId == 0) { + row = atomicAdd(rowCnt, 32 / groupSize); + } + /*broadcast the value to other threads in the same warp and compute the row index of each vector*/ + row = __shfl(row, 0) + warpVectorId; - while (true) { + /*check the row range*/ + while (row < rows) { - /* Get row number */ - if (inGroupID == 0) row = atomicAdd(rowCnt, 1); + /*use two threads to fetch the row offset*/ + if (laneId < 2) { + space[vectorId][laneId] = rowPointers[row + laneId]; + } + rowStart = space[vectorId][0]; + rowEnd = space[vectorId][1]; - /* share row number in group */ - row = __shfl_sync(0xFFFFFFFF, row, groupID * groupSize); - if (row >= rows) - return; + /*there are non-zero elements in the current row*/ + sum = 0; + /*compute dot product*/ + if (groupSize == 32) { - maxID = rowPointers[row + 1]; + /*ensure aligned memory access*/ + i = rowStart - (rowStart & (groupSize - 1)) + laneId; - result = 0.0; - for (i = rowPointers[row] + inGroupID; i < maxID; i += groupSize) { - // const Index column = columnIndexes[i]; - // if (column >= getColumns) - // break; + /*process the unaligned part*/ + if (i >= rowStart && i < rowEnd) { + sum += values[i] * inVector[columnIndexes[i]]; + } - result += values[i] * inVector[columnIndexes[i]]; + /*process the aligned part*/ + for (i += groupSize; i < rowEnd; i += groupSize) { + sum += values[i] * inVector[columnIndexes[i]]; + } + } else { + /*regardless of the global memory access alignment*/ + for (i = rowStart + laneId; i < rowEnd; i += + groupSize) { + sum += values[i] * inVector[columnIndexes[i]]; + } + } + /*intra-vector reduction*/ + for (i = groupSize >> 1; i > 0; i >>= 1) { + sum += __shfl_down(sum, i, groupSize); } - /* Parallel reduction */ - for (i = groupSize >> 1; i > 0; i >>= 1) - result += __shfl_down_sync(0xFFFFFFFF, result, i); - /* Write result */ - if (inGroupID == 0) - outVector[row] = result; - } + /*save the results and get a new row*/ + if (laneId == 0) { + /*save the results*/ + outVector[row] = sum; + } + + /*get a new row index*/ + if(warpLaneId == 0){ + row = atomicAdd(rowCnt, 32 / groupSize); + } + /*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/ + row = __shfl(row, 0) + warpVectorId; + + }/*while*/ } template< typename Real, @@ -1112,17 +1081,12 @@ void SpMVCSRLightWithoutAtomic2( const Real *inVector, if (row >= rows) return; - const Index inGroupID = threadIdx.x % 2; + const Index inGroupID = threadIdx.x & 1; // & is cheaper than % const Index maxID = rowPointers[row + 1]; Real result = 0.0; - for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 2) { - // Index column = columnIndexes[i]; - // if (column >= getColumns) - // break; - + for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 2) result += values[i] * inVector[columnIndexes[i]]; - } /* Parallel reduction */ result += __shfl_down_sync(0xFFFFFFFF, result, 1); @@ -1147,17 +1111,12 @@ void SpMVCSRLightWithoutAtomic4( const Real *inVector, if (row >= rows) return; - const Index inGroupID = threadIdx.x % 4; + const Index inGroupID = threadIdx.x & 3; // & is cheaper than % const Index maxID = rowPointers[row + 1]; Real result = 0.0; - for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 4) { - // Index column = columnIndexes[i]; - // if (column >= getColumns) - // break; - + for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 4) result += values[i] * inVector[columnIndexes[i]]; - } /* Parallel reduction */ result += __shfl_down_sync(0xFFFFFFFF, result, 2); @@ -1184,17 +1143,12 @@ void SpMVCSRLightWithoutAtomic8( const Real *inVector, return; Index i; - const Index inGroupID = threadIdx.x % 8; + const Index inGroupID = threadIdx.x & 7; // & is cheaper than % const Index maxID = rowPointers[row + 1]; Real result = 0.0; - for (i = rowPointers[row] + inGroupID; i < maxID; i += 8) { - // column = columnIndexes[i]; - // if (column >= getColumns) - // break; - + for (i = rowPointers[row] + inGroupID; i < maxID; i += 8) result += values[i] * inVector[columnIndexes[i]]; - } /* Parallel reduction */ result += __shfl_down_sync(0xFFFFFFFF, result, 4); @@ -1223,17 +1177,12 @@ void SpMVCSRLightWithoutAtomic16( const Real *inVector, Index i; - const Index inGroupID = threadIdx.x % 16; + const Index inGroupID = threadIdx.x & 15; // & is cheaper than % const Index maxID = rowPointers[row + 1]; Real result = 0.0; - for (i = rowPointers[row] + inGroupID; i < maxID; i += 16) { - // column = columnIndexes[i]; - // if (column >= getColumns) - // break; - + for (i = rowPointers[row] + inGroupID; i < maxID; i += 16) result += values[i] * inVector[columnIndexes[i]]; - } /* Parallel reduction */ result += __shfl_down_sync(0xFFFFFFFF, result, 8); @@ -1327,7 +1276,6 @@ void SpMVCSRLightPrepare( const Real *inVector, const Index rows, const Index getColumns) { const Index threads = 1024; // max block size - Index groupSize; /* Copy rowCnt to GPU */ unsigned rowCnt = 0; unsigned *kernelRowCnt = nullptr; @@ -1341,15 +1289,18 @@ void SpMVCSRLightPrepare( const Real *inVector, const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row if (nnz <= 2) - groupSize = 2; + SpMVCSRLight<<>>( + inVector, + outVector, + rowPointers, + columnIndexes, + values, + rows, + getColumns, + kernelRowCnt + ); else if (nnz <= 4) - groupSize = 4; - else if (nnz <= 64) - groupSize = 8; - else - groupSize = 32; - - SpMVCSRLight<<>>( + SpMVCSRLight<<>>( inVector, outVector, rowPointers, @@ -1357,9 +1308,30 @@ void SpMVCSRLightPrepare( const Real *inVector, values, rows, getColumns, - groupSize, kernelRowCnt - ); + ); + else if (nnz <= 64) + SpMVCSRLight<<>>( + inVector, + outVector, + rowPointers, + columnIndexes, + values, + rows, + getColumns, + kernelRowCnt + ); + else + SpMVCSRLight<<>>( + inVector, + outVector, + rowPointers, + columnIndexes, + values, + rows, + getColumns, + kernelRowCnt + ); cudaFree(kernelRowCnt); } -- GitLab From f10e5072b6f70938a4f68a083fe77373b6e4ea89 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Fri, 10 Jul 2020 19:25:33 +0200 Subject: [PATCH 35/57] Code cleaning --- src/TNL/Matrices/Legacy/CSR.h | 14 +- src/TNL/Matrices/Legacy/CSR_impl.h | 266 ++++++++++++----------------- 2 files changed, 117 insertions(+), 163 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 439139e3e..82a661021 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -15,7 +15,6 @@ #include #include -#include // vector for blocks namespace TNL { namespace Matrices { @@ -28,15 +27,16 @@ enum class Type { VECTOR = 2 }; +template union Block { - Block(uint32_t row, Type type = Type::VECTOR, uint32_t index = 0) noexcept { + Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { this->index[0] = row; this->index[1] = index; - this->byte[7] = (uint8_t)type; + this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; } - unsigned index[2]; // index[0] is row pointer, index[1] is index in warp - uint8_t byte[8]; // byte[7] is type specificator + Index index[2]; // index[0] is row pointer, index[1] is index in warp + uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator }; #ifdef HAVE_UMFPACK @@ -50,7 +50,7 @@ class CusparseCSR; template< typename Device > class CSRDeviceDependentCode; -enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, +enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRLight2, CSRAdaptive, CSRMultiVector, CSRLightWithoutAtomic }; template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar > @@ -86,7 +86,7 @@ public: //enum SPMVCudaKernel { scalar, vector, hybrid }; - Containers::Vector< Block, Device, Index > blocks; + Containers::Vector< Block, Device, Index > blocks; Index maxElementsPerWarp = 1024; diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 100fdad18..21514b7d3 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -161,7 +161,7 @@ void CSR< Real, Device, Index, KernelType >::setBlocks() Index sum, start = 0, nextStart = 0; /* Fill blocks */ - std::vector inBlock; + std::vector> inBlock; inBlock.reserve(rows); // reserve space to avoid reallocation while (nextStart != rows - 1) { @@ -170,8 +170,8 @@ void CSR< Real, Device, Index, KernelType >::setBlocks() start, 384, *this, rows, this->maxElementsPerWarp, type, sum ); if (type == Type::LONG) { - uint32_t parts = roundUpDivision(sum, 384); - for (uint32_t index = 0; index < parts; ++index) { + Index parts = roundUpDivision(sum, 384); + for (Index index = 0; index < parts; ++index) { inBlock.emplace_back(start, Type::LONG, index); } } else { @@ -810,9 +810,8 @@ void SpMVCSRAdaptive( const Real *inVector, const Index* rowPointers, const Index* columnIndexes, const Real* values, - const Block *blocks, + const Block *blocks, Index blocksSize, - Index getColumns, Index gridID) { __shared__ Real shared_res[49152/sizeof(Real)]; const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; @@ -820,12 +819,12 @@ void SpMVCSRAdaptive( const Real *inVector, if (blockIdx >= blocksSize) return; - Block block = blocks[blockIdx]; + Block block = blocks[blockIdx]; Real result = 0.0; const Index laneID = threadIdx.x & 31; // & is cheaper than % const Index minID = rowPointers[block.index[0]/* minRow */]; Index i, to, offset, maxID; - if (block.byte[7] == 1) { + if (block.byte[sizeof(Index) == 4 ? 7 : 15] == 1) { /////////////////////////////////////* CSR STREAM *////////////// const Index maxRow = blocks[blockIdx + 1].index[0]; maxID = rowPointers[maxRow]; @@ -845,7 +844,7 @@ void SpMVCSRAdaptive( const Real *inVector, outVector[i] = result; // Write result } - } else if (block.byte[7] == 2) { + } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] == 2) { /////////////////////////////////////* CSR VECTOR *////////////// maxID = rowPointers[block.index[0]/* minRow */ + 1]; @@ -888,7 +887,6 @@ void SpMVCSRScalar( const Real *inVector, const Index* columnIndexes, const Real* values, const Index rows, - const Index getColumns, const Index gridID) { const Index row = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; if (row >= rows) @@ -913,7 +911,6 @@ void SpMVCSRMultiVector( const Real *inVector, const Index* columnIndexes, const Real* values, const Index rows, - const Index getColumns, const Index warps, // warps per row const Index gridID) { @@ -954,7 +951,6 @@ void SpMVCSRVector( const Real *inVector, const Index* columnIndexes, const Real* values, const Index rows, - const Index getColumns, const Index gridID) { const Index warpID = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; @@ -990,7 +986,6 @@ void SpMVCSRLight( const Real *inVector, const Index* columnIndexes, const Real* values, const Index rows, - const Index getColumns, unsigned *rowCnt) { Index i; Real sum; @@ -1008,7 +1003,7 @@ void SpMVCSRLight( const Real *inVector, row = atomicAdd(rowCnt, 32 / groupSize); } /*broadcast the value to other threads in the same warp and compute the row index of each vector*/ - row = __shfl(row, 0) + warpVectorId; + row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId; /*check the row range*/ while (row < rows) { @@ -1046,7 +1041,7 @@ void SpMVCSRLight( const Real *inVector, } /*intra-vector reduction*/ for (i = groupSize >> 1; i > 0; i >>= 1) { - sum += __shfl_down(sum, i, groupSize); + sum += __shfl_down_sync(0xFFFFFFFF, sum, i); } /*save the results and get a new row*/ @@ -1060,7 +1055,7 @@ void SpMVCSRLight( const Real *inVector, row = atomicAdd(rowCnt, 32 / groupSize); } /*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/ - row = __shfl(row, 0) + warpVectorId; + row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId; }/*while*/ } @@ -1074,7 +1069,6 @@ void SpMVCSRLightWithoutAtomic2( const Real *inVector, const Index* columnIndexes, const Real* values, const Index rows, - const Index getColumns, const Index gridID) { const Index row = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 2; @@ -1104,7 +1098,6 @@ void SpMVCSRLightWithoutAtomic4( const Real *inVector, const Index* columnIndexes, const Real* values, const Index rows, - const Index getColumns, const Index gridID) { const Index row = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 4; @@ -1135,7 +1128,6 @@ void SpMVCSRLightWithoutAtomic8( const Real *inVector, const Index* columnIndexes, const Real* values, const Index rows, - const Index getColumns, const Index gridID) { const Index row = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 8; @@ -1168,7 +1160,6 @@ void SpMVCSRLightWithoutAtomic16( const Real *inVector, const Index* columnIndexes, const Real* values, const Index rows, - const Index getColumns, const Index gridID) { const Index row = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 16; @@ -1195,16 +1186,14 @@ void SpMVCSRLightWithoutAtomic16( const Real *inVector, } template< typename Real, - typename Index > + typename Index, + typename Device, + CSRKernel KernelType> void SpMVCSRScalarPrepare( const Real *inVector, Real* outVector, - const Index* rowPointers, - const Index* columnIndexes, - const Real* values, - const Index rows, - const Index getColumns) { + const CSR< Real, Device, Index, KernelType >& matrix) { const Index threads = 1024; // block size - size_t neededThreads = rows; + size_t neededThreads = matrix.getRowPointers().getSize() - 1; Index blocks; /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { @@ -1219,11 +1208,10 @@ void SpMVCSRScalarPrepare( const Real *inVector, SpMVCSRScalar<<>>( inVector, outVector, - rowPointers, - columnIndexes, - values, - rows, - getColumns, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getRowPointers().getSize() - 1, grid ); } @@ -1231,16 +1219,14 @@ void SpMVCSRScalarPrepare( const Real *inVector, template< typename Real, typename Index, + typename Device, + CSRKernel KernelType, int warpSize > void SpMVCSRVectorPrepare( const Real *inVector, Real* outVector, - const Index* rowPointers, - const Index* columnIndexes, - const Real* values, - const Index rows, - const Index getColumns) { + const CSR< Real, Device, Index, KernelType >& matrix) { const Index threads = 1024; // block size - size_t neededThreads = rows * warpSize; + size_t neededThreads = matrix.getRowPointers().getSize() * warpSize; Index blocks; /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { @@ -1255,81 +1241,75 @@ void SpMVCSRVectorPrepare( const Real *inVector, SpMVCSRVector<<>>( inVector, outVector, - rowPointers, - columnIndexes, - values, - rows, - getColumns, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getRowPointers().getSize() - 1, grid ); } } template< typename Real, - typename Index > + typename Index, + typename Device, + CSRKernel KernelType, + int warpSize > void SpMVCSRLightPrepare( const Real *inVector, Real* outVector, - const Index* rowPointers, - const Index* columnIndexes, - const Real* values, - const Index valuesSize, - const Index rows, - const Index getColumns) { + const CSR< Real, Device, Index, KernelType >& matrix) { const Index threads = 1024; // max block size + const Index rows = matrix.getRowPointers().getSize() - 1; /* Copy rowCnt to GPU */ unsigned rowCnt = 0; unsigned *kernelRowCnt = nullptr; cudaMalloc((void **)&kernelRowCnt, sizeof(*kernelRowCnt)); cudaMemcpy(kernelRowCnt, &rowCnt, sizeof(*kernelRowCnt), cudaMemcpyHostToDevice); - + /* Get info about GPU */ cudaDeviceProp properties; cudaGetDeviceProperties( &properties, Cuda::DeviceInfo::getActiveDevice() ); - Index blocks = + const Index blocks = properties.multiProcessorCount * properties.maxThreadsPerMultiProcessor / threads; - const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row + const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row if (nnz <= 2) SpMVCSRLight<<>>( inVector, outVector, - rowPointers, - columnIndexes, - values, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), rows, - getColumns, kernelRowCnt ); else if (nnz <= 4) SpMVCSRLight<<>>( inVector, outVector, - rowPointers, - columnIndexes, - values, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), rows, - getColumns, kernelRowCnt ); else if (nnz <= 64) SpMVCSRLight<<>>( inVector, outVector, - rowPointers, - columnIndexes, - values, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), rows, - getColumns, kernelRowCnt ); else SpMVCSRLight<<>>( inVector, outVector, - rowPointers, - columnIndexes, - values, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), rows, - getColumns, kernelRowCnt ); @@ -1338,21 +1318,19 @@ void SpMVCSRLightPrepare( const Real *inVector, template< typename Real, typename Index, + typename Device, + CSRKernel KernelType, int warpSize, int maxElemPerWarp > void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, Real* outVector, - const Index* rowPointers, - const Index* columnIndexes, - const Real* values, - const Index valuesSize, - const Index rows, - const Index getColumns) { + const CSR< Real, Device, Index, KernelType >& matrix) { + const Index rows = matrix.getRowPointers().getSize() - 1; const Index threads = 1024; // block size size_t neededThreads = rows * warpSize; Index blocks, groupSize; - const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row + const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row if (nnz <= 2) groupSize = 2; else if (nnz <= 4) @@ -1379,33 +1357,51 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, if (groupSize == 2) { SpMVCSRLightWithoutAtomic2<<>>( - inVector, outVector, rowPointers, columnIndexes, values, - rows, getColumns, grid + inVector, outVector, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + rows, grid ); } else if (groupSize == 4) { SpMVCSRLightWithoutAtomic4<<>>( - inVector, outVector, rowPointers, columnIndexes, values, - rows, getColumns, grid + inVector, outVector, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + rows, grid ); } else if (groupSize == 8) { SpMVCSRLightWithoutAtomic8<<>>( - inVector, outVector, rowPointers, columnIndexes, values, - rows, getColumns, grid + inVector, outVector, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + rows, grid ); } else if (groupSize == 16) { SpMVCSRLightWithoutAtomic16<<>>( - inVector, outVector, rowPointers, columnIndexes, values, - rows, getColumns, grid + inVector, outVector, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + rows, grid ); } else if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector SpMVCSRVector<<>>( - inVector, outVector, rowPointers, columnIndexes, values, - rows, getColumns, grid + inVector, outVector, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + rows, grid ); } else { // Execute CSR MultiVector SpMVCSRMultiVector<<>>( - inVector, outVector, rowPointers, columnIndexes, values, - rows, getColumns, groupSize / 32, grid + inVector, outVector, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + rows, groupSize / 32, grid ); } } @@ -1413,22 +1409,18 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, template< typename Real, typename Index, + typename Device, + CSRKernel KernelType, int warpSize, - int maxElemPerWarp> + int maxElemPerWarp > void SpMVCSRMultiVectorPrepare( const Real *inVector, Real* outVector, - const Index* rowPointers, - const Index* columnIndexes, - const Real* values, - const Index valuesSize, - const Index rows, - const Index getColumns) { - /* Configuration */ - //---------------------------------------------------------------------------------- + const CSR< Real, Device, Index, KernelType >& matrix) { + const Index rows = matrix.getRowPointers().getSize() - 1; const Index threads = 1024; // block size Index blocks; - const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row + const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row const Index neededWarps = roundUpDivision(nnz, maxElemPerWarp); // warps per row size_t neededThreads = warpSize * neededWarps * rows; /* Execute kernels on device */ @@ -1445,22 +1437,20 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, SpMVCSRVector<<>>( inVector, outVector, - rowPointers, - columnIndexes, - values, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), rows, - getColumns, grid ); } else { SpMVCSRMultiVector<<>>( inVector, outVector, - rowPointers, - columnIndexes, - values, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), rows, - getColumns, neededWarps, grid ); @@ -1476,13 +1466,7 @@ template< typename Real, int maxElemPerWarp > void SpMVCSRAdaptivePrepare( const Real *inVector, Real* outVector, - const CSR< Real, Device, Index, KernelType >& matrix, - const Index* rowPointers, - const Index* columnIndexes, - const Real* values, - const Index valuesSize, - const Index rows, - const Index getColumns) { + const CSR< Real, Device, Index, KernelType >& matrix) { /* Configuration ---------------------------------------------------*/ /* Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache 512 threads per block for double (12 elements per thread) */ @@ -1508,12 +1492,11 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, SpMVCSRAdaptive<<>>( inVector, outVector, - rowPointers, - columnIndexes, - values, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), matrix.blocks.getData(), matrix.blocks.getSize() - 1, // last block shouldn't be used - getColumns, grid ); } @@ -1672,74 +1655,45 @@ class CSRDeviceDependentCode< Devices::Cuda > switch(KernelType) { case CSRScalar: - SpMVCSRScalarPrepare( + SpMVCSRScalarPrepare( inVector.getData(), outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() + matrix ); break; case CSRVector: - SpMVCSRVectorPrepare( + SpMVCSRVectorPrepare( inVector.getData(), outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() + matrix ); break; case CSRLight: - SpMVCSRLightPrepare( + SpMVCSRLightPrepare( inVector.getData(), outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getValues().getSize(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() + matrix ); break; case CSRAdaptive: SpMVCSRAdaptivePrepare( inVector.getData(), outVector.getData(), - matrix, - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getValues().getSize(), - matrix.getRowPointers().getSize(), // don't add -1 ! - matrix.getColumns() + matrix ); break; case CSRMultiVector: - SpMVCSRMultiVectorPrepare( + SpMVCSRMultiVectorPrepare( inVector.getData(), outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getValues().getSize(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() + matrix ); break; case CSRLightWithoutAtomic: - SpMVCSRLightWithoutAtomicPrepare( + SpMVCSRLightWithoutAtomicPrepare( inVector.getData(), outVector.getData(), - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getValues().getSize(), - matrix.getRowPointers().getSize() - 1, - matrix.getColumns() + matrix ); break; } -- GitLab From 93c4dd2dd8b32afbc45dd8b3b950632cb91adcc0 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Fri, 10 Jul 2020 22:41:10 +0200 Subject: [PATCH 36/57] Added different versions of CSR Light --- src/TNL/Matrices/Legacy/CSR.h | 3 +- src/TNL/Matrices/Legacy/CSR_impl.h | 509 ++++++++++++++++++++++------- 2 files changed, 386 insertions(+), 126 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 82a661021..9f7d50e5c 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -50,7 +50,8 @@ class CusparseCSR; template< typename Device > class CSRDeviceDependentCode; -enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRLight2, +enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, + CSRLight, CSRLight2, CSRLight3, CSRLight4, CSRLight5, CSRLight6, CSRAdaptive, CSRMultiVector, CSRLightWithoutAtomic }; template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar > diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 21514b7d3..c4cca1564 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -987,10 +987,8 @@ void SpMVCSRLight( const Real *inVector, const Real* values, const Index rows, unsigned *rowCnt) { - Index i; Real sum; - Index row; - Index rowStart, rowEnd; + Index row, i, rowStart, rowEnd; const Index laneId = threadIdx.x % groupSize; /*lane index in the vector*/ const Index vectorId = threadIdx.x / groupSize; /*vector index in the thread block*/ const Index warpLaneId = threadIdx.x & 31; /*lane index in the warp*/ @@ -1009,9 +1007,9 @@ void SpMVCSRLight( const Real *inVector, while (row < rows) { /*use two threads to fetch the row offset*/ - if (laneId < 2) { + if (laneId < 2) space[vectorId][laneId] = rowPointers[row + laneId]; - } + rowStart = space[vectorId][0]; rowEnd = space[vectorId][1]; @@ -1024,42 +1022,189 @@ void SpMVCSRLight( const Real *inVector, i = rowStart - (rowStart & (groupSize - 1)) + laneId; /*process the unaligned part*/ - if (i >= rowStart && i < rowEnd) { + if (i >= rowStart && i < rowEnd) sum += values[i] * inVector[columnIndexes[i]]; - } - /*process the aligned part*/ - for (i += groupSize; i < rowEnd; i += groupSize) { + /*process the aligned part*/ + for (i += groupSize; i < rowEnd; i += groupSize) sum += values[i] * inVector[columnIndexes[i]]; - } } else { /*regardless of the global memory access alignment*/ - for (i = rowStart + laneId; i < rowEnd; i += - groupSize) { + for (i = rowStart + laneId; i < rowEnd; i += groupSize) sum += values[i] * inVector[columnIndexes[i]]; - } } /*intra-vector reduction*/ - for (i = groupSize >> 1; i > 0; i >>= 1) { + for (i = groupSize >> 1; i > 0; i >>= 1) sum += __shfl_down_sync(0xFFFFFFFF, sum, i); - } /*save the results and get a new row*/ - if (laneId == 0) { - /*save the results*/ + if (laneId == 0) outVector[row] = sum; - } /*get a new row index*/ - if(warpLaneId == 0){ + if(warpLaneId == 0) row = atomicAdd(rowCnt, 32 / groupSize); + + /*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/ + row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId; + + }/*while*/ +} + +/* Original CSR Light without shared memory */ +template< typename Real, + typename Index, + int groupSize > +__global__ +void SpMVCSRLight2( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + unsigned *rowCnt) { + Real sum; + Index i, rowStart, rowEnd, row; + const Index laneId = threadIdx.x % groupSize; /*lane index in the vector*/ + const Index warpLaneId = threadIdx.x & 31; /*lane index in the warp*/ + const Index warpVectorId = warpLaneId / groupSize; /*vector index in the warp*/ + + /*get the row index*/ + if (warpLaneId == 0) + row = atomicAdd(rowCnt, 32 / groupSize); + + /*broadcast the value to other threads in the same warp and compute the row index of each vector*/ + row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId; + + /*check the row range*/ + while (row < rows) { + + rowStart = rowPointers[row]; + rowEnd = rowPointers[row + 1]; + + /*there are non-zero elements in the current row*/ + sum = 0; + /*compute dot product*/ + if (groupSize == 32) { + + /*ensure aligned memory access*/ + i = rowStart - (rowStart & (groupSize - 1)) + laneId; + + /*process the unaligned part*/ + if (i >= rowStart && i < rowEnd) + sum += values[i] * inVector[columnIndexes[i]]; + + /*process the aligned part*/ + for (i += groupSize; i < rowEnd; i += groupSize) + sum += values[i] * inVector[columnIndexes[i]]; + } else { + /*regardless of the global memory access alignment*/ + for (i = rowStart + laneId; i < rowEnd; i += groupSize) + sum += values[i] * inVector[columnIndexes[i]]; } + /*intra-vector reduction*/ + for (i = groupSize >> 1; i > 0; i >>= 1) + sum += __shfl_down_sync(0xFFFFFFFF, sum, i); + + /*save the results and get a new row*/ + if (laneId == 0) + outVector[row] = sum; + + /*get a new row index*/ + if(warpLaneId == 0) + row = atomicAdd(rowCnt, 32 / groupSize); + + /*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/ + row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId; + + }/*while*/ +} + +/* Original CSR Light without shared memory and allign memory access */ +template< typename Real, + typename Index, + int groupSize > +__global__ +void SpMVCSRLight3( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + unsigned *rowCnt) { + Real sum; + Index i, rowEnd, row; + const Index laneId = threadIdx.x % groupSize; /*lane index in the vector*/ + const Index warpLaneId = threadIdx.x & 31; /*lane index in the warp*/ + const Index warpVectorId = warpLaneId / groupSize; /*vector index in the warp*/ + + /*get the row index*/ + if (warpLaneId == 0) + row = atomicAdd(rowCnt, 32 / groupSize); + + /*broadcast the value to other threads in the same warp and compute the row index of each vector*/ + row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId; + + /*check the row range*/ + while (row < rows) { + sum = 0; + + /*compute dot product*/ + rowEnd = rowPointers[row + 1]; + for (i = rowPointers[row] + laneId; i < rowEnd; i += groupSize) + sum += values[i] * inVector[columnIndexes[i]]; + + /*intra-vector reduction*/ + for (i = groupSize >> 1; i > 0; i >>= 1) + sum += __shfl_down_sync(0xFFFFFFFF, sum, i); + + /*save the results and get a new row*/ + if (laneId == 0) + outVector[row] = sum; + + /*get a new row index*/ + if(warpLaneId == 0) + row = atomicAdd(rowCnt, 32 / groupSize); + /*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/ row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId; }/*while*/ } +/* Original CSR Light without shared memory, allign memory access and atomic instructions */ +template< typename Real, + typename Index, + int groupSize > +__global__ +void SpMVCSRLight4( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index gridID) { + const Index row = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / groupSize; + if (row >= rows) + return; + + Real sum = 0; + Index i; + const Index laneId = threadIdx.x & (groupSize - 1); /*lane index in the group*/ + + /*compute dot product*/ + const Index rowEnd = rowPointers[row + 1]; + for (i = rowPointers[row] + laneId; i < rowEnd; i += groupSize) + sum += values[i] * inVector[columnIndexes[i]]; + + /*intra-vector reduction*/ + for (i = groupSize >> 1; i > 0; i >>= 1) + sum += __shfl_down_sync(0xFFFFFFFF, sum, i); + + /*save the results and get a new row*/ + if (laneId == 0) outVector[row] = sum; +} + template< typename Real, typename Index> __global__ @@ -1272,46 +1417,113 @@ void SpMVCSRLightPrepare( const Real *inVector, properties.multiProcessorCount * properties.maxThreadsPerMultiProcessor / threads; const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row - if (nnz <= 2) - SpMVCSRLight<<>>( - inVector, - outVector, - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - rows, - kernelRowCnt - ); - else if (nnz <= 4) - SpMVCSRLight<<>>( - inVector, - outVector, - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - rows, - kernelRowCnt - ); - else if (nnz <= 64) - SpMVCSRLight<<>>( - inVector, - outVector, - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - rows, - kernelRowCnt - ); - else - SpMVCSRLight<<>>( - inVector, - outVector, - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - rows, - kernelRowCnt - ); + if (KernelType == CSRLight) { //----------------------------------------- + if (nnz <= 2) + SpMVCSRLight<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 4) + SpMVCSRLight<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 64) + SpMVCSRLight<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else + SpMVCSRLight<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + } else if(KernelType == CSRLight2) { //----------------------------------------- + if (nnz <= 2) + SpMVCSRLight2<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 4) + SpMVCSRLight2<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 64) + SpMVCSRLight2<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else + SpMVCSRLight2<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + } else if(KernelType == CSRLight3) { //----------------------------------------- + if (nnz <= 2) + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 4) + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 64) + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + } else if(KernelType == CSRLight6) { //----------------------------------------- + if (nnz <= 2) + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 4) + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 8) + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 16) + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + } cudaFree(kernelRowCnt); } @@ -1355,54 +1567,108 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, neededThreads -= MAX_X_DIM * threads; } - if (groupSize == 2) { - SpMVCSRLightWithoutAtomic2<<>>( - inVector, outVector, - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - rows, grid - ); - } else if (groupSize == 4) { - SpMVCSRLightWithoutAtomic4<<>>( - inVector, outVector, - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - rows, grid - ); - } else if (groupSize == 8) { - SpMVCSRLightWithoutAtomic8<<>>( - inVector, outVector, - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - rows, grid - ); - } else if (groupSize == 16) { - SpMVCSRLightWithoutAtomic16<<>>( - inVector, outVector, - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - rows, grid - ); - } else if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector - SpMVCSRVector<<>>( - inVector, outVector, - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - rows, grid - ); - } else { // Execute CSR MultiVector - SpMVCSRMultiVector<<>>( - inVector, outVector, - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - rows, groupSize / 32, grid - ); + if (KernelType == CSRLightWithoutAtomic) { //----------------------------------------- + if (groupSize == 2) { + SpMVCSRLightWithoutAtomic2<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 4) { + SpMVCSRLightWithoutAtomic4<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 8) { + SpMVCSRLightWithoutAtomic8<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 16) { + SpMVCSRLightWithoutAtomic16<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector + SpMVCSRVector<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else { // Execute CSR MultiVector + SpMVCSRMultiVector<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, groupSize / 32, grid + ); + } + } else if (KernelType == CSRLight5) { //----------------------------------------- + if (groupSize == 2) { + SpMVCSRLightWithoutAtomic2<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 4) { + SpMVCSRLightWithoutAtomic4<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 8) { + SpMVCSRLightWithoutAtomic8<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 16) { + SpMVCSRLightWithoutAtomic16<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else { // CSR SpMV Light with groupsize = 32 is CSR Vector + SpMVCSRVector<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } + } else if (KernelType == CSRLight4) { //----------------------------------------- + if (groupSize == 2) { + SpMVCSRLight4<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 4) { + SpMVCSRLight4<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 8) { + SpMVCSRLight4<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 16) { + SpMVCSRLight4<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else { // CSR SpMV Light with groupsize = 32 is CSR Vector + SpMVCSRVector<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } //----------------------------------------- } } } @@ -1656,44 +1922,37 @@ class CSRDeviceDependentCode< Devices::Cuda > { case CSRScalar: SpMVCSRScalarPrepare( - inVector.getData(), - outVector.getData(), - matrix + inVector.getData(), outVector.getData(), matrix ); break; case CSRVector: SpMVCSRVectorPrepare( - inVector.getData(), - outVector.getData(), - matrix + inVector.getData(), outVector.getData(), matrix ); break; case CSRLight: + case CSRLight2: + case CSRLight3: + case CSRLight6: SpMVCSRLightPrepare( - inVector.getData(), - outVector.getData(), - matrix + inVector.getData(), outVector.getData(), matrix ); break; case CSRAdaptive: SpMVCSRAdaptivePrepare( - inVector.getData(), - outVector.getData(), - matrix + inVector.getData(), outVector.getData(), matrix ); break; case CSRMultiVector: SpMVCSRMultiVectorPrepare( - inVector.getData(), - outVector.getData(), - matrix + inVector.getData(), outVector.getData(), matrix ); break; + case CSRLight4: + case CSRLight5: case CSRLightWithoutAtomic: SpMVCSRLightWithoutAtomicPrepare( - inVector.getData(), - outVector.getData(), - matrix + inVector.getData(), outVector.getData(), matrix ); break; } -- GitLab From 4f3f609209c2b270a729700137464d20021d5199 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Sat, 11 Jul 2020 00:01:35 +0200 Subject: [PATCH 37/57] Added different versions of CSR Light to script and benchmark --- src/Benchmarks/SpMV/spmv-legacy.h | 20 +++++++ .../scripts/tnl-spmv-benchmark-make-tables.py | 55 +++++++++++++++++++ src/TNL/Matrices/Legacy/CSR_impl.h | 6 +- src/TNL/Matrices/MatrixInfo.h | 40 ++++++++++++++ 4 files changed, 120 insertions(+), 1 deletion(-) diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h index 30f702ae1..838165039 100644 --- a/src/Benchmarks/SpMV/spmv-legacy.h +++ b/src/Benchmarks/SpMV/spmv-legacy.h @@ -85,6 +85,21 @@ using SparseMatrixLegacy_CSR_Vector = Matrices::Legacy::CSR< Real, Device, Index template< typename Real, typename Device, typename Index > using SparseMatrixLegacy_CSR_Light = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight >; +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Light2 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight2 >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Light3 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight3 >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Light4 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight4 >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Light5 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight5 >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Light6 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight6 >; + template< typename Real, typename Device, typename Index > using SparseMatrixLegacy_CSR_Adaptive = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRAdaptive >; @@ -297,6 +312,11 @@ benchmarkSpmvSynthetic( Benchmark& benchmark, benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light2 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light3 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light4 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light5 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light6 >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrixLegacy_CSR_MultiVector>( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic>( benchmark, hostOutVector, inputFileName, verboseMR ); diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py index a11a40a08..b88cac8c8 100755 --- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py +++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py @@ -87,6 +87,11 @@ df["BiEllpacky", "GPU", "cuSparse speedup"] = df["BiEllpack", df["CSR", "GPU", "cuSparse speedup"] = df["CSR", "GPU", "time"] / df["cuSparse", "GPU", "time"] df["CSR Legacy Adaptive", "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive", "GPU", "time"] / df["cuSparse", "GPU", "time"] df["CSR Legacy Light", "GPU", "cuSparse speedup"] = df["CSR Legacy Light", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Light2", "GPU", "cuSparse speedup"] = df["CSR Legacy Light2", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Light3", "GPU", "cuSparse speedup"] = df["CSR Legacy Light3", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Light4", "GPU", "cuSparse speedup"] = df["CSR Legacy Light4", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Light5", "GPU", "cuSparse speedup"] = df["CSR Legacy Light5", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Light6", "GPU", "cuSparse speedup"] = df["CSR Legacy Light6", "GPU", "time"] / df["cuSparse", "GPU", "time"] df["CSR Legacy LightWithoutAtomic", "GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic", "GPU", "time"] / df["cuSparse", "GPU", "time"] df["CSR Legacy Scalar", "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar", "GPU", "time"] / df["cuSparse", "GPU", "time"] df["CSR Legacy Vector", "GPU", "cuSparse speedup"] = df["CSR Legacy Vector", "GPU", "time"] / df["cuSparse", "GPU", "time"] @@ -121,6 +126,11 @@ print( "Preparing data for graph analysis..." ) df['cuSparse-bandwidth' ] = df[ 'cuSparse','GPU','bandwidth'] df['csr-legacy-adaptive-bandwidth' ] = df[ 'CSR Legacy Adaptive','GPU','bandwidth'] df['csr-legacy-light-bandwidth' ] = df[ 'CSR Legacy Light','GPU','bandwidth'] +df['csr-legacy-light2-bandwidth' ] = df[ 'CSR Legacy Light2','GPU','bandwidth'] +df['csr-legacy-light3-bandwidth' ] = df[ 'CSR Legacy Light3','GPU','bandwidth'] +df['csr-legacy-light4-bandwidth' ] = df[ 'CSR Legacy Light4','GPU','bandwidth'] +df['csr-legacy-light5-bandwidth' ] = df[ 'CSR Legacy Light5','GPU','bandwidth'] +df['csr-legacy-light6-bandwidth' ] = df[ 'CSR Legacy Light6','GPU','bandwidth'] df['csr-legacy-light-without-atomic-bandwidth' ] = df[ 'CSR Legacy LightWithoutAtomic','GPU','bandwidth'] df['csr-legacy-scalar-bandwidth' ] = df[ 'CSR Legacy Scalar','GPU','bandwidth'] df['csr-legacy-vector-bandwidth' ] = df[ 'CSR Legacy Vector','GPU','bandwidth'] @@ -135,6 +145,11 @@ df.sort_values(by=["cuSparse-bandwidth"],inplace=True,ascending=False) cuSparse_list = df['cuSparse-bandwidth'].tolist() cuSparse_csr_legacy_adaptive_gpu_list = df[ "CSR Legacy Adaptive", "GPU", "bandwidth"].tolist(); cuSparse_csr_legacy_light_gpu_list = df[ "CSR Legacy Light", "GPU", "bandwidth"].tolist(); +cuSparse_csr_legacy_light2_gpu_list = df[ "CSR Legacy Light2", "GPU", "bandwidth"].tolist(); +cuSparse_csr_legacy_light3_gpu_list = df[ "CSR Legacy Light3", "GPU", "bandwidth"].tolist(); +cuSparse_csr_legacy_light4_gpu_list = df[ "CSR Legacy Light4", "GPU", "bandwidth"].tolist(); +cuSparse_csr_legacy_light5_gpu_list = df[ "CSR Legacy Light5", "GPU", "bandwidth"].tolist(); +cuSparse_csr_legacy_light6_gpu_list = df[ "CSR Legacy Light6", "GPU", "bandwidth"].tolist(); cuSparse_csr_legacy_light_without_atomic_gpu_list = df[ "CSR Legacy LightWithoutAtomic", "GPU", "bandwidth"].tolist(); cuSparse_csr_legacy_scalar_gpu_list = df[ "CSR Legacy Scalar", "GPU", "bandwidth"].tolist(); cuSparse_csr_legacy_vector_gpu_list = df[ "CSR Legacy Vector", "GPU", "bandwidth"].tolist(); @@ -179,6 +194,11 @@ for x in cuSparse_list: if str( x ) != "nan": if ( str( cuSparse_csr_legacy_adaptive_gpu_list[ i ] ) != "nan" and str( cuSparse_csr_legacy_light_gpu_list[ i ] ) != "nan" and + str( cuSparse_csr_legacy_light2_gpu_list[ i ] ) != "nan" and + str( cuSparse_csr_legacy_light3_gpu_list[ i ] ) != "nan" and + str( cuSparse_csr_legacy_light4_gpu_list[ i ] ) != "nan" and + str( cuSparse_csr_legacy_light5_gpu_list[ i ] ) != "nan" and + str( cuSparse_csr_legacy_light6_gpu_list[ i ] ) != "nan" and str( cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ] ) != "nan" and str( cuSparse_csr_legacy_scalar_gpu_list[ i ] ) != "nan" and str( cuSparse_csr_legacy_vector_gpu_list[ i ] ) != "nan" and @@ -194,6 +214,11 @@ for x in cuSparse_list: cuSparse_file.write( f"{i+1} {x} " ) # 1 2 cuSparse_file.write( f"{cuSparse_csr_legacy_adaptive_gpu_list[ i ]} " ) # 3 cuSparse_file.write( f"{cuSparse_csr_legacy_light_gpu_list[ i ]} " ) # 4 + cuSparse_file.write( f"{cuSparse_csr_legacy_light2_gpu_list[ i ]} " ) # 4 + cuSparse_file.write( f"{cuSparse_csr_legacy_light3_gpu_list[ i ]} " ) # 4 + cuSparse_file.write( f"{cuSparse_csr_legacy_light4_gpu_list[ i ]} " ) # 4 + cuSparse_file.write( f"{cuSparse_csr_legacy_light5_gpu_list[ i ]} " ) # 4 + cuSparse_file.write( f"{cuSparse_csr_legacy_light6_gpu_list[ i ]} " ) # 4 cuSparse_file.write( f"{cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ]} " ) # 5 cuSparse_file.write( f"{cuSparse_csr_legacy_scalar_gpu_list[ i ]} " ) # 6 cuSparse_file.write( f"{cuSparse_csr_legacy_vector_gpu_list[ i ]} " ) # 7 @@ -261,6 +286,31 @@ plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ 'cusparse.gplt' using 1:4 title 'CSR Legacy Light' with lines linewidth 0.5 lt rgb 'green', +set output 'csr-legacy-light2-vs-cusparse.eps' +plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ + 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ + 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:4 title 'CSR Legacy Light2' with lines linewidth 0.5 lt rgb 'green', +set output 'csr-legacy-light3-vs-cusparse.eps' +plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ + 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ + 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:4 title 'CSR Legacy Light3' with lines linewidth 0.5 lt rgb 'green', +set output 'csr-legacy-light4-vs-cusparse.eps' +plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ + 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ + 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:4 title 'CSR Legacy Light4' with lines linewidth 0.5 lt rgb 'green', +set output 'csr-legacy-light5-vs-cusparse.eps' +plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ + 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ + 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:4 title 'CSR Legacy Light5' with lines linewidth 0.5 lt rgb 'green', +set output 'csr-legacy-light6-vs-cusparse.eps' +plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ + 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ + 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:4 title 'CSR Legacy Light6' with lines linewidth 0.5 lt rgb 'green', set output 'csr-legacy-light-without-atomic-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ @@ -338,6 +388,11 @@ os.system( "gnuplot gnuplot.gplt" ) print( "Converting files to PDF ..." ) os.system( "epstopdf --autorotate All csr-legacy-adaptive-vs-cusparse.eps" ) os.system( "epstopdf --autorotate All csr-legacy-light-vs-cusparse.eps" ) +os.system( "epstopdf --autorotate All csr-legacy-light2-vs-cusparse.eps" ) +os.system( "epstopdf --autorotate All csr-legacy-light3-vs-cusparse.eps" ) +os.system( "epstopdf --autorotate All csr-legacy-light4-vs-cusparse.eps" ) +os.system( "epstopdf --autorotate All csr-legacy-light5-vs-cusparse.eps" ) +os.system( "epstopdf --autorotate All csr-legacy-light6-vs-cusparse.eps" ) os.system( "epstopdf --autorotate All csr-legacy-light-without-atomic-vs-cusparse.eps" ) os.system( "epstopdf --autorotate All csr-legacy-scalar-vs-cusparse.eps" ) os.system( "epstopdf --autorotate All csr-legacy-vector-vs-cusparse.eps" ) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index c4cca1564..8c53e59a5 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -1556,7 +1556,11 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, else groupSize = roundUpDivision(nnz, maxElemPerWarp) * 32; // CSR MultiVector - neededThreads = groupSize * rows; + if (KernelType == CSRLightWithoutAtomic) + neededThreads = groupSize * rows; + else + neededThreads = rows * (groupSize > 32 ? 32 : groupSize); + /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h index fa39bfdda..297981735 100644 --- a/src/TNL/Matrices/MatrixInfo.h +++ b/src/TNL/Matrices/MatrixInfo.h @@ -113,6 +113,46 @@ struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight > > static String getFormat() { return "CSR Legacy Light"; }; }; +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight2 > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Light2"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight3 > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Light3"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight4 > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Light4"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight5 > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Light5"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight6 > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Light5"; }; +}; + template< typename Real, typename Device, typename Index > struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRAdaptive > > { -- GitLab From 25fb5ea9bd5117ed7e6e3b8bc53078bf8d3da042 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Sat, 11 Jul 2020 01:10:16 +0200 Subject: [PATCH 38/57] Added copying of blocks in CSR matrix --- src/TNL/Matrices/Legacy/CSR_impl.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 8c53e59a5..0885711c3 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -113,7 +113,7 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompr this->columnIndexes.setSize( this->rowPointers.getElement( this->rows ) ); this->columnIndexes.setValue( this->columns ); - if (KernelType == CSRAdaptive) + if (KernelType == CSRAdaptive && this->blocks.empty()) this->setBlocks(); } @@ -665,6 +665,7 @@ CSR< Real, Device, Index, KernelType >::operator=( const CSR& matrix ) this->values = matrix.values; this->columnIndexes = matrix.columnIndexes; this->rowPointers = matrix.rowPointers; + this->blocks = matrix.blocks; return *this; } @@ -681,6 +682,7 @@ CSR< Real, Device, Index, KernelType >::operator=( const CSR< Real2, Device2, In this->values = matrix.values; this->columnIndexes = matrix.columnIndexes; this->rowPointers = matrix.rowPointers; + this->blocks = matrix.blocks; return *this; } -- GitLab From f73fa26856d1ac437abebcd3aeba94883ee45302 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Sat, 11 Jul 2020 01:14:37 +0200 Subject: [PATCH 39/57] Added default contructor for Block union --- src/TNL/Matrices/Legacy/CSR.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 9f7d50e5c..5fdec9646 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -35,6 +35,8 @@ union Block { this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; } + Block() = default; + Index index[2]; // index[0] is row pointer, index[1] is index in warp uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator }; -- GitLab From 346d1f86f0b4d28d7270c8fac85c829e9a6568e3 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Sat, 11 Jul 2020 01:41:35 +0200 Subject: [PATCH 40/57] Bug fix for benchmark --- src/TNL/Matrices/MatrixInfo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h index 297981735..432584d27 100644 --- a/src/TNL/Matrices/MatrixInfo.h +++ b/src/TNL/Matrices/MatrixInfo.h @@ -150,7 +150,7 @@ struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight6 > > { static String getDensity() { return String( "sparse" ); }; - static String getFormat() { return "CSR Legacy Light5"; }; + static String getFormat() { return "CSR Legacy Light6"; }; }; template< typename Real, typename Device, typename Index > -- GitLab From 5f2cb1655966df1aeb9d93c388542d5874d700eb Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Sun, 12 Jul 2020 02:04:18 +0200 Subject: [PATCH 41/57] Optimizations for CSR Adaptive, code cleaning --- src/TNL/Matrices/Legacy/CSR.h | 40 ++++++++++++- src/TNL/Matrices/Legacy/CSR_impl.h | 96 ++++++++++++++---------------- 2 files changed, 85 insertions(+), 51 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 5fdec9646..e08d28699 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -35,10 +35,25 @@ union Block { this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; } + Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { + this->index[0] = row; + this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; + + if (type == Type::STREAM) + this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; + + if (type == Type::STREAM) + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000; + else if (type == Type::VECTOR) + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b100000; + } + Block() = default; Index index[2]; // index[0] is row pointer, index[1] is index in warp uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator + uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID + //twobytes[3/5] is nextRow - row }; #ifdef HAVE_UMFPACK @@ -91,7 +106,30 @@ public: Containers::Vector< Block, Device, Index > blocks; - Index maxElementsPerWarp = 1024; + /* Configuration of SpMV kernels ------------------------------------------- */ + + /* Block sizes */ + + // Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache + // 512 threads per block for double (12 elements per thread) + static constexpr Index THREADS_ADAPTIVE = sizeof(Real) == 4 ? 1024 : 512; + static constexpr Index THREADS_SCALAR = 1024; + static constexpr Index THREADS_VECTOR = 1024; + static constexpr Index THREADS_LIGHT = 1024; + + /* Max length of row to process one warp */ + static constexpr Index MAX_ELEMENTS_PER_WARP = 1024; + + /* How many shared memory use per block in CSR Adaptive kernel */ + static constexpr Index SHARED_PER_BLOCK = 49152; + + /* Number of elements in shared memory */ + static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); + + /* Number of elements in shared memory per one warp */ + static constexpr Index SHARED_PER_WARP = SHARED / (THREADS_ADAPTIVE / 32); + /* -------------------------------------------------------------------------- */ + using Sparse< Real, Device, Index >::getAllocatedElementsCount; diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 0885711c3..d6a26141c 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -23,10 +23,7 @@ #include #endif - -/* Configuration */ constexpr size_t MAX_X_DIM = 2147483647; -//----------------------------------------------------------------- namespace TNL { namespace Matrices { @@ -122,10 +119,9 @@ template< typename Real, typename Index, typename Device, CSRKernel KernelType> -Index findLimit(const Index start, const Index max, +Index findLimit(const Index start, const CSR< Real, Device, Index, KernelType >& matrix, const Index size, - const Index maxElemPerWarp, Type &type, Index &sum) { sum = 0; @@ -133,12 +129,12 @@ Index findLimit(const Index start, const Index max, Index elements = matrix.getRowPointers().getElement(current + 1) - matrix.getRowPointers().getElement(current); sum += elements; - if (sum > max) { + if (sum > matrix.SHARED_PER_WARP) { if (current - start > 1) { // extra row type = Type::STREAM; return current; } else { // one long row - if (sum <= maxElemPerWarp) + if (sum <= matrix.MAX_ELEMENTS_PER_WARP) type = Type::VECTOR; else type = Type::LONG; @@ -167,7 +163,7 @@ void CSR< Real, Device, Index, KernelType >::setBlocks() while (nextStart != rows - 1) { Type type; nextStart = findLimit( - start, 384, *this, rows, this->maxElementsPerWarp, type, sum + start, *this, rows, type, sum ); if (type == Type::LONG) { Index parts = roundUpDivision(sum, 384); @@ -175,7 +171,11 @@ void CSR< Real, Device, Index, KernelType >::setBlocks() inBlock.emplace_back(start, Type::LONG, index); } } else { - inBlock.emplace_back(start, type); + inBlock.emplace_back(start, type, + nextStart, + this->rowPointers.getElement(nextStart), + this->rowPointers.getElement(start) + ); } start = nextStart; @@ -804,8 +804,9 @@ Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const template< typename Real, typename Index, int warpSize, - int sharedPerWarp, - int maxElemPerWarp > + int SHARED, + int SHARED_PER_WARP, + int MAX_ELEM_PER_WARP > __global__ void SpMVCSRAdaptive( const Real *inVector, Real *outVector, @@ -815,26 +816,27 @@ void SpMVCSRAdaptive( const Real *inVector, const Block *blocks, Index blocksSize, Index gridID) { - __shared__ Real shared_res[49152/sizeof(Real)]; + __shared__ Real shared[SHARED]; const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; if (blockIdx >= blocksSize) return; - Block block = blocks[blockIdx]; Real result = 0.0; const Index laneID = threadIdx.x & 31; // & is cheaper than % + Block block = blocks[blockIdx]; const Index minID = rowPointers[block.index[0]/* minRow */]; Index i, to, offset, maxID; - if (block.byte[sizeof(Index) == 4 ? 7 : 15] == 1) { + if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000) { /////////////////////////////////////* CSR STREAM *////////////// - const Index maxRow = blocks[blockIdx + 1].index[0]; - maxID = rowPointers[maxRow]; + const Index maxRow = block.index[0]/* minRow */ + + /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FF); + maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; /* offset between shared and global addresses */ - offset = minID - (threadIdx.x / warpSize * sharedPerWarp); + offset = minID - (threadIdx.x / warpSize * SHARED_PER_WARP); /* Copy and calculate elements from global to shared memory, coalesced */ for (i = laneID + minID; i < maxID; i += warpSize) - shared_res[i - offset] = values[i] * inVector[columnIndexes[i]]; + shared[i - offset] = values[i] * inVector[columnIndexes[i]]; /* Calculate result */ for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) { @@ -842,13 +844,13 @@ void SpMVCSRAdaptive( const Real *inVector, result = 0; /* Scalar reduction */ for (Index sharedID = rowPointers[i] - offset; sharedID < to; ++sharedID) - result += shared_res[sharedID]; + result += shared[sharedID]; outVector[i] = result; // Write result } - } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] == 2) { + } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b100000) { /////////////////////////////////////* CSR VECTOR *////////////// - maxID = rowPointers[block.index[0]/* minRow */ + 1]; + maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; for (i = minID + laneID; i < maxID; i += warpSize) result += values[i] * inVector[columnIndexes[i]]; @@ -864,8 +866,8 @@ void SpMVCSRAdaptive( const Real *inVector, /////////////////////////////////////* CSR VECTOR L *///////////// maxID = rowPointers[block.index[0]/* minRow */ + 1]; - offset = block.index[1]/* warpInRow */ * maxElemPerWarp; - to = minID + (block.index[1]/* warpInRow */ + 1) * maxElemPerWarp; + offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP; + to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP; if (to > maxID) to = maxID; for (i = minID + offset + laneID; i < to; i += warpSize) result += values[i] * inVector[columnIndexes[i]]; @@ -1339,7 +1341,7 @@ template< typename Real, void SpMVCSRScalarPrepare( const Real *inVector, Real* outVector, const CSR< Real, Device, Index, KernelType >& matrix) { - const Index threads = 1024; // block size + const Index threads = matrix.THREADS_SCALAR; // block size size_t neededThreads = matrix.getRowPointers().getSize() - 1; Index blocks; /* Execute kernels on device */ @@ -1372,7 +1374,7 @@ template< typename Real, void SpMVCSRVectorPrepare( const Real *inVector, Real* outVector, const CSR< Real, Device, Index, KernelType >& matrix) { - const Index threads = 1024; // block size + const Index threads = matrix.THREADS_VECTOR; // block size size_t neededThreads = matrix.getRowPointers().getSize() * warpSize; Index blocks; /* Execute kernels on device */ @@ -1405,7 +1407,7 @@ template< typename Real, void SpMVCSRLightPrepare( const Real *inVector, Real* outVector, const CSR< Real, Device, Index, KernelType >& matrix) { - const Index threads = 1024; // max block size + const Index threads = matrix.THREADS_LIGHT; // max block size const Index rows = matrix.getRowPointers().getSize() - 1; /* Copy rowCnt to GPU */ unsigned rowCnt = 0; @@ -1534,13 +1536,12 @@ template< typename Real, typename Index, typename Device, CSRKernel KernelType, - int warpSize, - int maxElemPerWarp > + int warpSize> void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, Real* outVector, const CSR< Real, Device, Index, KernelType >& matrix) { const Index rows = matrix.getRowPointers().getSize() - 1; - const Index threads = 1024; // block size + const Index threads = matrix.THREADS_LIGHT; // block size size_t neededThreads = rows * warpSize; Index blocks, groupSize; @@ -1553,10 +1554,10 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, groupSize = 8; else if (nnz <= 16) groupSize = 16; - else if (nnz <= maxElemPerWarp) + else if (nnz <= matrix.MAX_ELEMENTS_PER_WARP) groupSize = 32; // CSR Vector else - groupSize = roundUpDivision(nnz, maxElemPerWarp) * 32; // CSR MultiVector + groupSize = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector if (KernelType == CSRLightWithoutAtomic) neededThreads = groupSize * rows; @@ -1683,17 +1684,16 @@ template< typename Real, typename Index, typename Device, CSRKernel KernelType, - int warpSize, - int maxElemPerWarp > + int warpSize> void SpMVCSRMultiVectorPrepare( const Real *inVector, Real* outVector, const CSR< Real, Device, Index, KernelType >& matrix) { const Index rows = matrix.getRowPointers().getSize() - 1; - const Index threads = 1024; // block size + const Index threads = matrix.THREADS_VECTOR; // block size Index blocks; const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row - const Index neededWarps = roundUpDivision(nnz, maxElemPerWarp); // warps per row + const Index neededWarps = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP); // warps per row size_t neededThreads = warpSize * neededWarps * rows; /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { @@ -1734,23 +1734,15 @@ template< typename Real, typename Index, typename Device, CSRKernel KernelType, - int warpSize, - int maxElemPerWarp > + int warpSize> void SpMVCSRAdaptivePrepare( const Real *inVector, Real* outVector, const CSR< Real, Device, Index, KernelType >& matrix) { - /* Configuration ---------------------------------------------------*/ - /* Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache - 512 threads per block for double (12 elements per thread) */ - constexpr Index THREADS_PER_BLOCK = sizeof(Real) == 4 ? 1024 : 512; - constexpr Index WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32; - constexpr Index SHARED_PER_WARP = 49152/sizeof(Real) / WARPS_PER_BLOCK; - //-------------------------------------------------------------------- Index blocks; - const Index threads = THREADS_PER_BLOCK; + const Index threads = matrix.THREADS_ADAPTIVE; /* Fill blocks */ - size_t neededThreads = matrix.blocks.getSize() * 32; // one warp per block + size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { @@ -1761,7 +1753,11 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, neededThreads -= MAX_X_DIM * threads; } - SpMVCSRAdaptive<<>>( + SpMVCSRAdaptive< Real, Index, warpSize, + matrix.SHARED, + matrix.SHARED_PER_WARP, + matrix.MAX_ELEMENTS_PER_WARP > + <<>>( inVector, outVector, matrix.getRowPointers().getData(), @@ -1945,19 +1941,19 @@ class CSRDeviceDependentCode< Devices::Cuda > ); break; case CSRAdaptive: - SpMVCSRAdaptivePrepare( + SpMVCSRAdaptivePrepare( inVector.getData(), outVector.getData(), matrix ); break; case CSRMultiVector: - SpMVCSRMultiVectorPrepare( + SpMVCSRMultiVectorPrepare( inVector.getData(), outVector.getData(), matrix ); break; case CSRLight4: case CSRLight5: case CSRLightWithoutAtomic: - SpMVCSRLightWithoutAtomicPrepare( + SpMVCSRLightWithoutAtomicPrepare( inVector.getData(), outVector.getData(), matrix ); break; -- GitLab From cf04e9e941fa4482fa0e7359d0a8fd0d2cd608fc Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Tue, 14 Jul 2020 22:21:36 +0200 Subject: [PATCH 42/57] Fix for CSR Adaptive --- src/TNL/Matrices/Legacy/CSR.h | 4 ++-- src/TNL/Matrices/Legacy/CSR_impl.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index e08d28699..52528b0fd 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -43,9 +43,9 @@ union Block { this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; if (type == Type::STREAM) - this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000; + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; else if (type == Type::VECTOR) - this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b100000; + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; } Block() = default; diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index d6a26141c..efde670d9 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -827,10 +827,10 @@ void SpMVCSRAdaptive( const Real *inVector, Block block = blocks[blockIdx]; const Index minID = rowPointers[block.index[0]/* minRow */]; Index i, to, offset, maxID; - if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000) { + if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) { /////////////////////////////////////* CSR STREAM *////////////// const Index maxRow = block.index[0]/* minRow */ + - /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FF); + /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; /* offset between shared and global addresses */ offset = minID - (threadIdx.x / warpSize * SHARED_PER_WARP); @@ -848,7 +848,7 @@ void SpMVCSRAdaptive( const Real *inVector, outVector[i] = result; // Write result } - } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b100000) { + } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) { /////////////////////////////////////* CSR VECTOR *////////////// maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; -- GitLab From 1e1a974de455939b4ad4c04458a74a06e641a833 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 13 Jul 2020 13:28:05 +0200 Subject: [PATCH 43/57] Fixed Python SpMV benchmark script. --- .../scripts/tnl-spmv-benchmark-make-tables.py | 114 +++++++++--------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py index b88cac8c8..1e897d6aa 100755 --- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py +++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py @@ -87,11 +87,11 @@ df["BiEllpacky", "GPU", "cuSparse speedup"] = df["BiEllpack", df["CSR", "GPU", "cuSparse speedup"] = df["CSR", "GPU", "time"] / df["cuSparse", "GPU", "time"] df["CSR Legacy Adaptive", "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive", "GPU", "time"] / df["cuSparse", "GPU", "time"] df["CSR Legacy Light", "GPU", "cuSparse speedup"] = df["CSR Legacy Light", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Light2", "GPU", "cuSparse speedup"] = df["CSR Legacy Light2", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Light3", "GPU", "cuSparse speedup"] = df["CSR Legacy Light3", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Light4", "GPU", "cuSparse speedup"] = df["CSR Legacy Light4", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Light5", "GPU", "cuSparse speedup"] = df["CSR Legacy Light5", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Light6", "GPU", "cuSparse speedup"] = df["CSR Legacy Light6", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Light2", "GPU", "cuSparse speedup"] = df["CSR Legacy Light2", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Light3", "GPU", "cuSparse speedup"] = df["CSR Legacy Light3", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Light4", "GPU", "cuSparse speedup"] = df["CSR Legacy Light4", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Light5", "GPU", "cuSparse speedup"] = df["CSR Legacy Light5", "GPU", "time"] / df["cuSparse", "GPU", "time"] +df["CSR Legacy Light6", "GPU", "cuSparse speedup"] = df["CSR Legacy Light6", "GPU", "time"] / df["cuSparse", "GPU", "time"] df["CSR Legacy LightWithoutAtomic", "GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic", "GPU", "time"] / df["cuSparse", "GPU", "time"] df["CSR Legacy Scalar", "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar", "GPU", "time"] / df["cuSparse", "GPU", "time"] df["CSR Legacy Vector", "GPU", "cuSparse speedup"] = df["CSR Legacy Vector", "GPU", "time"] / df["cuSparse", "GPU", "time"] @@ -126,11 +126,11 @@ print( "Preparing data for graph analysis..." ) df['cuSparse-bandwidth' ] = df[ 'cuSparse','GPU','bandwidth'] df['csr-legacy-adaptive-bandwidth' ] = df[ 'CSR Legacy Adaptive','GPU','bandwidth'] df['csr-legacy-light-bandwidth' ] = df[ 'CSR Legacy Light','GPU','bandwidth'] -df['csr-legacy-light2-bandwidth' ] = df[ 'CSR Legacy Light2','GPU','bandwidth'] -df['csr-legacy-light3-bandwidth' ] = df[ 'CSR Legacy Light3','GPU','bandwidth'] -df['csr-legacy-light4-bandwidth' ] = df[ 'CSR Legacy Light4','GPU','bandwidth'] -df['csr-legacy-light5-bandwidth' ] = df[ 'CSR Legacy Light5','GPU','bandwidth'] -df['csr-legacy-light6-bandwidth' ] = df[ 'CSR Legacy Light6','GPU','bandwidth'] +df['csr-legacy-light2-bandwidth' ] = df[ 'CSR Legacy Light2','GPU','bandwidth'] +df['csr-legacy-light3-bandwidth' ] = df[ 'CSR Legacy Light3','GPU','bandwidth'] +df['csr-legacy-light4-bandwidth' ] = df[ 'CSR Legacy Light4','GPU','bandwidth'] +df['csr-legacy-light5-bandwidth' ] = df[ 'CSR Legacy Light5','GPU','bandwidth'] +df['csr-legacy-light6-bandwidth' ] = df[ 'CSR Legacy Light6','GPU','bandwidth'] df['csr-legacy-light-without-atomic-bandwidth' ] = df[ 'CSR Legacy LightWithoutAtomic','GPU','bandwidth'] df['csr-legacy-scalar-bandwidth' ] = df[ 'CSR Legacy Scalar','GPU','bandwidth'] df['csr-legacy-vector-bandwidth' ] = df[ 'CSR Legacy Vector','GPU','bandwidth'] @@ -214,19 +214,19 @@ for x in cuSparse_list: cuSparse_file.write( f"{i+1} {x} " ) # 1 2 cuSparse_file.write( f"{cuSparse_csr_legacy_adaptive_gpu_list[ i ]} " ) # 3 cuSparse_file.write( f"{cuSparse_csr_legacy_light_gpu_list[ i ]} " ) # 4 - cuSparse_file.write( f"{cuSparse_csr_legacy_light2_gpu_list[ i ]} " ) # 4 - cuSparse_file.write( f"{cuSparse_csr_legacy_light3_gpu_list[ i ]} " ) # 4 - cuSparse_file.write( f"{cuSparse_csr_legacy_light4_gpu_list[ i ]} " ) # 4 - cuSparse_file.write( f"{cuSparse_csr_legacy_light5_gpu_list[ i ]} " ) # 4 - cuSparse_file.write( f"{cuSparse_csr_legacy_light6_gpu_list[ i ]} " ) # 4 - cuSparse_file.write( f"{cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ]} " ) # 5 - cuSparse_file.write( f"{cuSparse_csr_legacy_scalar_gpu_list[ i ]} " ) # 6 - cuSparse_file.write( f"{cuSparse_csr_legacy_vector_gpu_list[ i ]} " ) # 7 - cuSparse_file.write( f"{cuSparse_csr_legacy_multivector_gpu_list[ i ]} " ) # 8 - cuSparse_file.write( f"{cuSparse_ellpack_gpu_list[ i ]} {cuSparse_ellpack_legacy_gpu_list[ i ]} " ) # 9 10 - cuSparse_file.write( f"{cuSparse_sliced_ellpack_gpu_list[ i ]} {cuSparse_sliced_ellpack_legacy_gpu_list[ i ]} " ) # 11 12 - cuSparse_file.write( f"{cuSparse_chunked_ellpack_gpu_list[ i ]} {cuSparse_chunked_ellpack_legacy_gpu_list[ i ]} " ) # 13 14 - cuSparse_file.write( f"{cuSparse_bi_ellpack_gpu_list[ i ]} {cuSparse_bi_ellpack_legacy_gpu_list[ i ]}\n" ) # 15 16 + cuSparse_file.write( f"{cuSparse_csr_legacy_light2_gpu_list[ i ]} " ) # 5 + cuSparse_file.write( f"{cuSparse_csr_legacy_light3_gpu_list[ i ]} " ) # 6 + cuSparse_file.write( f"{cuSparse_csr_legacy_light4_gpu_list[ i ]} " ) # 7 + cuSparse_file.write( f"{cuSparse_csr_legacy_light5_gpu_list[ i ]} " ) # 8 + cuSparse_file.write( f"{cuSparse_csr_legacy_light6_gpu_list[ i ]} " ) # 9 + cuSparse_file.write( f"{cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ]} " ) # 10 + cuSparse_file.write( f"{cuSparse_csr_legacy_scalar_gpu_list[ i ]} " ) # 11 + cuSparse_file.write( f"{cuSparse_csr_legacy_vector_gpu_list[ i ]} " ) # 12 + cuSparse_file.write( f"{cuSparse_csr_legacy_multivector_gpu_list[ i ]} " ) # 13 + cuSparse_file.write( f"{cuSparse_ellpack_gpu_list[ i ]} {cuSparse_ellpack_legacy_gpu_list[ i ]} " ) # 14 15 + cuSparse_file.write( f"{cuSparse_sliced_ellpack_gpu_list[ i ]} {cuSparse_sliced_ellpack_legacy_gpu_list[ i ]} " ) # 16 17 + cuSparse_file.write( f"{cuSparse_chunked_ellpack_gpu_list[ i ]} {cuSparse_chunked_ellpack_legacy_gpu_list[ i ]} " ) # 18 19 + cuSparse_file.write( f"{cuSparse_bi_ellpack_gpu_list[ i ]} {cuSparse_bi_ellpack_legacy_gpu_list[ i ]}\n" ) # 20 21 i = i + 1 cuSparse_file.close() @@ -289,76 +289,76 @@ plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', set output 'csr-legacy-light2-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:4 title 'CSR Legacy Light2' with lines linewidth 0.5 lt rgb 'green', + 'cusparse.gplt' using 1:5 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:5 title 'CSR Legacy Light2' with lines linewidth 0.5 lt rgb 'green', set output 'csr-legacy-light3-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:4 title 'CSR Legacy Light3' with lines linewidth 0.5 lt rgb 'green', + 'cusparse.gplt' using 1:6 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:6 title 'CSR Legacy Light3' with lines linewidth 0.5 lt rgb 'green', set output 'csr-legacy-light4-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:4 title 'CSR Legacy Light4' with lines linewidth 0.5 lt rgb 'green', + 'cusparse.gplt' using 1:7 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:7 title 'CSR Legacy Light4' with lines linewidth 0.5 lt rgb 'green', set output 'csr-legacy-light5-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:4 title 'CSR Legacy Light5' with lines linewidth 0.5 lt rgb 'green', + 'cusparse.gplt' using 1:8 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:8 title 'CSR Legacy Light5' with lines linewidth 0.5 lt rgb 'green', set output 'csr-legacy-light6-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:4 title 'CSR Legacy Light6' with lines linewidth 0.5 lt rgb 'green', + 'cusparse.gplt' using 1:9 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:9 title 'CSR Legacy Light6' with lines linewidth 0.5 lt rgb 'green', set output 'csr-legacy-light-without-atomic-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:5 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:5 title 'CSR Legacy LightWithoutAtomic' with lines linewidth 0.5 lt rgb 'green', + 'cusparse.gplt' using 1:10 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:10 title 'CSR Legacy LightWithoutAtomic' with lines linewidth 0.5 lt rgb 'green', set output 'csr-legacy-scalar-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:6 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:6 title 'CSR Legacy Scalar' with lines linewidth 0.5 lt rgb 'green', + 'cusparse.gplt' using 1:11 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:11 title 'CSR Legacy Scalar' with lines linewidth 0.5 lt rgb 'green', set output 'csr-legacy-vector-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:7 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:7 title 'CSR Legacy Vector' with lines linewidth 0.5 lt rgb 'green', + 'cusparse.gplt' using 1:12 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:12 title 'CSR Legacy Vector' with lines linewidth 0.5 lt rgb 'green', set output 'csr-legacy-multivector-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:8 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:8 title 'CSR Legacy MultiVector' with lines linewidth 0.5 lt rgb 'green', + 'cusparse.gplt' using 1:13 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:13 title 'CSR Legacy MultiVector' with lines linewidth 0.5 lt rgb 'green', set output 'ellpack-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:9 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:9 title 'Ellpack' with lines linewidth 0.5 lt rgb 'green', \ - 'cusparse.gplt' using 1:10 title '' with dots linewidth 2 lt rgb 'blue', \ - 'cusparse.gplt' using 1:10 title 'Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue' + 'cusparse.gplt' using 1:14 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:14 title 'Ellpack' with lines linewidth 0.5 lt rgb 'green', \ + 'cusparse.gplt' using 1:15 title '' with dots linewidth 2 lt rgb 'blue', \ + 'cusparse.gplt' using 1:15 title 'Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue' set output 'sliced-ellpack-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:11 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:11 title 'Sliced Ellpack' with lines linewidth 0.5 lt rgb 'green', \ - 'cusparse.gplt' using 1:12 title '' with dots linewidth 2 lt rgb 'blue', \ - 'cusparse.gplt' using 1:12 title 'Sliced Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue' + 'cusparse.gplt' using 1:16 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:16 title 'Sliced Ellpack' with lines linewidth 0.5 lt rgb 'green', \ + 'cusparse.gplt' using 1:17 title '' with dots linewidth 2 lt rgb 'blue', \ + 'cusparse.gplt' using 1:17 title 'Sliced Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue' set output 'chunked-ellpack-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:13 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:13 title 'Chunked Ellpack' with lines linewidth 0.5 lt rgb 'green', \ - 'cusparse.gplt' using 1:14 title '' with dots linewidth 2 lt rgb 'blue', \ - 'cusparse.gplt' using 1:14 title 'Chunked Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue' + 'cusparse.gplt' using 1:18 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:18 title 'Chunked Ellpack' with lines linewidth 0.5 lt rgb 'green', \ + 'cusparse.gplt' using 1:19 title '' with dots linewidth 2 lt rgb 'blue', \ + 'cusparse.gplt' using 1:19 title 'Chunked Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue' set output 'bi-ellpack-vs-cusparse.eps' plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:15 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:15 title 'BiEllpack' with lines linewidth 0.5 lt rgb 'green', \ - 'cusparse.gplt' using 1:16 title '' with dots linewidth 2 lt rgb 'blue', \ - 'cusparse.gplt' using 1:16 title 'BiEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue' + 'cusparse.gplt' using 1:20 title '' with dots linewidth 2 lt rgb 'green', \ + 'cusparse.gplt' using 1:20 title 'BiEllpack' with lines linewidth 0.5 lt rgb 'green', \ + 'cusparse.gplt' using 1:21 title '' with dots linewidth 2 lt rgb 'blue', \ + 'cusparse.gplt' using 1:21 title 'BiEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue' set output 'ellpack-vs-ellpack-legacy.eps' plot 'ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'ellpack.gplt' using 1:2 title 'Ellpack' with lines linewidth 0.5 lt rgb 'red', \ -- GitLab From f49f946dda8058cf9666aaff3af77feeb898974f Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Tue, 14 Jul 2020 22:45:21 +0200 Subject: [PATCH 44/57] Fixes for CSR Adaptive --- src/TNL/Matrices/Legacy/CSR_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index efde670d9..387b50fc7 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -166,7 +166,7 @@ void CSR< Real, Device, Index, KernelType >::setBlocks() start, *this, rows, type, sum ); if (type == Type::LONG) { - Index parts = roundUpDivision(sum, 384); + Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); for (Index index = 0; index < parts; ++index) { inBlock.emplace_back(start, Type::LONG, index); } -- GitLab From d1c5eecbd3f3fac5a52307c17a6146c5b7216962 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Wed, 15 Jul 2020 01:23:24 +0200 Subject: [PATCH 45/57] Fixed uninitialized variable --- src/TNL/Matrices/Legacy/CSR.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 52528b0fd..b1d9c68bb 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -37,6 +37,7 @@ union Block { Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { this->index[0] = row; + this->index[1] = 0; this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; if (type == Type::STREAM) -- GitLab From 382eb38ffb40bcf6d23b2df44f9ddd220c025b73 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Tue, 21 Jul 2020 00:46:47 +0200 Subject: [PATCH 46/57] Fix for CSR Adaptive --- src/TNL/Matrices/Legacy/CSR.h | 18 ++++++++-------- src/TNL/Matrices/Legacy/CSR_impl.h | 34 +++++++++++++++--------------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index b1d9c68bb..26a1c17bb 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -107,28 +107,28 @@ public: Containers::Vector< Block, Device, Index > blocks; - /* Configuration of SpMV kernels ------------------------------------------- */ + /* Configuration of CSR SpMV kernels ----------------------------------------- */ /* Block sizes */ - - // Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache - // 512 threads per block for double (12 elements per thread) - static constexpr Index THREADS_ADAPTIVE = sizeof(Real) == 4 ? 1024 : 512; + static constexpr Index THREADS_ADAPTIVE = 1024; static constexpr Index THREADS_SCALAR = 1024; static constexpr Index THREADS_VECTOR = 1024; static constexpr Index THREADS_LIGHT = 1024; - + /* Max length of row to process one warp */ static constexpr Index MAX_ELEMENTS_PER_WARP = 1024; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 49152; - + /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); - + + /* Number of warps in block for CSR Adaptive */ + static constexpr Index WARPS = THREADS_ADAPTIVE / 32; + /* Number of elements in shared memory per one warp */ - static constexpr Index SHARED_PER_WARP = SHARED / (THREADS_ADAPTIVE / 32); + static constexpr Index SHARED_PER_WARP = SHARED / WARPS; /* -------------------------------------------------------------------------- */ diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 387b50fc7..b5a05cc5e 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -130,7 +130,7 @@ Index findLimit(const Index start, matrix.getRowPointers().getElement(current); sum += elements; if (sum > matrix.SHARED_PER_WARP) { - if (current - start > 1) { // extra row + if (current - start > 0) { // extra row type = Type::STREAM; return current; } else { // one long row @@ -804,7 +804,7 @@ Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const template< typename Real, typename Index, int warpSize, - int SHARED, + int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP > __global__ @@ -816,7 +816,7 @@ void SpMVCSRAdaptive( const Real *inVector, const Block *blocks, Index blocksSize, Index gridID) { - __shared__ Real shared[SHARED]; + __shared__ Real shared[WARPS][SHARED_PER_WARP]; const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; if (blockIdx >= blocksSize) @@ -826,25 +826,25 @@ void SpMVCSRAdaptive( const Real *inVector, const Index laneID = threadIdx.x & 31; // & is cheaper than % Block block = blocks[blockIdx]; const Index minID = rowPointers[block.index[0]/* minRow */]; - Index i, to, offset, maxID; + Index i, to, maxID; if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) { /////////////////////////////////////* CSR STREAM *////////////// - const Index maxRow = block.index[0]/* minRow */ + - /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); + const Index warpID = threadIdx.x / 32; maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; - /* offset between shared and global addresses */ - offset = minID - (threadIdx.x / warpSize * SHARED_PER_WARP); - /* Copy and calculate elements from global to shared memory, coalesced */ + + /* Stream data to shared memory */ for (i = laneID + minID; i < maxID; i += warpSize) - shared[i - offset] = values[i] * inVector[columnIndexes[i]]; + shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]]; + const Index maxRow = block.index[0]/* minRow */ + + /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); /* Calculate result */ for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) { - to = rowPointers[i + 1] - offset; // end of preprocessed data + to = rowPointers[i + 1] - minID; // end of preprocessed data result = 0; /* Scalar reduction */ - for (Index sharedID = rowPointers[i] - offset; sharedID < to; ++sharedID) - result += shared[sharedID]; + for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID) + result += shared[warpID][sharedID]; outVector[i] = result; // Write result } @@ -864,10 +864,10 @@ void SpMVCSRAdaptive( const Real *inVector, if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result } else { /////////////////////////////////////* CSR VECTOR L *///////////// - maxID = rowPointers[block.index[0]/* minRow */ + 1]; - - offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP; + /* Number of elements processed by previous warps */ + const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP; to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP; + maxID = rowPointers[block.index[0]/* minRow */ + 1]; if (to > maxID) to = maxID; for (i = minID + offset + laneID; i < to; i += warpSize) result += values[i] * inVector[columnIndexes[i]]; @@ -1754,7 +1754,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, } SpMVCSRAdaptive< Real, Index, warpSize, - matrix.SHARED, + matrix.WARPS, matrix.SHARED_PER_WARP, matrix.MAX_ELEMENTS_PER_WARP > <<>>( -- GitLab From 4196f9150948ac7e896498531e9fee19ebb469a4 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Sun, 26 Jul 2020 12:57:35 +0200 Subject: [PATCH 47/57] Fixed block sizes for CSR Light, other improvements --- src/TNL/Matrices/Legacy/CSR.h | 12 ++++++------ src/TNL/Matrices/Legacy/CSR_impl.h | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 26a1c17bb..6b5664363 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -110,16 +110,16 @@ public: /* Configuration of CSR SpMV kernels ----------------------------------------- */ /* Block sizes */ - static constexpr Index THREADS_ADAPTIVE = 1024; - static constexpr Index THREADS_SCALAR = 1024; - static constexpr Index THREADS_VECTOR = 1024; - static constexpr Index THREADS_LIGHT = 1024; + static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; + static constexpr Index THREADS_SCALAR = 128; + static constexpr Index THREADS_VECTOR = 128; + static constexpr Index THREADS_LIGHT = 128; /* Max length of row to process one warp */ - static constexpr Index MAX_ELEMENTS_PER_WARP = 1024; + static constexpr Index MAX_ELEMENTS_PER_WARP = 512; /* How many shared memory use per block in CSR Adaptive kernel */ - static constexpr Index SHARED_PER_BLOCK = 49152; + static constexpr Index SHARED_PER_BLOCK = 24576; /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index b5a05cc5e..e03e4db6d 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -134,7 +134,7 @@ Index findLimit(const Index start, type = Type::STREAM; return current; } else { // one long row - if (sum <= matrix.MAX_ELEMENTS_PER_WARP) + if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP) type = Type::VECTOR; else type = Type::LONG; @@ -1407,7 +1407,7 @@ template< typename Real, void SpMVCSRLightPrepare( const Real *inVector, Real* outVector, const CSR< Real, Device, Index, KernelType >& matrix) { - const Index threads = matrix.THREADS_LIGHT; // max block size + const Index threads = 1024; // max block size const Index rows = matrix.getRowPointers().getSize() - 1; /* Copy rowCnt to GPU */ unsigned rowCnt = 0; @@ -1544,7 +1544,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, const Index threads = matrix.THREADS_LIGHT; // block size size_t neededThreads = rows * warpSize; Index blocks, groupSize; - + const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row if (nnz <= 2) groupSize = 2; @@ -1554,7 +1554,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, groupSize = 8; else if (nnz <= 16) groupSize = 16; - else if (nnz <= matrix.MAX_ELEMENTS_PER_WARP) + else if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP) groupSize = 32; // CSR Vector else groupSize = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector -- GitLab From dca1b1ca24be2e59f31676118e7ae292b1ce3e09 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Mon, 27 Jul 2020 11:51:59 +0200 Subject: [PATCH 48/57] Set max elements per warp --- src/TNL/Matrices/Legacy/CSR.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 6b5664363..7d4ebcb7c 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -116,7 +116,7 @@ public: static constexpr Index THREADS_LIGHT = 128; /* Max length of row to process one warp */ - static constexpr Index MAX_ELEMENTS_PER_WARP = 512; + static constexpr Index MAX_ELEMENTS_PER_WARP = 1024; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 24576; -- GitLab From 62e73b0d4704634ed249ec9d164617c98053a52d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 4 Aug 2020 14:50:02 +0200 Subject: [PATCH 49/57] Optimizing slow Legacy CSR unit test. --- .../Matrices/Legacy/SparseMatrixTest.hpp | 86 +++++++++---------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp index 333b97371..9709dd895 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp @@ -1393,45 +1393,36 @@ void test_VectorProductCSRAdaptive() using IndexType = typename Matrix::IndexType; using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; - - Matrix m; - m.reset(); IndexType m_rows = 100; IndexType m_cols = 100; //----------------- Test CSR Stream part ------------------ + Matrix m; m.setDimensions( m_rows, m_cols ); - typename Matrix::CompressedRowLengthsVector rowLengths( - { - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100 - } - ); - - m.setCompressedRowLengths( rowLengths ); - - for (int i = 0; i < m_rows; ++i) - for (int j = 0; j < m_cols; ++j) - m.setElement( i, j, i + 1 ); + typename Matrix::CompressedRowLengthsVector rowLengths( 100, 100 ); + if( std::is_same< DeviceType, TNL::Devices::Cuda >::value ) + { + typedef typename Matrix::Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType; + typename HostMatrixType::CompressedRowLengthsVector rowLengths( 100, 100 ); + HostMatrixType hostMatrix; + hostMatrix.setDimensions( m_rows, m_cols ); + hostMatrix.setCompressedRowLengths( rowLengths ); + for (int i = 0; i < m_rows; ++i) + for (int j = 0; j < m_cols; ++j) + hostMatrix.setElement( i, j, i + 1 ); + m = hostMatrix; + } + else + { + m.setCompressedRowLengths( rowLengths ); + for (int i = 0; i < m_rows; ++i) + for (int j = 0; j < m_cols; ++j) + m.setElement( i, j, i + 1 ); + } - VectorType inVector; - inVector.setSize( m_rows ); - for( IndexType i = 0; i < inVector.getSize(); ++i ) - inVector.setElement( i, 1 ); - VectorType outVector; - outVector.setSize( m_rows ); - for( IndexType i = 0; i < outVector.getSize(); ++i ) - outVector.setElement( i, 0 ); - + VectorType inVector( m_rows, 1.0 ); + VectorType outVector( m_rows, 0.0 ); m.vectorProduct( inVector, outVector); for (int i = 0; i < m_rows; ++i) @@ -1447,20 +1438,27 @@ void test_VectorProductCSRAdaptive() m.setDimensions( m_rows, m_cols ); typename Matrix::CompressedRowLengthsVector rowLengths2({m_cols}); - m.setCompressedRowLengths( rowLengths2 ); - - for (int i = 0; i < m_cols; ++i) - m.setElement( 0, i, i ); + if( std::is_same< DeviceType, TNL::Devices::Cuda >::value ) + { + typedef typename Matrix::Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType; + typename HostMatrixType::CompressedRowLengthsVector rowLengths( {m_cols} ); + HostMatrixType hostMatrix; + hostMatrix.setDimensions( m_rows, m_cols ); + hostMatrix.setCompressedRowLengths( rowLengths ); + for( int i = 0; i < m_cols; ++i ) + hostMatrix.setElement( 0, i, i ); + m = hostMatrix; + } + else + { + m.setCompressedRowLengths( rowLengths2 ); + for (int i = 0; i < m_cols; ++i) + m.setElement( 0, i, i ); + } - VectorType inVector2; - inVector2.setSize( m_cols ); - for( IndexType i = 0; i < inVector2.getSize(); i++ ) - inVector2.setElement( i, 2 ); + VectorType inVector2( m_cols, 2.0 ); - VectorType outVector2; - outVector2.setSize( m_rows ); - for( IndexType i = 0; i < outVector2.getSize(); ++i ) - outVector2.setElement( i, 0 ); + VectorType outVector2( m_rows, 0.0 ); m.vectorProduct(inVector2, outVector2); EXPECT_EQ( outVector2.getElement( 0 ), 8997000 ); -- GitLab From 71441737fb2bf50ca5584039a3201b7800d4ab78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 4 Aug 2020 15:48:33 +0200 Subject: [PATCH 50/57] Added unit tests for all CSR legacy formats. --- src/TNL/Matrices/Legacy/CSR.h | 2 +- .../Matrices/Legacy/SparseMatrixTest_CSR.h | 64 ++++++++++++++----- 2 files changed, 49 insertions(+), 17 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 7d4ebcb7c..818e51883 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -68,7 +68,7 @@ class CusparseCSR; template< typename Device > class CSRDeviceDependentCode; -enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, +enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, // Hybrid is not implemented CSRLight, CSRLight2, CSRLight3, CSRLight4, CSRLight5, CSRLight6, CSRAdaptive, CSRMultiVector, CSRLightWithoutAtomic }; diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h index 0cf205929..4b9325e06 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h @@ -27,23 +27,55 @@ protected: // types for which MatrixTest is instantiated using CSRMatrixTypes = ::testing::Types < - TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, int >, - TNL::Matrices::Legacy::CSR< long, TNL::Devices::Host, int >, - // TNL::Matrices::Legacy::CSR< float, TNL::Devices::Host, int >, - TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >, - TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, long >, - TNL::Matrices::Legacy::CSR< long, TNL::Devices::Host, long >, - // TNL::Matrices::Legacy::CSR< float, TNL::Devices::Host, long >, - TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long > + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, int, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Host, int, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, long, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Host, long, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long, TNL::Matrices::Legacy::CSRScalar > #ifdef HAVE_CUDA - ,TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int >, - // TNL::Matrices::Legacy::CSR< long, TNL::Devices::Cuda, int >, // cuda atomicAdd has no support for long, only unsigned long long int - TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int >, - TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int >, - TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long >, - // TNL::Matrices::Legacy::CSR< long, TNL::Devices::Cuda, long >, - TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long >, - TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long > + ,TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRVector >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRVector >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRVector >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRVector >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRVector >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRVector >, + /*TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRHybrid >, // Not implemented + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRHybrid >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRHybrid >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRHybrid >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRHybrid >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRHybrid >,*/ + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRLight >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRLight >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRLight >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLight >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLight >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLight >, + /*TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRAdaptive >, // Does not work, needs to be fixed. + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRAdaptive >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRAdaptive >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRAdaptive >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRAdaptive >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRAdaptive >,*/ + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRMultiVector >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRMultiVector >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRMultiVector >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRMultiVector >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRMultiVector >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRMultiVector >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRLightWithoutAtomic >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRLightWithoutAtomic >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRLightWithoutAtomic >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLightWithoutAtomic >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLightWithoutAtomic >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLightWithoutAtomic > #endif >; -- GitLab From 3c030845f3ade77046e70518fd62badaabbb4895 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 5 Aug 2020 12:17:56 +0200 Subject: [PATCH 51/57] Refactoring Python script for SpMV benchmark results processing. --- .../scripts/tnl-spmv-benchmark-make-tables.py | 289 +++++------------- 1 file changed, 68 insertions(+), 221 deletions(-) diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py index 1e897d6aa..3d5ce16be 100755 --- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py +++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py @@ -5,13 +5,36 @@ import re import math import pandas +from collections import defaultdict from TNL.LogParser import LogParser +""" +Sparse matrix formats as they appear in the log file. +""" +cpu_matrix_formats = [ 'CSR', + 'Ellpack', 'Ellpack Legacy', + 'SlicedEllpack', 'SlicedEllpack Legacy', + 'ChunkedEllpack', 'ChunkedEllpack Legacy', + 'BiEllpack', 'BiEllpack Legacy' ] + +gpu_matrix_formats = [ 'CSR Legacy Scalar', 'CSR Legacy Vector', 'CSR Legacy MultiVector', + 'CSR Legacy Light', 'CSR Legacy Light2', 'CSR Legacy Light3', 'CSR Legacy Light4', 'CSR Legacy Light5', 'CSR Legacy Light6', 'CSR Legacy LightWithoutAtomic', + 'CSR Legacy Adaptive', + 'Ellpack', 'Ellpack Legacy', + 'SlicedEllpack', 'SlicedEllpack Legacy', + 'ChunkedEllpack', 'ChunkedEllpack Legacy', + 'BiEllpack', 'BiEllpack Legacy' ] + #pandas.options.display.float_format = "{:.2f}".format pandas.options.display.float_format = "{:.2e}".format pandas.options.display.width = 0 # auto-detect terminal width for formatting pandas.options.display.max_rows = None +def slugify(s): + s = str(s).strip().replace(' ', '_') + return re.sub(r'(?u)[^-\w.]', '', s) + + def parse_file(fname): parser = LogParser() for metadata, df in parser.readFile(fname): @@ -59,20 +82,8 @@ df = df.reorder_levels([2, 0, 1], axis=1) df.sort_index(axis=1, inplace=True) # Drop CPU speedup -df.drop(columns=('BiEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('BiEllpack', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('CSR', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy Adaptive', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy Light', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy Scalar', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy Stream', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy Vector', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('ChunkedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('Ellpack', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('Ellpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('SlicedEllpack', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('SlicedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('cuSparse', 'CPU'), axis=1, inplace=True ) +for cpu_format in cpu_matrix_formats: + df.drop(columns=( cpu_format, 'CPU','speedup'), axis=1, inplace=True ) #print( "Exporting data frame to log.html..." ) #pandas.options.display.float_format = '{:,.4f}'.format @@ -80,32 +91,12 @@ df.drop(columns=('SlicedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True print( "Computing speed-up of formats...") # Add speedup compared to CSR and cuSparse -df["BiEllpack Legacy", "CPU", "CSR speedup"] = df["BiEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["BiEllpack Legacy", "GPU", "cuSparse speedup"] = df["BiEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["BiEllpack", "CPU", "CSR speedup"] = df["BiEllpack", "CPU", "time"] / df["CSR", "CPU", "time"] -df["BiEllpacky", "GPU", "cuSparse speedup"] = df["BiEllpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR", "GPU", "cuSparse speedup"] = df["CSR", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Adaptive", "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Light", "GPU", "cuSparse speedup"] = df["CSR Legacy Light", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Light2", "GPU", "cuSparse speedup"] = df["CSR Legacy Light2", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Light3", "GPU", "cuSparse speedup"] = df["CSR Legacy Light3", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Light4", "GPU", "cuSparse speedup"] = df["CSR Legacy Light4", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Light5", "GPU", "cuSparse speedup"] = df["CSR Legacy Light5", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Light6", "GPU", "cuSparse speedup"] = df["CSR Legacy Light6", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy LightWithoutAtomic", "GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Scalar", "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy Vector", "GPU", "cuSparse speedup"] = df["CSR Legacy Vector", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR Legacy MultiVector", "GPU", "cuSparse speedup"] = df["CSR Legacy MultiVector", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["ChunkedEllpack Legacy", "CPU", "CSR speedup"] = df["ChunkedEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["ChunkedEllpack Legacy", "GPU", "cuSparse speedup"] = df["ChunkedEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["Ellpack Legacy", "CPU", "CSR speedup"] = df["Ellpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["Ellpack Legacy", "GPU", "cuSparse speedup"] = df["Ellpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["Ellpack", "CPU", "CSR speedup"] = df["Ellpack", "CPU", "time"] / df["CSR", "CPU", "time"] -df["Ellpack", "GPU", "cuSparse speedup"] = df["Ellpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["SlicedEllpack Legacy", "CPU", "CSR speedup"] = df["SlicedEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["SlicedEllpack Legacy", "GPU", "cuSparse speedup"] = df["SlicedEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["SlicedEllpack", "CPU", "CSR speedup"] = df["SlicedEllpack", "CPU", "time"] / df["CSR", "CPU", "time"] -df["SlicedEllpack", "GPU", "cuSparse speedup"] = df["SlicedEllpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] +for cpu_format in cpu_matrix_formats: + if cpu_format != 'CSR': + df[cpu_format, "CPU", "CSR speedup"] = df[cpu_format, "CPU", "time"] / df["CSR","CPU", "time"] + +for gpu_format in gpu_matrix_formats: + df[ gpu_format, "GPU", "cuSparse speedup"] = df[ gpu_format,"GPU", "time"] / df["cuSparse", "GPU", "time"] # Add speedup compared to legacy formats df["CSR", "GPU", "Legacy speedup"] = df["CSR", "GPU", "time"] / df["CSR Legacy Scalar", "GPU", "time"] @@ -124,111 +115,50 @@ df.to_html("log.html") # extract columns of reference formats on GPU print( "Preparing data for graph analysis..." ) df['cuSparse-bandwidth' ] = df[ 'cuSparse','GPU','bandwidth'] -df['csr-legacy-adaptive-bandwidth' ] = df[ 'CSR Legacy Adaptive','GPU','bandwidth'] -df['csr-legacy-light-bandwidth' ] = df[ 'CSR Legacy Light','GPU','bandwidth'] -df['csr-legacy-light2-bandwidth' ] = df[ 'CSR Legacy Light2','GPU','bandwidth'] -df['csr-legacy-light3-bandwidth' ] = df[ 'CSR Legacy Light3','GPU','bandwidth'] -df['csr-legacy-light4-bandwidth' ] = df[ 'CSR Legacy Light4','GPU','bandwidth'] -df['csr-legacy-light5-bandwidth' ] = df[ 'CSR Legacy Light5','GPU','bandwidth'] -df['csr-legacy-light6-bandwidth' ] = df[ 'CSR Legacy Light6','GPU','bandwidth'] -df['csr-legacy-light-without-atomic-bandwidth' ] = df[ 'CSR Legacy LightWithoutAtomic','GPU','bandwidth'] -df['csr-legacy-scalar-bandwidth' ] = df[ 'CSR Legacy Scalar','GPU','bandwidth'] -df['csr-legacy-vector-bandwidth' ] = df[ 'CSR Legacy Vector','GPU','bandwidth'] -df['csr-legacy-multi-vector-bandwidth' ] = df[ 'CSR Legacy MultiVector','GPU','bandwidth'] -df['ellpack-bandwidth' ] = df[ 'Ellpack','GPU','bandwidth'] -df['sliced-ellpack-bandwidth' ] = df[ 'SlicedEllpack','GPU','bandwidth'] -df['chunked-ellpack-bandwidth' ] = df[ 'ChunkedEllpack','GPU','bandwidth'] -df['bi-ellpack-bandwidth' ] = df[ 'BiEllpack','GPU','bandwidth'] +for gpu_format in gpu_matrix_formats: + df[ gpu_format + ' Bandwidth' ] = df[ gpu_format,'GPU','bandwidth'] # sort by cuSparse df.sort_values(by=["cuSparse-bandwidth"],inplace=True,ascending=False) cuSparse_list = df['cuSparse-bandwidth'].tolist() -cuSparse_csr_legacy_adaptive_gpu_list = df[ "CSR Legacy Adaptive", "GPU", "bandwidth"].tolist(); -cuSparse_csr_legacy_light_gpu_list = df[ "CSR Legacy Light", "GPU", "bandwidth"].tolist(); -cuSparse_csr_legacy_light2_gpu_list = df[ "CSR Legacy Light2", "GPU", "bandwidth"].tolist(); -cuSparse_csr_legacy_light3_gpu_list = df[ "CSR Legacy Light3", "GPU", "bandwidth"].tolist(); -cuSparse_csr_legacy_light4_gpu_list = df[ "CSR Legacy Light4", "GPU", "bandwidth"].tolist(); -cuSparse_csr_legacy_light5_gpu_list = df[ "CSR Legacy Light5", "GPU", "bandwidth"].tolist(); -cuSparse_csr_legacy_light6_gpu_list = df[ "CSR Legacy Light6", "GPU", "bandwidth"].tolist(); -cuSparse_csr_legacy_light_without_atomic_gpu_list = df[ "CSR Legacy LightWithoutAtomic", "GPU", "bandwidth"].tolist(); -cuSparse_csr_legacy_scalar_gpu_list = df[ "CSR Legacy Scalar", "GPU", "bandwidth"].tolist(); -cuSparse_csr_legacy_vector_gpu_list = df[ "CSR Legacy Vector", "GPU", "bandwidth"].tolist(); -cuSparse_csr_legacy_multivector_gpu_list = df[ "CSR Legacy MultiVector", "GPU", "bandwidth"].tolist(); -cuSparse_ellpack_gpu_list = df[ "Ellpack", "GPU", "bandwidth"].tolist(); -cuSparse_ellpack_legacy_gpu_list = df[ "Ellpack Legacy", "GPU", "bandwidth"].tolist(); -cuSparse_sliced_ellpack_gpu_list = df[ "SlicedEllpack", "GPU", "bandwidth"].tolist(); -cuSparse_sliced_ellpack_legacy_gpu_list = df[ "SlicedEllpack Legacy", "GPU", "bandwidth"].tolist(); -cuSparse_chunked_ellpack_legacy_gpu_list = df[ "ChunkedEllpack Legacy", "GPU", "bandwidth"].tolist(); -cuSparse_chunked_ellpack_gpu_list = df[ "ChunkedEllpack", "GPU", "bandwidth"].tolist(); -cuSparse_bi_ellpack_legacy_gpu_list = df[ "BiEllpack Legacy", "GPU", "bandwidth"].tolist(); -cuSparse_bi_ellpack_gpu_list = df[ "BiEllpack", "GPU", "bandwidth"].tolist(); +cusparse_comparison = defaultdict( list ) +for gpu_format in gpu_matrix_formats: + cusparse_comparison[ gpu_format ] = df[ gpu_format, "GPU", "bandwidth" ].tolist() # sort by Ellpack -df.sort_values(by=["ellpack-bandwidth"],inplace=True,ascending=False) +df.sort_values(by=["Ellpack Bandwidth"],inplace=True,ascending=False) ellpack_gpu_list = df["Ellpack", "GPU", "bandwidth"].tolist(); ellpack_legacy_gpu_list = df["Ellpack Legacy", "GPU", "bandwidth"].tolist(); # sort by SlicedEllpack -df.sort_values(by=["sliced-ellpack-bandwidth"],inplace=True,ascending=False) -df.sort_values(by=["sliced-ellpack-bandwidth"],inplace=True,ascending=False) +df.sort_values(by=["SlicedEllpack Bandwidth"],inplace=True,ascending=False) sliced_ellpack_gpu_list = df["SlicedEllpack", "GPU", "bandwidth"].tolist(); sliced_ellpack_legacy_gpu_list = df["SlicedEllpack Legacy", "GPU", "bandwidth"].tolist(); # sort by ChunkedEllpack -df.sort_values(by=["chunked-ellpack-bandwidth"],inplace=True,ascending=False) -df.sort_values(by=["chunked-ellpack-bandwidth"],inplace=True,ascending=False) +df.sort_values(by=["ChunkedEllpack Bandwidth"],inplace=True,ascending=False) chunked_ellpack_gpu_list = df["ChunkedEllpack", "GPU", "bandwidth"].tolist(); chunked_ellpack_legacy_gpu_list = df["ChunkedEllpack Legacy", "GPU", "bandwidth"].tolist(); # sort by BiEllpack -df.sort_values(by=["bi-ellpack-bandwidth"],inplace=True,ascending=False) -df.sort_values(by=["bi-ellpack-bandwidth"],inplace=True,ascending=False) +df.sort_values(by=["BiEllpack Bandwidth"],inplace=True,ascending=False) bi_ellpack_gpu_list = df["BiEllpack", "GPU", "bandwidth"].tolist(); bi_ellpack_legacy_gpu_list = df["BiEllpack Legacy", "GPU", "bandwidth"].tolist(); print( "Writing gnuplot files..." ) -cuSparse_file = open( "cusparse.gplt", "w" ) -i = 0 -for x in cuSparse_list: - if str( x ) != "nan": - if ( str( cuSparse_csr_legacy_adaptive_gpu_list[ i ] ) != "nan" and - str( cuSparse_csr_legacy_light_gpu_list[ i ] ) != "nan" and - str( cuSparse_csr_legacy_light2_gpu_list[ i ] ) != "nan" and - str( cuSparse_csr_legacy_light3_gpu_list[ i ] ) != "nan" and - str( cuSparse_csr_legacy_light4_gpu_list[ i ] ) != "nan" and - str( cuSparse_csr_legacy_light5_gpu_list[ i ] ) != "nan" and - str( cuSparse_csr_legacy_light6_gpu_list[ i ] ) != "nan" and - str( cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ] ) != "nan" and - str( cuSparse_csr_legacy_scalar_gpu_list[ i ] ) != "nan" and - str( cuSparse_csr_legacy_vector_gpu_list[ i ] ) != "nan" and - str( cuSparse_csr_legacy_multivector_gpu_list[ i ] ) != "nan" and - str( cuSparse_ellpack_gpu_list[ i ] ) != "nan" and - str( cuSparse_ellpack_legacy_gpu_list[ i ] ) != "nan" and - str( cuSparse_sliced_ellpack_gpu_list[ i ] ) != "nan" and - str( cuSparse_sliced_ellpack_legacy_gpu_list[ i ] ) != "nan" and - str( cuSparse_chunked_ellpack_gpu_list[ i ] ) != "nan" and - str( cuSparse_chunked_ellpack_legacy_gpu_list[ i ] ) != "nan" and - str( cuSparse_bi_ellpack_gpu_list[ i ] ) != "nan" and - str( cuSparse_bi_ellpack_legacy_gpu_list[ i ] ) != "nan" ): - cuSparse_file.write( f"{i+1} {x} " ) # 1 2 - cuSparse_file.write( f"{cuSparse_csr_legacy_adaptive_gpu_list[ i ]} " ) # 3 - cuSparse_file.write( f"{cuSparse_csr_legacy_light_gpu_list[ i ]} " ) # 4 - cuSparse_file.write( f"{cuSparse_csr_legacy_light2_gpu_list[ i ]} " ) # 5 - cuSparse_file.write( f"{cuSparse_csr_legacy_light3_gpu_list[ i ]} " ) # 6 - cuSparse_file.write( f"{cuSparse_csr_legacy_light4_gpu_list[ i ]} " ) # 7 - cuSparse_file.write( f"{cuSparse_csr_legacy_light5_gpu_list[ i ]} " ) # 8 - cuSparse_file.write( f"{cuSparse_csr_legacy_light6_gpu_list[ i ]} " ) # 9 - cuSparse_file.write( f"{cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ]} " ) # 10 - cuSparse_file.write( f"{cuSparse_csr_legacy_scalar_gpu_list[ i ]} " ) # 11 - cuSparse_file.write( f"{cuSparse_csr_legacy_vector_gpu_list[ i ]} " ) # 12 - cuSparse_file.write( f"{cuSparse_csr_legacy_multivector_gpu_list[ i ]} " ) # 13 - cuSparse_file.write( f"{cuSparse_ellpack_gpu_list[ i ]} {cuSparse_ellpack_legacy_gpu_list[ i ]} " ) # 14 15 - cuSparse_file.write( f"{cuSparse_sliced_ellpack_gpu_list[ i ]} {cuSparse_sliced_ellpack_legacy_gpu_list[ i ]} " ) # 16 17 - cuSparse_file.write( f"{cuSparse_chunked_ellpack_gpu_list[ i ]} {cuSparse_chunked_ellpack_legacy_gpu_list[ i ]} " ) # 18 19 - cuSparse_file.write( f"{cuSparse_bi_ellpack_gpu_list[ i ]} {cuSparse_bi_ellpack_legacy_gpu_list[ i ]}\n" ) # 20 21 - i = i + 1 -cuSparse_file.close() +for gpu_format in gpu_matrix_formats: + filename = "cusparse-" + slugify( gpu_format ) + ".gplt" + data = cusparse_comparison[ gpu_format ] + print( "Writing to " + filename + "..." ); + out_file = open( filename, "w" ) + i = 0 + for x in cuSparse_list: + if str( x ) != "nan": + if ( str(cusparse_comparison[ gpu_format ][ i ] ) != "nan" ): + out_file.write( f"{i+1} {x} {data[ i ]} \n" ) + i = i + 1; + out_file.close() ellpack_file = open( "ellpack.gplt", "w" ) i = 0; @@ -268,97 +198,25 @@ bi_ellpack_file.close() print( "Generating Gnuplot file..." ) + gnuplot_file = open( "gnuplot.gplt", "w" ) -# NOTE: """...""" allows multi-line strings, r"..." disables backslash-escaping (so a single \ is just a \ in the output) gnuplot_file.write( r""" set terminal postscript lw 3 20 color set grid set xlabel 'Matrix' set xtics 250 set ylabel 'Bandwidth GB/sec' -set output 'csr-legacy-adaptive-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:3 title 'CSR Legacy Adaptive' with lines linewidth 0.5 lt rgb 'green', -set output 'csr-legacy-light-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:4 title 'CSR Legacy Light' with lines linewidth 0.5 lt rgb 'green', -set output 'csr-legacy-light2-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:5 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:5 title 'CSR Legacy Light2' with lines linewidth 0.5 lt rgb 'green', -set output 'csr-legacy-light3-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:6 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:6 title 'CSR Legacy Light3' with lines linewidth 0.5 lt rgb 'green', -set output 'csr-legacy-light4-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:7 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:7 title 'CSR Legacy Light4' with lines linewidth 0.5 lt rgb 'green', -set output 'csr-legacy-light5-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:8 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:8 title 'CSR Legacy Light5' with lines linewidth 0.5 lt rgb 'green', -set output 'csr-legacy-light6-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:9 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:9 title 'CSR Legacy Light6' with lines linewidth 0.5 lt rgb 'green', -set output 'csr-legacy-light-without-atomic-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:10 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:10 title 'CSR Legacy LightWithoutAtomic' with lines linewidth 0.5 lt rgb 'green', -set output 'csr-legacy-scalar-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:11 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:11 title 'CSR Legacy Scalar' with lines linewidth 0.5 lt rgb 'green', -set output 'csr-legacy-vector-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:12 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:12 title 'CSR Legacy Vector' with lines linewidth 0.5 lt rgb 'green', -set output 'csr-legacy-multivector-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:13 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:13 title 'CSR Legacy MultiVector' with lines linewidth 0.5 lt rgb 'green', -set output 'ellpack-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:14 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:14 title 'Ellpack' with lines linewidth 0.5 lt rgb 'green', \ - 'cusparse.gplt' using 1:15 title '' with dots linewidth 2 lt rgb 'blue', \ - 'cusparse.gplt' using 1:15 title 'Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -set output 'sliced-ellpack-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:16 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:16 title 'Sliced Ellpack' with lines linewidth 0.5 lt rgb 'green', \ - 'cusparse.gplt' using 1:17 title '' with dots linewidth 2 lt rgb 'blue', \ - 'cusparse.gplt' using 1:17 title 'Sliced Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -set output 'chunked-ellpack-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:18 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:18 title 'Chunked Ellpack' with lines linewidth 0.5 lt rgb 'green', \ - 'cusparse.gplt' using 1:19 title '' with dots linewidth 2 lt rgb 'blue', \ - 'cusparse.gplt' using 1:19 title 'Chunked Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -set output 'bi-ellpack-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:20 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:20 title 'BiEllpack' with lines linewidth 0.5 lt rgb 'green', \ - 'cusparse.gplt' using 1:21 title '' with dots linewidth 2 lt rgb 'blue', \ - 'cusparse.gplt' using 1:21 title 'BiEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue' +""" ) +for gpu_format in gpu_matrix_formats: + filename = "cusparse-" + slugify( gpu_format ) + ".gplt" + gnuplot_file.write( f"set output 'cusparse-vs-{slugify(gpu_format)}.eps' \n" ) + gnuplot_file.write( f"plot '{filename}' using 1:2 title '' with dots linewidth 2 lt rgb 'red', " ) + gnuplot_file.write( f" '{filename}' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', " ) + gnuplot_file.write( f" '{filename}' using 1:3 title '' with dots linewidth 2 lt rgb 'green', " ) + gnuplot_file.write( f" '{filename}' using 1:3 title '{gpu_format}' with lines linewidth 0.5 lt rgb 'green' \n" ) + + +gnuplot_file.write( r""" set output 'ellpack-vs-ellpack-legacy.eps' plot 'ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ 'ellpack.gplt' using 1:2 title 'Ellpack' with lines linewidth 0.5 lt rgb 'red', \ @@ -386,21 +244,10 @@ print( "Executing Gnuplot ..." ) os.system( "gnuplot gnuplot.gplt" ) print( "Converting files to PDF ..." ) -os.system( "epstopdf --autorotate All csr-legacy-adaptive-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All csr-legacy-light-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All csr-legacy-light2-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All csr-legacy-light3-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All csr-legacy-light4-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All csr-legacy-light5-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All csr-legacy-light6-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All csr-legacy-light-without-atomic-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All csr-legacy-scalar-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All csr-legacy-vector-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All csr-legacy-multivector-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All ellpack-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All sliced-ellpack-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All chunked-ellpack-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All bi-ellpack-vs-cusparse.eps" ) +for gpu_format in gpu_matrix_formats: + filename = "cusparse-vs-" + slugify( gpu_format ) + ".eps" + os.system( f"epstopdf --autorotate All {filename}" ) + os.system( "epstopdf --autorotate All ellpack-vs-ellpack-legacy.eps" ) os.system( "epstopdf --autorotate All sliced-ellpack-vs-sliced-ellpack-legacy.eps" ) os.system( "epstopdf --autorotate All chunked-ellpack-vs-chunked-ellpack-legacy.eps" ) -- GitLab From 33e016ad9949f68d6a40fbb75f5bc114ecdf3a7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 5 Aug 2020 15:06:45 +0200 Subject: [PATCH 52/57] Additional refactoring of Python script for SpMV benchmark results processing. --- .../scripts/tnl-spmv-benchmark-make-tables.py | 197 ++++++++---------- 1 file changed, 92 insertions(+), 105 deletions(-) diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py index 3d5ce16be..2af4b9ffc 100755 --- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py +++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py @@ -24,15 +24,31 @@ gpu_matrix_formats = [ 'CSR Legacy Scalar', 'CSR Legacy Vector', 'CSR Legacy Mul 'SlicedEllpack', 'SlicedEllpack Legacy', 'ChunkedEllpack', 'ChunkedEllpack Legacy', 'BiEllpack', 'BiEllpack Legacy' ] +""" +CPU formats to be compared +""" +cpu_comparison_formats = { 'CSR' : 'CSR Legacy Scalar', + 'Ellpack' : 'Ellpack Legacy', + 'SlicedEllpack' : 'SlicedEllpack Legacy', + 'BiEllpack' : 'BiEllpack Legacy' + } +""" +GPU formats to be compared +""" +gpu_comparison_formats = { #'CSR' : 'CSR Legacy Scalar', + 'Ellpack' : 'Ellpack Legacy', + 'SlicedEllpack' : 'SlicedEllpack Legacy', + 'BiEllpack' : 'BiEllpack Legacy' + } #pandas.options.display.float_format = "{:.2f}".format pandas.options.display.float_format = "{:.2e}".format pandas.options.display.width = 0 # auto-detect terminal width for formatting pandas.options.display.max_rows = None def slugify(s): - s = str(s).strip().replace(' ', '_') - return re.sub(r'(?u)[^-\w.]', '', s) + s = str(s).strip().replace(' ', '_') + return re.sub(r'(?u)[^-\w.]', '', s) def parse_file(fname): @@ -99,58 +115,52 @@ for gpu_format in gpu_matrix_formats: df[ gpu_format, "GPU", "cuSparse speedup"] = df[ gpu_format,"GPU", "time"] / df["cuSparse", "GPU", "time"] # Add speedup compared to legacy formats -df["CSR", "GPU", "Legacy speedup"] = df["CSR", "GPU", "time"] / df["CSR Legacy Scalar", "GPU", "time"] -df["CSR", "CPU", "Legacy speedup"] = df["CSR", "CPU", "time"] / df["CSR Legacy Scalar", "CPU", "time"] -df["Ellpack", "GPU", "Legacy speedup"] = df["Ellpack", "GPU", "time"] / df["Ellpack Legacy", "GPU", "time"] -df["Ellpack", "CPU", "Legacy speedup"] = df["Ellpack", "CPU", "time"] / df["Ellpack Legacy", "CPU", "time"] -df["SlicedEllpack", "GPU", "Legacy speedup"] = df["SlicedEllpack", "GPU", "time"] / df["SlicedEllpack Legacy", "GPU", "time"] -df["SlicedEllpack", "CPU", "Legacy speedup"] = df["SlicedEllpack", "CPU", "time"] / df["SlicedEllpack Legacy", "CPU", "time"] -df["BiEllpack", "GPU", "Legacy speedup"] = df["BiEllpack", "GPU", "time"] / df["BiEllpack Legacy", "GPU", "time"] -df["BiEllpack", "CPU", "Legacy speedup"] = df["BiEllpack", "CPU", "time"] / df["BiEllpack Legacy", "CPU", "time"] +for format in cpu_comparison_formats: + other_format = cpu_comparison_formats[ format ] + df[ format, "CPU", f"{other_format} speedup"] = df[ format, "CPU", "time"] / df[ other_format, "CPU", "time"] + +for format in gpu_comparison_formats: + other_format = gpu_comparison_formats[ format ] + df[ format, "GPU", f"{other_format} speedup"] = df[ format, "GPU", "time"] / df[ other_format, "GPU", "time"] print( "Exporting data frame to log.html..." ) pandas.options.display.float_format = '{:,.4f}'.format df.to_html("log.html") -# extract columns of reference formats on GPU +""" +Extract columns of reference formats on GPU +""" print( "Preparing data for graph analysis..." ) df['cuSparse-bandwidth' ] = df[ 'cuSparse','GPU','bandwidth'] for gpu_format in gpu_matrix_formats: df[ gpu_format + ' Bandwidth' ] = df[ gpu_format,'GPU','bandwidth'] -# sort by cuSparse +""" +Sort by cuSparse +""" df.sort_values(by=["cuSparse-bandwidth"],inplace=True,ascending=False) cuSparse_list = df['cuSparse-bandwidth'].tolist() cusparse_comparison = defaultdict( list ) for gpu_format in gpu_matrix_formats: cusparse_comparison[ gpu_format ] = df[ gpu_format, "GPU", "bandwidth" ].tolist() -# sort by Ellpack -df.sort_values(by=["Ellpack Bandwidth"],inplace=True,ascending=False) -ellpack_gpu_list = df["Ellpack", "GPU", "bandwidth"].tolist(); -ellpack_legacy_gpu_list = df["Ellpack Legacy", "GPU", "bandwidth"].tolist(); - -# sort by SlicedEllpack -df.sort_values(by=["SlicedEllpack Bandwidth"],inplace=True,ascending=False) -sliced_ellpack_gpu_list = df["SlicedEllpack", "GPU", "bandwidth"].tolist(); -sliced_ellpack_legacy_gpu_list = df["SlicedEllpack Legacy", "GPU", "bandwidth"].tolist(); - -# sort by ChunkedEllpack -df.sort_values(by=["ChunkedEllpack Bandwidth"],inplace=True,ascending=False) -chunked_ellpack_gpu_list = df["ChunkedEllpack", "GPU", "bandwidth"].tolist(); -chunked_ellpack_legacy_gpu_list = df["ChunkedEllpack Legacy", "GPU", "bandwidth"].tolist(); - -# sort by BiEllpack -df.sort_values(by=["BiEllpack Bandwidth"],inplace=True,ascending=False) -bi_ellpack_gpu_list = df["BiEllpack", "GPU", "bandwidth"].tolist(); -bi_ellpack_legacy_gpu_list = df["BiEllpack Legacy", "GPU", "bandwidth"].tolist(); +""" +Sort by comparison formats +""" +formats_comparison = defaultdict( list ) +for format in gpu_comparison_formats: + df.sort_values(by=[f"{format} Bandwidth"],inplace=True,ascending=False) + formats_comparison[ format ] = df[format, "GPU", "bandwidth"].tolist(); + formats_comparison[ gpu_comparison_formats[ format ] ] = df[gpu_comparison_formats[ format ], "GPU", "bandwidth"].tolist(); +""" +Writting gnuplot source files +""" print( "Writing gnuplot files..." ) for gpu_format in gpu_matrix_formats: filename = "cusparse-" + slugify( gpu_format ) + ".gplt" data = cusparse_comparison[ gpu_format ] - print( "Writing to " + filename + "..." ); out_file = open( filename, "w" ) i = 0 for x in cuSparse_list: @@ -160,44 +170,22 @@ for gpu_format in gpu_matrix_formats: i = i + 1; out_file.close() -ellpack_file = open( "ellpack.gplt", "w" ) -i = 0; -for x in ellpack_gpu_list: - if str( x ) != "nan": - if str( ellpack_legacy_gpu_list[ i ] ) != "nan": - ellpack_file.write( f"{i+1} {x} {ellpack_legacy_gpu_list[ i ]}\n" ) - i = i + 1 -ellpack_file.close() - -sliced_ellpack_file = open( "sliced-ellpack.gplt", "w" ) -i = 0; -for x in sliced_ellpack_gpu_list: - if str( x ) != "nan": - if str( sliced_ellpack_legacy_gpu_list[ i ] ) != "nan": - sliced_ellpack_file.write( f"{i+1} {x} {sliced_ellpack_legacy_gpu_list[ i ]}\n" ) - i = i + 1 -sliced_ellpack_file.close() - -chunked_ellpack_file = open( "chunked-ellpack.gplt", "w" ) -i = 0; -for x in chunked_ellpack_gpu_list: - if str( x ) != "nan": - if str( chunked_ellpack_legacy_gpu_list[ i ] ) != "nan": - chunked_ellpack_file.write( f"{i+1} {x} {chunked_ellpack_legacy_gpu_list[ i ]}\n" ) - i = i + 1 -chunked_ellpack_file.close() - -bi_ellpack_file = open( "bi-ellpack.gplt", "w" ) -i = 0; -for x in bi_ellpack_gpu_list: - if str( x ) != "nan": - if str( bi_ellpack_legacy_gpu_list[ i ] ) != "nan": - bi_ellpack_file.write( f"{i+1} {x} {bi_ellpack_legacy_gpu_list[ i ]}\n" ) - i = i + 1 -bi_ellpack_file.close() - -print( "Generating Gnuplot file..." ) +for format in gpu_comparison_formats: + out_file = open( f"{slugify(format)}-gpu-comparison.gplt", "w" ) + data = formats_comparison[ format ] + other_data = formats_comparison[ gpu_comparison_formats[ format ] ] + i = 0 + for x in data: + if str( x ) != "nan": + if str( other_data[ i ] ) != "nan": + out_file.write( f"{i+1} {x} {other_data[ i ]}\n" ) + i = i + 1 + out_file.close() +""" +Generating gnuplot script +""" +print( "Generating Gnuplot script..." ) gnuplot_file = open( "gnuplot.gplt", "w" ) gnuplot_file.write( r""" @@ -216,51 +204,50 @@ for gpu_format in gpu_matrix_formats: gnuplot_file.write( f" '{filename}' using 1:3 title '{gpu_format}' with lines linewidth 0.5 lt rgb 'green' \n" ) -gnuplot_file.write( r""" -set output 'ellpack-vs-ellpack-legacy.eps' -plot 'ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'ellpack.gplt' using 1:2 title 'Ellpack' with lines linewidth 0.5 lt rgb 'red', \ - 'ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue', \ - 'ellpack.gplt' using 1:3 title 'Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -set output 'sliced-ellpack-vs-sliced-ellpack-legacy.eps' -plot 'sliced-ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'sliced-ellpack.gplt' using 1:2 title 'SlicedEllpack' with lines linewidth 0.5 lt rgb 'red', \ - 'sliced-ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue', \ - 'sliced-ellpack.gplt' using 1:3 title 'SlicedEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -set output 'chunked-ellpack-vs-chunked-ellpack-legacy.eps' -plot 'chunked-ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'chunked-ellpack.gplt' using 1:2 title 'ChunkedEllpack' with lines linewidth 0.5 lt rgb 'red', \ - 'chunked-ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue', \ - 'chunked-ellpack.gplt' using 1:3 title 'ChunkedEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -set output 'bi-ellpack-vs-bi-ellpack-legacy.eps' -plot 'bi-ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'bi-ellpack.gplt' using 1:2 title 'BiEllpack' with lines linewidth 0.5 lt rgb 'red', \ - 'bi-ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue', \ - 'bi-ellpack.gplt' using 1:3 title 'BiEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -""") +for format in gpu_comparison_formats: + filename = f"{slugify(format)}-gpu-comparison.gplt" + data = formats_comparison[ format ] + other_data = formats_comparison[ gpu_comparison_formats[ format ] ] + gnuplot_file.write( f"set output '{slugify(format)}-vs-{slugify(gpu_comparison_formats[ format ])}.eps' \n" ) + gnuplot_file.write( f"plot '{filename}' using 1:2 title '' with dots linewidth 2 lt rgb 'red', " ) + gnuplot_file.write( f" '{filename}' using 1:2 title '{format}' with lines linewidth 0.5 lt rgb 'red'," ) + gnuplot_file.write( f" '{filename}' using 1:3 title '' with dots linewidth 2 lt rgb 'blue', " ) + gnuplot_file.write( f" '{filename}' using 1:3 title '{gpu_comparison_formats[ format ]}' with lines linewidth 0.5 lt rgb 'blue' \n" ) + gnuplot_file.close() +""" +Executing Gnuplot +""" + print( "Executing Gnuplot ..." ) os.system( "gnuplot gnuplot.gplt" ) +""" +Converting files to PDF +""" print( "Converting files to PDF ..." ) for gpu_format in gpu_matrix_formats: filename = "cusparse-vs-" + slugify( gpu_format ) + ".eps" os.system( f"epstopdf --autorotate All {filename}" ) -os.system( "epstopdf --autorotate All ellpack-vs-ellpack-legacy.eps" ) -os.system( "epstopdf --autorotate All sliced-ellpack-vs-sliced-ellpack-legacy.eps" ) -os.system( "epstopdf --autorotate All chunked-ellpack-vs-chunked-ellpack-legacy.eps" ) -os.system( "epstopdf --autorotate All bi-ellpack-vs-bi-ellpack-legacy.eps" ) +for format in gpu_comparison_formats: + filename = slugify(format) + "-vs-" + slugify(gpu_comparison_formats[ format ]) + ".eps" + os.system( f"epstopdf --autorotate All {filename}" ) +""" +Deleting temporary files +""" print( "Deleting temprary files..." ) -#os.system( "rm cusparse.gplt" ) -#os.system( "rm ellpack.gplt" ) -#os.system( "rm sliced-ellpack.gplt" ) -#os.system( "rm gnuplot.gplt" ) -#os.system( "rm ellpack-vs-cusparse.eps" ) -#os.system( "rm sliced-ellpack-vs-cusparse.eps" ) -#os.system( "rm chunked-ellpack-vs-cusparse.eps" ) -#os.system( "rm bi-ellpack-vs-cusparse.eps" ) -#os.system( "rm ellpack-vs-ellpack-legacy.eps" ) -#os.system( "rm sliced-ellpack-vs-sliced-ellpack-legacy.eps" ) +for gpu_format in gpu_matrix_formats: + filename = "cusparse-" + slugify( gpu_format ) + ".gplt" + os.system( f"rm {filename}" ) + filename = "cusparse-vs-" + slugify( gpu_format ) + ".eps" + os.system( f"rm {filename}" ) + +for format in gpu_comparison_formats: + filename = f"{slugify(format)}-gpu-comparison.gplt" + os.system( f"rm {filename}" ) + filename = slugify(format) + "-vs-" + slugify(gpu_comparison_formats[ format ]) + ".eps" + os.system( f"rm {filename}" ) +os.system( "rm gnuplot.gplt" ) -- GitLab From 5606f2a0c17eae52d76eb5e0a88f990d83c2a06e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 5 Aug 2020 18:56:30 +0200 Subject: [PATCH 53/57] Fix for Clang compatibility. --- src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp index 9709dd895..df6f4441a 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp @@ -1402,7 +1402,7 @@ void test_VectorProductCSRAdaptive() if( std::is_same< DeviceType, TNL::Devices::Cuda >::value ) { - typedef typename Matrix::Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType; + typedef typename Matrix::template Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType; typename HostMatrixType::CompressedRowLengthsVector rowLengths( 100, 100 ); HostMatrixType hostMatrix; hostMatrix.setDimensions( m_rows, m_cols ); @@ -1440,7 +1440,7 @@ void test_VectorProductCSRAdaptive() if( std::is_same< DeviceType, TNL::Devices::Cuda >::value ) { - typedef typename Matrix::Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType; + typedef typename Matrix::template Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType; typename HostMatrixType::CompressedRowLengthsVector rowLengths( {m_cols} ); HostMatrixType hostMatrix; hostMatrix.setDimensions( m_rows, m_cols ); -- GitLab From e884519ed65fb87207932fcf030792949d8dfdad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 5 Aug 2020 19:43:16 +0200 Subject: [PATCH 54/57] Deleted unused symmetric legacy sparse matrices. --- src/TNL/Matrices/Legacy/BiEllpackSymmetric.h | 184 -- .../Matrices/Legacy/BiEllpackSymmetric_impl.h | 1637 ----------------- src/TNL/Matrices/Legacy/EllpackSymmetric.h | 190 -- .../Matrices/Legacy/EllpackSymmetricGraph.h | 212 --- .../Legacy/EllpackSymmetricGraph_impl.h | 1044 ----------- .../Matrices/Legacy/EllpackSymmetric_impl.h | 833 --------- .../Matrices/Legacy/SlicedEllpackSymmetric.h | 210 --- .../Legacy/SlicedEllpackSymmetricGraph.h | 242 --- .../Legacy/SlicedEllpackSymmetricGraph_impl.h | 1316 ------------- .../Legacy/SlicedEllpackSymmetric_impl.h | 930 ---------- 10 files changed, 6798 deletions(-) delete mode 100644 src/TNL/Matrices/Legacy/BiEllpackSymmetric.h delete mode 100644 src/TNL/Matrices/Legacy/BiEllpackSymmetric_impl.h delete mode 100644 src/TNL/Matrices/Legacy/EllpackSymmetric.h delete mode 100644 src/TNL/Matrices/Legacy/EllpackSymmetricGraph.h delete mode 100644 src/TNL/Matrices/Legacy/EllpackSymmetricGraph_impl.h delete mode 100644 src/TNL/Matrices/Legacy/EllpackSymmetric_impl.h delete mode 100644 src/TNL/Matrices/Legacy/SlicedEllpackSymmetric.h delete mode 100644 src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph.h delete mode 100644 src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph_impl.h delete mode 100644 src/TNL/Matrices/Legacy/SlicedEllpackSymmetric_impl.h diff --git a/src/TNL/Matrices/Legacy/BiEllpackSymmetric.h b/src/TNL/Matrices/Legacy/BiEllpackSymmetric.h deleted file mode 100644 index 09fe7c4e5..000000000 --- a/src/TNL/Matrices/Legacy/BiEllpackSymmetric.h +++ /dev/null @@ -1,184 +0,0 @@ -/*************************************************************************** - BiEllpackSymmetric.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Device > -class BiEllpackSymmetricDeviceDependentCode; - -template< typename Real, typename Device = Devices::Cuda, typename Index = int, int StripSize = 32 > -class BiEllpackSymmetric : public Sparse< Real, Device, Index > -{ -public: - typedef Real RealType; - typedef Device DeviceType; - typedef Index IndexType; - typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView; - typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; - - template< typename _Real = Real, - typename _Device = Device, - typename _Index = Index > - using Self = BiEllpackSymmetric< _Real, _Device, _Index >; - - BiEllpackSymmetric(); - - void setDimensions( const IndexType rows, const IndexType columns ); - - void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ); - - IndexType getRowLength( const IndexType row ) const; - - template< typename Real2, - typename Device2, - typename Index2 > - bool setLike( const BiEllpackSymmetric< Real2, Device2, Index2, StripSize >& matrix ); - - void getRowLengths( Containers::Vector< IndexType, DeviceType, IndexType >& rowLengths ) const; - - bool setElement( const IndexType row, - const IndexType column, - const RealType& value ); - - __cuda_callable__ - bool setElementFast( const IndexType row, - const IndexType column, - const RealType& value ); - - bool addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - __cuda_callable__ - bool addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - bool setRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements ); - - bool addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - RealType getElement( const IndexType row, - const IndexType column ) const; - - __cuda_callable__ - RealType getElementFast( const IndexType row, - const IndexType column ) const; - - void getRow( const IndexType row, - IndexType* columns, - RealType* values ) const; - - __cuda_callable__ - IndexType getGroupLength( const IndexType strip, - const IndexType group ) const; - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const; - - template< typename InVector, - typename OutVector > - void vectorProductHost( const InVector& inVector, - OutVector& outVector ) const; - - void setVirtualRows(const IndexType rows); - - __cuda_callable__ - IndexType getNumberOfGroups( const IndexType row ) const; - - bool vectorProductTest() const; - - void reset(); - - void save( File& file ) const; - - void load( File& file ); - - void save( const String& fileName ) const; - - void load( const String& fileName ); - - void print( std::ostream& str ) const; - - void performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths ); - void computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths ); - -// void verifyRowLengths( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ); - - template< typename InVector, - typename OutVector > -#ifdef HAVE_CUDA - __device__ -#endif - void spmvCuda( const InVector& inVector, - OutVector& outVector, - /*const IndexType warpStart, - const IndexType inWarpIdx*/ - int globalIdx ) const; - - __cuda_callable__ - IndexType getStripLength( const IndexType strip ) const; - - __cuda_callable__ - void performRowBubbleSortCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, - const IndexType strip ); - - __cuda_callable__ - void computeColumnSizesCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, - const IndexType numberOfStrips, - const IndexType strip ); - - __cuda_callable__ - IndexType power( const IndexType number, - const IndexType exponent ) const; - - typedef BiEllpackSymmetricDeviceDependentCode< DeviceType > DeviceDependentCode; - friend class BiEllpackSymmetricDeviceDependentCode< DeviceType >; - -private: - - IndexType warpSize; - - IndexType logWarpSize; - - IndexType virtualRows; - - Containers::Vector< Index, Device, Index > rowPermArray; - - Containers::Vector< Index, Device, Index > groupPointers; - -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL - -#include - diff --git a/src/TNL/Matrices/Legacy/BiEllpackSymmetric_impl.h b/src/TNL/Matrices/Legacy/BiEllpackSymmetric_impl.h deleted file mode 100644 index 61dde6334..000000000 --- a/src/TNL/Matrices/Legacy/BiEllpackSymmetric_impl.h +++ /dev/null @@ -1,1637 +0,0 @@ -/*************************************************************************** - BiEllpackSymmetric.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Real, - typename Device, - typename Index, - int StripSize > - __cuda_callable__ -Index BiEllpackSymmetric< Real, Device, Index, StripSize >::power( const IndexType number, - const IndexType exponent ) const -{ - if( exponent >= 0 ) - { - IndexType result = 1; - for( IndexType i = 0; i < exponent; i++ ) - result *= number; - return result; - } - return 0; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -BiEllpackSymmetric< Real, Device, Index, StripSize >::BiEllpackSymmetric() -: warpSize( 32 ), - logWarpSize( 5 ) -{} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -String BiEllpackSymmetric< Real, Device, Index, StripSize >::getType() -{ - return String( "Matrices::BiEllpackMatrix< ") + - String( TNL::getType< Real >() ) + - String( ", " ) + - String( Device :: getDeviceType() ) + - String( ", " ) + - String( TNL::getType< Index >() ) + - String( " >" ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -String BiEllpackSymmetric< Real, Device, Index, StripSize >::getTypeVirtual() const -{ - return this->getType(); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::setDimensions( const IndexType rows, - const IndexType columns ) -{ - TNL_ASSERT( rows >= 0 && columns >= 0, - std::cerr << "rows = " << rows - << "columns = " << columns << std::endl ); - - if( this->getRows() % this->warpSize != 0 ) - this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) ); - else - this->setVirtualRows( this->getRows() ); - IndexType strips = this->virtualRows / this->warpSize; - - Sparse< Real, Device, Index >::setDimensions( rows, columns ); - this->rowPermArray.setSize( this->rows ); - this->groupPointers.setSize( strips * ( this->logWarpSize + 1 ) + 1 ); - - for( IndexType row = 0; row < this->getRows(); row++ ) - this->rowPermArray.setElement(row, row); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) -{ - if( this->getRows() % this->warpSize != 0 ) - this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) ); - else - this->setVirtualRows( this->getRows() ); - IndexType strips = this->virtualRows / this->warpSize; - this->rowPermArray.setSize( this->rows ); - this->groupPointers.setSize( strips * ( this->logWarpSize + 1 ) + 1 ); - for( IndexType i = 0; i < this->groupPointers.getSize(); i++ ) - this->groupPointers.setElement( i, 0 ); - - // FIXME: cannot sort a const vector! - //DeviceDependentCode::performRowBubbleSort( *this, rowLengths ); - //DeviceDependentCode::computeColumnSizes( *this, rowLengths ); - - this->groupPointers.computeExclusivePrefixSum(); - - // uncomment to perform structure test - //DeviceDependentCode::verifyRowPerm( *this, rowLengths ); - //DeviceDependentCode::verifyRowLengths( *this, rowLengths ); - - this->allocateMatrixElements( this->warpSize * this->groupPointers.getElement( strips * ( this->logWarpSize + 1 ) ) ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -__cuda_callable__ -Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getStripLength( const IndexType strip ) const -{ - TNL_ASSERT( strip >= 0, - std::cerr << "strip = " << strip - << " this->getName() = " << std::endl ); - - return this->groupPointers.getElement( ( strip + 1 ) * ( this->logWarpSize + 1 ) ) - - this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -__cuda_callable__ -Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getNumberOfGroups( const IndexType row ) const -{ - TNL_ASSERT( row >=0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << std::endl ); - - IndexType strip = row / this->warpSize; - IndexType rowStripPermutation = this->rowPermArray[ row ] - this->warpSize * strip; - IndexType numberOfGroups = this->logWarpSize + 1; - IndexType bisection = 1; - for( IndexType i = 0; i < this->logWarpSize + 1; i++ ) - { - if( rowStripPermutation < bisection ) - return ( numberOfGroups - i ); - bisection *= 2; - } - // FIXME: non-void function always has to return something sensible -#ifndef __CUDA_ARCH__ - throw "bug - row was not found"; -#else - TNL_ASSERT_TRUE( false, "bug - row was not found" ); -#endif -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getRowLength( const IndexType row ) const -{ - TNL_ASSERT( row >= 0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << std::endl ); - - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - IndexType rowLength = 0; - - for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ ) - { - for( IndexType i = 0; i < rowMultiplicator * this->getGroupLength( strip, group ); i++ ) - { - if( this->values.getElement( elementPtr ) == 0.0 ) - return rowLength; - else - rowLength++; - elementPtr += step; - } - rowMultiplicator *= 2; - step /= 2; - } - return rowLength; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > - template< typename Real2, - typename Device2, - typename Index2 > -bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setLike( const BiEllpackSymmetric< Real2, Device2, Index2, StripSize >& matrix ) -{ - std::cout << "setLike" << std::endl; - std::cout << "settingLike" << std::endl; - if( ! Sparse< Real, Device, Index >::setLike( matrix ) || - ! this->rowPermArray.setLike( matrix.rowPermArray ) || - ! this->groupPointers.setLike( matrix.groupPointers ) ) - return false; - return true; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::getRowLengths( Containers::Vector< IndexType, DeviceType, IndexType >& rowLengths) const -{ - for( IndexType row = 0; row < this->getRows(); row++ ) - rowLengths.setElement( row, this->getRowLength( row ) ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setElement( const IndexType row, - const IndexType column, - const RealType& value ) -{ - TNL_ASSERT( ( row >= 0 && row < this->getRows() ) || - ( column >= 0 && column < this->getColumns() ), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getColumns() = " << this->getColumns() - << " this->getName() = " << std::endl ); - - return this->addElement( row, column, value, 0.0 ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -__cuda_callable__ -bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setElementFast( const IndexType row, - const IndexType column, - const RealType& value ) -{ - TNL_ASSERT( ( row >= 0 && row < this->getRows() ) || - ( column >= 0 && column < this->getColumns() ), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getColumns() = " << this->getColumns() - << " this->getName() = " << this->getName() <addElementFast( row, column, value, 0.0 ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - - for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ ) - { - for( IndexType i = 0; i < rowMultiplicator * this->getGroupLength( strip, group ); i++ ) - { - if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() ) - { - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; - } - if( this->columnIndexes.getElement( elementPtr ) == column ) - { - this->values.setElement( elementPtr, this->values.getElement( elementPtr ) + value * thisElementMultiplicator ); - return true; - } - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } - return false; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -__cuda_callable__ -bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray[ row ] - strip * this->warpSize; - IndexType elementPtr = this->groupPointers[ groupBegin ] * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - - IndexType numberOfGroups = this->logWarpSize + 1; - IndexType bisection = 1; - for( IndexType i = 0; i < this->logWarpSize + 1; i++ ) - { - if( rowStripPerm < bisection ) - { - numberOfGroups -= i; - break; - } - bisection *= 2; - } - - for( IndexType group = 0; group < numberOfGroups; group++ ) - { - IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - - for( IndexType i = 0; i < rowMultiplicator * groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] == this->getPaddingIndex() ) - { - this->columnIndexes[ elementPtr ] = column ; - this->values[ elementPtr ] = value; - return true; - } - if( this->columnIndexes[ elementPtr ] == column ) - { - this->values[ elementPtr ] += value * thisElementMultiplicator ; - return true; - } - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } - return false; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements ) -{ - TNL_ASSERT( row >= 0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << std::endl ); - - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; - IndexType thisElementPtr = 0; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - - for( IndexType group = 0; ( group < this->getNumberOfGroups( row ) ) && ( thisElementPtr < numberOfElements ); group++ ) - { - for( IndexType i = 0; ( i < rowMultiplicator * this->getGroupLength( strip, group ) ) && ( thisElementPtr < numberOfElements ); i++ ) - { - this->columnIndexes.setElement( elementPtr, columns[ thisElementPtr ] ); - this->values.setElement( elementPtr, values[ thisElementPtr ] ); - thisElementPtr++; - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } - if( thisElementPtr == numberOfElements ) - return true; - return false; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - TNL_ASSERT( row >=0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << std::endl ); - - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - this->warpSize * strip; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - IndexType thisElementPtr = 0; - - while( thisElementPtr < numberOfElements ) - { - for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ ) - { - for( IndexType i = 0; ( i < rowMultiplicator * this->getGroupLength( strip, group ) ) && ( thisElementPtr < numberOfElements ); i++ ) - { - if( this->columnIndexes.getElement( elementPtr ) == columns[ thisElementPtr ] ) - { - RealType result = this->values.getElement( elementPtr ) + values[ thisElementPtr ] * thisElementMultiplicator; - this->values.setElement( elementPtr, result ); - thisElementPtr++; - } - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } - } - return ( thisElementPtr == numberOfElements ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -Real BiEllpackSymmetric< Real, Device, Index, StripSize >::getElement( const IndexType row, - const IndexType column ) const -{ - TNL_ASSERT( ( row >= 0 && row < this->getRows() ) || - ( column >= 0 && column < this->getColumns() ), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getColumns() = " << this->getColumns() - << "this->getName() = " << std::endl ); - - if( row > column ) - return this->getElement( column, row ); - - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - - for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ ) - { - for( IndexType i = 0; i < rowMultiplicator * this->getGroupLength( strip, group ); i++ ) - { - if( this->columnIndexes.getElement( elementPtr ) == column ) - return this->values.getElement( elementPtr ); - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } - return 0.0; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -__cuda_callable__ -Real BiEllpackSymmetric< Real, Device, Index, StripSize >::getElementFast( const IndexType row, - const IndexType column ) const -{ - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray[ row ] - strip * this->warpSize; - IndexType elementPtr = this->groupPointers[ groupBegin ] * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - - IndexType numberOfGroups = this->logWarpSize + 1; - IndexType bisection = 1; - for( IndexType i = 0; i < this->logWarpSize + 1; i++ ) - { - if( rowStripPerm < bisection ) - { - numberOfGroups -= i; - break; - } - bisection *= 2; - } - - for( IndexType group = 0; group < numberOfGroups; group++ ) - { - IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - - for( IndexType i = 0; i < rowMultiplicator * groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] == column ) - return this->values[ elementPtr ]; - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } - return false; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::getRow( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - TNL_ASSERT( row >=0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << this->getName() <warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - this->warpSize * strip; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - IndexType thisElementPtr = 0; - - for( IndexType group = 0; group < this->getNumberOfGroups( row ) && !padding; group++ ) - { - for( IndexType i = 0; ( i < rowMultiplicator * this->getGroupLength( strip, group ) ) && !padding; i++ ) - { - if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() ) - { - padding = true; - break; - } - values[ thisElementPtr ] = this->values.getElement( elementPtr ); - columns[ thisElementPtr ] = this->columnIndexes.getElement( elementPtr ); - thisElementPtr++; - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::setVirtualRows(const IndexType rows) -{ - this->virtualRows = rows; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -__cuda_callable__ -Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getGroupLength( const Index strip, - const Index group ) const -{ - return this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -template< typename InVector, - typename OutVector > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::vectorProduct( const InVector& inVector, - OutVector& outVector ) const -{ - DeviceDependentCode::vectorProduct( *this, inVector, outVector ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -template< typename InVector, - typename OutVector > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::vectorProductHost( const InVector& inVector, - OutVector& outVector ) const -{ - const IndexType cudaBlockSize = 256; - const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize ); - for( IndexType blockIdx = 0; blockIdx < cudaBlocks; blockIdx++ ) - { - Containers::Vector< Real, Device, Index > tempStripOutVector; - tempStripOutVector.setSize( cudaBlockSize ); - for( IndexType i = 0; i < tempStripOutVector.getSize(); i++ ) - tempStripOutVector.setElement( i, 0 ); - - for( IndexType threadIdx = 0; threadIdx < cudaBlockSize; threadIdx++ ) - { - IndexType globalIdx = cudaBlockSize * blockIdx + threadIdx; - IndexType warpStart = this->warpSize * ( globalIdx / this->warpSize ); - IndexType inWarpIdx = globalIdx % this->warpSize; - if( warpStart >= this->getRows() ) - break; - IndexType strip = warpStart / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - - IndexType row = warpStart + inWarpIdx; - IndexType currentRow = row; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + ( row - warpStart ); - IndexType bisection = this->warpSize; - for( IndexType group = 0; group < this->logWarpSize + 1; group++ ) - { - if( !( currentRow - warpStart < bisection ) ) - currentRow -= bisection; - IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() ) - { - elementPtr += this->warpSize; - continue; - } - RealType result = tempStripOutVector.getElement( currentRow % cudaBlockSize ); - result += inVector[ this->columnIndexes.getElement( elementPtr ) ] * this->values.getElement( elementPtr ); - outVector[ this->columnIndexes[ elementPtr ] ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - tempStripOutVector.setElement( currentRow % cudaBlockSize, result ); - elementPtr += this->warpSize; - } - bisection /= 2; - } - } - IndexType end = cudaBlockSize * ( blockIdx + 1 ); - if( end > this->getRows() ) - end = this->getRows(); - for( IndexType i = cudaBlockSize * blockIdx; i < end; i++ ) - outVector[ i ] = tempStripOutVector.getElement( this->rowPermArray.getElement( i ) % cudaBlockSize ); - tempStripOutVector.reset(); - } -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::reset() -{ - Sparse< Real, Device, Index >::reset(); - this->rowPermArray.reset(); - this->groupPointers.reset(); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::save( File& file ) const -{ - Sparse< Real, Device, Index >::save( file ); - file << this->groupPointers << this->rowPermArray; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::load( File& file ) -{ - Sparse< Real, Device, Index >::load( file ); - file >> this->groupPointers >> this->rowPermArray; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::save( const String& fileName ) const -{ - Object::save( fileName ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::load( const String& fileName ) -{ - Object::load( fileName ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::print( std::ostream& str ) const -{ - for( IndexType row = 0; row < this->getRows(); row++ ) - { - str <<"Row: " << row << " -> "; - bool padding = false; - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - this->warpSize * strip; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - - for( IndexType group = 0; group < this->getNumberOfGroups( row ) && !padding; group++ ) - { - for( IndexType i = 0; ( i < rowMultiplicator * this->getGroupLength( strip, group ) ) && !padding; i++ ) - { - if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() ) - { - padding = true; - break; - } - RealType value = this->values.getElement( elementPtr ); - IndexType column = this->columnIndexes.getElement( elementPtr ); - str << " Col:" << column << "->" << value << "\t"; - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } - str < -void BiEllpackSymmetric< Real, Device, Index, StripSize >::performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths ) -{ - Index strips = this->virtualRows / this->warpSize; - for( Index i = 0; i < strips; i++ ) - { - Index begin = i * this->warpSize; - Index end = ( i + 1 ) * this->warpSize - 1; - if( this->getRows() - 1 < end) - end = this->getRows() - 1; - bool sorted = false; - Index permIndex1, permIndex2, offset = 0; - while( !sorted ) - { - sorted = true; - for( Index j = begin + offset; j < end - offset; j++ ) - if( tempRowLengths.getElement( j ) < tempRowLengths.getElement( j + 1 ) ) - { - for( Index k = begin; k < end + 1; k++ ) - { - if( this->rowPermArray.getElement( k ) == j ) - permIndex1 = k; - if( this->rowPermArray.getElement( k ) == j + 1 ) - permIndex2 = k; - } - Index temp = tempRowLengths.getElement( j ); - tempRowLengths.setElement( j, tempRowLengths.getElement( j + 1 ) ); - tempRowLengths.setElement( j + 1, temp ); - temp = this->rowPermArray.getElement( permIndex1 ); - this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) ); - this->rowPermArray.setElement( permIndex2, temp ); - sorted = false; - } - for( Index j = end - 1 - offset; j > begin + offset; j-- ) - if( tempRowLengths.getElement( j ) > tempRowLengths.getElement( j - 1 ) ) - { - for( Index k = begin; k < end + 1; k++ ) - { - if( this->rowPermArray.getElement( k ) == j ) - permIndex1 = k; - if( this->rowPermArray.getElement( k ) == j - 1 ) - permIndex2 = k; - } - Index temp = tempRowLengths.getElement( j ); - tempRowLengths.setElement( j, tempRowLengths.getElement( j - 1 ) ); - tempRowLengths.setElement( j - 1, temp ); - temp = this->rowPermArray.getElement( permIndex1 ); - this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) ); - this->rowPermArray.setElement( permIndex2, temp ); - sorted = false; - } - offset++; - } - } -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths ) -{ - Index numberOfStrips = this->virtualRows / this->warpSize; - for( Index strip = 0; strip < numberOfStrips; strip++ ) - { - Index i = 0; - Index rowBegin = strip * this->warpSize; - Index groupBegin = strip * ( this->logWarpSize + 1 ); - Index emptyGroups = 0; - if( strip == numberOfStrips - 1 ) - { - Index lastRows = this->getRows() - rowBegin; - while( !( lastRows > this->power( 2, this->logWarpSize - 1 - emptyGroups ) ) ) - emptyGroups++; - for( Index group = groupBegin; group < groupBegin + emptyGroups; group++ ) - this->groupPointers.setElement( group, 0 ); - } - i += emptyGroups; - for( Index group = groupBegin + emptyGroups; group < groupBegin + this->logWarpSize; group++ ) - { - Index row = this->power( 2, 4 - i ); - Index temp = tempRowLengths.getElement( row + rowBegin ); - for( Index prevGroups = groupBegin; prevGroups < group; prevGroups++ ) - temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers.getElement( prevGroups ); - temp = ceil( ( float ) temp / this->power( 2, i ) ); - this->groupPointers.setElement( group, temp ); - i++; - } - Index temp = tempRowLengths.getElement( rowBegin ); - for( Index prevGroups = groupBegin; prevGroups < groupBegin + this->logWarpSize; prevGroups++ ) - temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers.getElement( prevGroups ); - temp = ceil( ( float ) temp / this->power( 2, this->logWarpSize ) ); - this->groupPointers.setElement( groupBegin + this->logWarpSize, temp ); - } -} - -template<> -class BiEllpackSymmetricDeviceDependentCode< Devices::Host > -{ -public: - - typedef Devices::Host Device; - - template< typename Real, - typename Index, - int StripSize > - static void verifyRowLengths( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ) - { - bool ok = true; - for( Index row = 0; row < matrix.getRows(); row++ ) - { - const Index strip = row / matrix.warpSize; - const Index stripLength = matrix.getStripLength( strip ); - const Index groupBegin = ( matrix.logWarpSize + 1 ) * strip; - const Index rowStripPerm = matrix.rowPermArray.getElement( row ) - strip * matrix.warpSize; - const Index begin = matrix.groupPointers.getElement( groupBegin ) * matrix.warpSize + rowStripPerm * stripLength; - Index elementPtr = begin; - Index rowLength = 0; - for( Index group = 0; group < matrix.getNumberOfGroups( row ); group++ ) - { - for( Index i = 0; i < matrix.getGroupLength( strip, group ); i++ ) - { - Index biElementPtr = elementPtr; - for( Index j = 0; j < matrix.power( 2, group ); j++ ) - { - rowLength++; - biElementPtr += matrix.power( 2, matrix.logWarpSize - group ) * stripLength; - } - elementPtr++; - } - } - if( rowLengths.getElement( row ) > rowLength ) - ok = false; - } - if( ok ) - std::cout << "row lengths OK" < - static void verifyRowPerm( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ) - { - bool ok = true; - Index numberOfStrips = matrix.virtualRows / matrix.warpSize; - for( Index strip = 0; strip < numberOfStrips; strip++ ) - { - Index begin = strip * matrix.warpSize; - Index end = ( strip + 1 ) * matrix.warpSize; - if( matrix.getRows() < end ) - end = matrix.getRows(); - for( Index i = begin; i < end - 1; i++ ) - { - Index permIndex1, permIndex2; - bool first = false; - bool second = false; - for( Index j = begin; j < end; j++ ) - { - if( matrix.rowPermArray.getElement( j ) == i ) - { - permIndex1 = j; - first = true; - } - if( matrix.rowPermArray.getElement( j ) == i + 1 ) - { - permIndex2 = j; - second = true; - } - } - if( !first || !second ) - std::cout << "Wrong permutation!" <= rowLengths.getElement( permIndex2 ) ) - continue; - else - ok = false; - } - } - if( ok ) - std::cout << "Permutation OK" < - static void vectorProduct( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const InVector& inVector, - OutVector& outVector ) - { - matrix.vectorProductHost( inVector, outVector ); - } - - template< typename Real, - typename Index, - int StripSize > - static void computeColumnSizes( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ) - { - Index numberOfStrips = matrix.virtualRows / matrix.warpSize; - for( Index strip = 0; strip < numberOfStrips; strip++ ) - { - Index i = 0; - Index rowBegin = strip * matrix.warpSize; - Index groupBegin = strip * ( matrix.logWarpSize + 1 ); - Index emptyGroups = 0; - if( strip == numberOfStrips - 1 ) - { - Index lastRows = matrix.getRows() - rowBegin; - while( !( lastRows > matrix.power( 2, matrix.logWarpSize - 1 - emptyGroups ) ) ) - emptyGroups++; - for( Index group = groupBegin; group < groupBegin + emptyGroups; group++ ) - matrix.groupPointers.setElement( group, 0 ); - } - i += emptyGroups; - for( Index group = groupBegin + emptyGroups; group < groupBegin + matrix.logWarpSize; group++ ) - { - Index row = matrix.power( 2, 4 - i ); - Index permRow = 0; - while( matrix.rowPermArray.getElement( permRow + rowBegin ) != row + rowBegin ) - permRow++; - Index temp = rowLengths.getElement( permRow + rowBegin ); - for( Index prevGroups = groupBegin; prevGroups < group; prevGroups++ ) - temp -= matrix.power( 2, prevGroups - groupBegin ) * matrix.groupPointers.getElement( prevGroups ); - temp = ceil( ( float ) temp / matrix.power( 2, i ) ); - matrix.groupPointers.setElement( group, temp ); - i++; - } - Index permRow = rowBegin; - while( matrix.rowPermArray.getElement( permRow ) != rowBegin ) - permRow++; - Index temp = rowLengths.getElement( permRow ); - for( Index prevGroups = groupBegin; prevGroups < groupBegin + matrix.logWarpSize; prevGroups++ ) - temp -= matrix.power( 2, prevGroups - groupBegin ) * matrix.groupPointers.getElement( prevGroups ); - temp = ceil( ( float ) temp / matrix.power( 2, matrix.logWarpSize ) ); - matrix.groupPointers.setElement( groupBegin + matrix.logWarpSize, temp ); - } - } - - template< typename Real, - typename Index, - int StripSize > - static void performRowBubbleSort( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths - /*Containers::Vector< Index, Device, Index >& tempRowLengths*/ ) - { - Index strips = matrix.virtualRows / matrix.warpSize; - for( Index i = 0; i < strips; i++ ) - { - Index begin = i * matrix.warpSize; - Index end = ( i + 1 ) * matrix.warpSize - 1; - if(matrix.getRows() - 1 < end) - end = matrix.getRows() - 1; - bool sorted = false; - Index permIndex1, permIndex2, offset = 0; - while( !sorted ) - { - sorted = true; - for( Index j = begin + offset; j < end - offset; j++ ) - { - for( Index k = begin; k < end + 1; k++ ) - { - if( matrix.rowPermArray.getElement( k ) == j ) - permIndex1 = k; - if( matrix.rowPermArray.getElement( k ) == j + 1 ) - permIndex2 = k; - } - if( rowLengths.getElement( permIndex1 ) < rowLengths.getElement( permIndex2 ) ) - { - Index temp = matrix.rowPermArray.getElement( permIndex1 ); - matrix.rowPermArray.setElement( permIndex1, matrix.rowPermArray.getElement( permIndex2 ) ); - matrix.rowPermArray.setElement( permIndex2, temp ); - sorted = false; - } - } - for( Index j = end - 1 - offset; j > begin + offset; j-- ) - { - for( Index k = begin; k < end + 1; k++ ) - { - if( matrix.rowPermArray.getElement( k ) == j ) - permIndex1 = k; - if( matrix.rowPermArray.getElement( k ) == j - 1 ) - permIndex2 = k; - } - if( rowLengths.getElement( permIndex2 ) < rowLengths.getElement( permIndex1 ) ) - { - Index temp = matrix.rowPermArray.getElement( permIndex1 ); - matrix.rowPermArray.setElement( permIndex1, matrix.rowPermArray.getElement( permIndex2 ) ); - matrix.rowPermArray.setElement( permIndex2, temp ); - sorted = false; - } - } - offset++; - } - } - } -}; - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int StripSize > -template< typename InVector, - typename OutVector > -__device__ -void BiEllpackSymmetric< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector, - OutVector& outVector, - int globalIdx ) const -{ - const IndexType strip = globalIdx >> this->logWarpSize; - const IndexType warpStart = strip << this->logWarpSize; - const IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); - - if( warpStart >= this->getRows() ) - return; - - const IndexType cudaBlockSize = 256; - IndexType bisection = this->warpSize; - IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - - Real* temp = Cuda::getSharedMemory< Real >(); - __shared__ Real results[ cudaBlockSize ]; - results[ threadIdx.x ] = 0.0; - IndexType elementPtr = ( this->groupPointers[ groupBegin ] << this->logWarpSize ) + inWarpIdx; - - for( IndexType group = 0; group < this->logWarpSize + 1; group++ ) - { - temp[ threadIdx.x ] = 0.0; - IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - outVector.add( this->columnIndexes[ elementPtr ], inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ] ); - elementPtr += this->warpSize; - } - IndexType bisection2 = this->warpSize; - for( IndexType i = 0; i < group; i++ ) - { - bisection2 >>= 1; - if( inWarpIdx < bisection2 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + bisection2 ]; - } - if( inWarpIdx < bisection ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - bisection >>= 1; - } - __syncthreads(); - if( warpStart + inWarpIdx >= this->getRows() ) - return; - outVector[ warpStart + inWarpIdx ] = results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( cudaBlockSize - 1 ) ]; -} -#endif - -/*#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int StripSize > -template< typename InVector, - typename OutVector > -__device__ -void BiEllpackSymmetric< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector, - OutVector& outVector, - int globalIdx ) const -{ - // Loop unrolling test - const IndexType strip = globalIdx >> this->logWarpSize; - const IndexType warpStart = strip << this->logWarpSize; - const IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); - - if( warpStart >= this->getRows() ) - return; - - const IndexType cudaBlockSize = 256; - - volatile Real* temp = getSharedMemory< Real >(); - __shared__ Real results[ cudaBlockSize ]; - results[ threadIdx.x ] = 0.0; - IndexType elementPtr = ( this->groupPointers[ strip * ( this->logWarpSize + 1 ) ] << this->logWarpSize ) + inWarpIdx; - - //Loop Unroll #1 - IndexType group = 0; - IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - results[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - } - - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #2 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 16 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - - - //group == 2; - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #3 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 8 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ]; - if( inWarpIdx < 8 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - - //group == 3; - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #4 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 8 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ]; - if( inWarpIdx < 4 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ]; - if( inWarpIdx < 4 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - - //group == 4; - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #5 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 8 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ]; - if( inWarpIdx < 4 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ]; - if( inWarpIdx < 2 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 2 ]; - if( inWarpIdx < 2 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - - //group == 5 - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #6 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 8 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ]; - if( inWarpIdx < 4 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ]; - if( inWarpIdx < 2 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 2 ]; - if( inWarpIdx < 1 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 1 ]; - if( inWarpIdx < 1 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - - if( warpStart + inWarpIdx >= this->getRows() ) - return; - outVector[ warpStart + inWarpIdx ] = results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( cudaBlockSize - 1 ) ]; -} -#endif*/ - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int StripSize, - typename InVector, - typename OutVector > -__global__ -void BiEllpackSymmetricVectorProductCuda( const BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >* matrix, - const InVector* inVector, - OutVector* outVector, - int gridIdx, - const int warpSize ) -{ - Index globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - matrix->spmvCuda( *inVector, *outVector, globalIdx ); -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int StripSize > -__device__ -void BiEllpackSymmetric< Real, Device, Index, StripSize >::performRowBubbleSortCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, - const IndexType strip ) -{ - IndexType begin = strip * this->warpSize; - IndexType end = ( strip + 1 ) * this->warpSize - 1; - if( this->getRows() - 1 < end ) - end = this->getRows() - 1; - bool sorted = false; - IndexType permIndex1, permIndex2, offset = 0; - while( !sorted ) - { - sorted = true; - for( IndexType j = begin + offset; j < end - offset; j++ ) - { - for( IndexType k = begin; k < end + 1; k++) - { - if( this->rowPermArray[ k ] == j ) - permIndex1 = k; - if( this->rowPermArray[ k ] == j + 1 ) - permIndex2 = k; - } - if( rowLengths[ permIndex1 ] < rowLengths[ permIndex2 ] ) - { - IndexType temp = this->rowPermArray[ permIndex1 ]; - this->rowPermArray[ permIndex1 ] = this->rowPermArray[ permIndex2 ]; - this->rowPermArray[ permIndex2 ] = temp; - sorted = false; - } - } - for( IndexType j = end - 1 - offset; j > begin + offset; j-- ) - { - for( IndexType k = begin; k < end + 1; k++ ) - { - if( this->rowPermArray[ k ] == j ) - permIndex1 = k; - if( this->rowPermArray[ k ] == j - 1) - permIndex2 = k; - } - if( rowLengths[ permIndex2 ] < rowLengths[ permIndex1 ] ) - { - IndexType temp = this->rowPermArray[ permIndex1 ]; - this->rowPermArray[ permIndex1 ] = this->rowPermArray[ permIndex2 ]; - this->rowPermArray[ permIndex2 ] = temp; - sorted = false; - } - } - offset++; - } -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int StripSize > -__device__ -void BiEllpackSymmetric< Real, Device, Index, StripSize >::computeColumnSizesCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, - const IndexType numberOfStrips, - const IndexType strip ) -{ - if( strip >= numberOfStrips ) - return; - IndexType i = 0; - IndexType rowBegin = strip * this->warpSize; - IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - IndexType emptyGroups = 0; - if( strip == numberOfStrips - 1 ) - { - IndexType lastRows = this->getRows() - rowBegin; - while( !( lastRows > this->power( 2, this->logWarpSize - 1 - emptyGroups ) ) ) - emptyGroups++; - for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ ) - this->groupPointers[ group ] = 0; - } - i += emptyGroups; - for( IndexType group = groupBegin + emptyGroups; group < groupBegin + this->logWarpSize; group++ ) - { - IndexType row = this->power( 2, 4 - i ); - IndexType permRow = 0; - while( this->rowPermArray[ permRow + rowBegin ] != row + rowBegin && permRow < this->warpSize ) - permRow++; - IndexType temp = rowLengths[ permRow + rowBegin ]; - for( IndexType prevGroups = groupBegin; prevGroups < group; prevGroups++ ) - temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers[ prevGroups ]; - temp = ceil( ( float ) temp / this->power( 2, i ) ); - this->groupPointers[ group ] = temp; - i++; - } - IndexType permRow = rowBegin; - while( this->rowPermArray[ permRow ] != rowBegin && permRow < this->warpSize + rowBegin ) - permRow++; - IndexType temp = rowLengths[ permRow ]; - for( IndexType prevGroups = groupBegin; prevGroups < groupBegin + this->logWarpSize; prevGroups++ ) - temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers[ prevGroups ]; - temp = ceil( ( float ) temp / this->power( 2, this->logWarpSize ) ); - this->groupPointers[ groupBegin + this->logWarpSize ] = temp; -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int StripSize > -__global__ -void performRowBubbleSortCuda( BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >* matrix, - const typename BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >::RowLengthsVector* rowLengths, - int gridIdx ) -{ - const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; - matrix->performRowBubbleSortCudaKernel( *rowLengths, stripIdx ); -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int StripSize > -__global__ -void computeColumnSizesCuda( BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >* matrix, - const typename BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >::RowLengthsVector* rowLengths, - const Index numberOfStrips, - int gridIdx ) -{ - const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; - matrix->computeColumnSizesCudaKernel( *rowLengths, numberOfStrips, stripIdx ); -} -#endif - -template<> -class BiEllpackSymmetricDeviceDependentCode< Devices::Cuda > -{ -public: - - typedef Devices::Cuda Device; - - template< typename Real, - typename Index, - int StripSize > - static void verifyRowLengths( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ) - { - bool ok = true; - std::cout << "inside method" < rowLength ) - ok = false; - } - if( ok ) - std::cout << "row lengths OK" < - static void verifyRowPerm( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ) - { - bool ok = true; - Index numberOfStrips = matrix.virtualRows / matrix.warpSize; - for( Index strip = 0; strip < numberOfStrips; strip++ ) - { - Index begin = strip * matrix.warpSize; - Index end = ( strip + 1 ) * matrix.warpSize; - if( matrix.getRows() < end ) - end = matrix.getRows(); - for( Index i = begin; i < end - 1; i++ ) - { - Index permIndex1, permIndex2; - bool first = false; - bool second = false; - for( Index j = begin; j < end; j++ ) - { - if( matrix.rowPermArray.getElement( j ) == i ) - { - permIndex1 = j; - first = true; - } - if( matrix.rowPermArray.getElement( j ) == i + 1 ) - { - permIndex2 = j; - second = true; - } - } - if( !first || !second ) - std::cout << "nenasel jsem spravne indexy" <= rowLengths.getElement( permIndex2 ) ) - continue; - else - ok = false; - } - } - if( ok ) - std::cout << "perm OK" < - static void performRowBubbleSort( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ) - { -#ifdef HAVE_CUDA - Index numberOfStrips = matrix.virtualRows / StripSize; - typedef BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize > Matrix; - typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - performRowBubbleSortCuda< Real, Index, StripSize > - <<< cudaGridSize, cudaBlockSize >>> - ( kernel_this, - kernel_rowLengths, - gridIdx ); - } - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_rowLengths ); - TNL_CHECK_CUDA_DEVICE; -#endif - } - - template< typename Real, - typename Index, - int StripSize > - static void computeColumnSizes( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ) - { -#ifdef HAVE_CUDA - const Index numberOfStrips = matrix.virtualRows / StripSize; - typedef BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize > Matrix; - typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - computeColumnSizesCuda< Real, Index, StripSize > - <<< cudaGridSize, cudaBlockSize >>> - ( kernel_this, - kernel_rowLengths, - numberOfStrips, - gridIdx ); - } - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_rowLengths ); - TNL_CHECK_CUDA_DEVICE; -#endif - } - - - template< typename Real, - typename Index, - int StripSize, - typename InVector, - typename OutVector > - static void vectorProduct( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const InVector& inVector, - OutVector& outVector ) - { -#ifdef HAVE_CUDA - typedef BiEllpackSymmetric< Real, Devices::Cuda, Index > Matrix; - typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - const int sharedMemory = cudaBlockSize.x * sizeof( Real ); - BiEllpackSymmetricVectorProductCuda< Real, Index, StripSize, InVector, OutVector > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx, - matrix.warpSize ); - } - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_inVector ); - Cuda::freeFromDevice( kernel_outVector ); - TNL_CHECK_CUDA_DEVICE; -#endif - } - -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetric.h b/src/TNL/Matrices/Legacy/EllpackSymmetric.h deleted file mode 100644 index af3c2e4a8..000000000 --- a/src/TNL/Matrices/Legacy/EllpackSymmetric.h +++ /dev/null @@ -1,190 +0,0 @@ -/*************************************************************************** - EllpackSymmetric.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Device > -class EllpackSymmetricDeviceDependentCode; - -template< typename Real, typename Device = Devices::Host, typename Index = int > -class EllpackSymmetric : public Sparse< Real, Device, Index > -{ - public: - - typedef Real RealType; - typedef Device DeviceType; - typedef Index IndexType; - typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView; - typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; - - template< typename _Real = Real, - typename _Device = Device, - typename _Index = Index > - using Self = EllpackSymmetric< _Real, _Device, _Index >; - - EllpackSymmetric(); - - void setDimensions( const IndexType rows, - const IndexType columns ); - - void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ); - - bool setConstantRowLengths( const IndexType& rowLengths ); - - IndexType getRowLength( const IndexType row ) const; - - template< typename Real2, typename Device2, typename Index2 > - bool setLike( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ); - - void reset(); - - template< typename Real2, typename Device2, typename Index2 > - bool operator == ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const; - - template< typename Real2, typename Device2, typename Index2 > - bool operator != ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const; - - /*template< typename Matrix > - bool copyFrom( const Matrix& matrix, - const CompressedRowLengthsVector& rowLengths );*/ - - __cuda_callable__ - bool setElementFast( const IndexType row, - const IndexType column, - const RealType& value ); - - bool setElement( const IndexType row, - const IndexType column, - const RealType& value ); - - __cuda_callable__ - bool addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - - __cuda_callable__ - bool setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - bool setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - - __cuda_callable__ - bool addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - __cuda_callable__ - RealType getElementFast( const IndexType row, - const IndexType column ) const; - - RealType getElement( const IndexType row, - const IndexType column ) const; - - __cuda_callable__ - void getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const; - - void getRow( const IndexType row, - IndexType* columns, - RealType* values ) const; - - template< typename Vector > - __cuda_callable__ - typename Vector::RealType rowVectorProduct( const IndexType row, - const Vector& vector ) const; - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const; - - template< typename InVector, - typename OutVector > - void vectorProductHost( const InVector& inVector, - OutVector& outVector ) const; - - template< typename Real2, typename Index2 > - void addMatrix( const EllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator = 1.0, - const RealType& thisMatrixMultiplicator = 1.0 ); - - template< typename Real2, typename Index2 > - void getTransposition( const EllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator = 1.0 ); - - template< typename Vector > - bool performSORIteration( const Vector& b, - const IndexType row, - Vector& x, - const RealType& omega = 1.0 ) const; - - void save( File& file ) const; - - void load( File& file ); - - void save( const String& fileName ) const; - - void load( const String& fileName ); - - void print( std::ostream& str ) const; - - template< typename InVector, - typename OutVector > - __cuda_callable__ - void spmvCuda( const InVector& inVector, - OutVector& outVector, - int rowIdx ) const; - - protected: - - void allocateElements(); - - IndexType rowLengths, alignedRows; - - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DeviceDependentCode; - friend class EllpackSymmetricDeviceDependentCode< DeviceType >; -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL - -#include diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph.h b/src/TNL/Matrices/Legacy/EllpackSymmetricGraph.h deleted file mode 100644 index dd42b7f26..000000000 --- a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph.h +++ /dev/null @@ -1,212 +0,0 @@ -/*************************************************************************** - EllpackSymmetricGraph.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Device > -class EllpackSymmetricGraphDeviceDependentCode; - -template< typename Real, typename Device = Devices::Host, typename Index = int > -class EllpackSymmetricGraph : public Sparse< Real, Device, Index > -{ - public: - - typedef Real RealType; - typedef Device DeviceType; - typedef Index IndexType; - typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView; - typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; - - template< typename _Real = Real, - typename _Device = Device, - typename _Index = Index > - using Self = EllpackSymmetricGraph< _Real, _Device, _Index >; - - EllpackSymmetricGraph(); - - void setDimensions( const IndexType rows, - const IndexType columns ); - - void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ); - - bool setConstantRowLengths( const IndexType& rowLengths ); - - IndexType getRowLength( const IndexType row ) const; - - template< typename Real2, typename Device2, typename Index2 > - bool setLike( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ); - - void reset(); - - //template< typename Real2, typename Device2, typename Index2 > - //bool operator == ( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const; - - //template< typename Real2, typename Device2, typename Index2 > - //bool operator != ( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const; - - /*template< typename Matrix > - bool copyFrom( const Matrix& matrix, - const CompressedRowLengthsVector& rowLengths );*/ - - __cuda_callable__ - bool setElementFast( const IndexType row, - const IndexType column, - const RealType& value ); - - bool setElement( const IndexType row, - const IndexType column, - const RealType& value ); - - __cuda_callable__ - bool addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - - __cuda_callable__ - bool setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - bool setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - - __cuda_callable__ - bool addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - __cuda_callable__ - RealType getElementFast( const IndexType row, - const IndexType column ) const; - - RealType getElement( const IndexType row, - const IndexType column ) const; - - __cuda_callable__ - void getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const; - - void getRow( const IndexType row, - IndexType* columns, - RealType* values ) const; - - template< typename Vector > - __cuda_callable__ - typename Vector::RealType rowVectorProduct( const IndexType row, - const Vector& vector ) const; - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const; - - template< typename InVector, - typename OutVector > - void vectorProductHost( const InVector& inVector, - OutVector& outVector ) const; - -#ifdef HAVE_CUDA - template< typename InVector, - typename OutVector > - __cuda_callable__ - void spmvCuda( const InVector& inVector, - OutVector& outVector, - const int globalIdx, - const int color ) const; -#endif - - void computePermutationArray(); - - bool rearrangeMatrix( bool verbose ); - - void save( File& file ) const; - - void load( File& file ); - - void save( const String& fileName ) const; - - void load( const String& fileName ); - - void print( std::ostream& str ) const; - - bool help( bool verbose = false ); - - void verifyPermutationArray(); - - __cuda_callable__ - Index getRowLengthsInt() const; - - __cuda_callable__ - Index getAlignedRows() const; - - __cuda_callable__ - Index getRowsOfColor( IndexType color ) const; - - void copyFromHostToCuda( EllpackSymmetricGraph< Real, Devices::Host, Index >& matrix ); - - __cuda_callable__ - Containers::Vector< Index, Device, Index >& getPermutationArray(); - - __cuda_callable__ - Containers::Vector< Index, Device, Index >& getInversePermutation(); - - __cuda_callable__ - Containers::Vector< Index, Device, Index >& getColorPointers(); - - protected: - - void allocateElements(); - - IndexType rowLengths, alignedRows; - - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DeviceDependentCode; - friend class EllpackSymmetricGraphDeviceDependentCode< DeviceType >; - - Containers::Vector< Index, Device, Index > permutationArray; - Containers::Vector< Index, Device, Index > inversePermutationArray; - Containers::Vector< Index, Device, Index > colorPointers; - bool rearranged; -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL - - -#include diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph_impl.h b/src/TNL/Matrices/Legacy/EllpackSymmetricGraph_impl.h deleted file mode 100644 index 6f5419196..000000000 --- a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph_impl.h +++ /dev/null @@ -1,1044 +0,0 @@ -/*************************************************************************** - EllpackSymmetricGraph_impl.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Real, - typename Device, - typename Index > -EllpackSymmetricGraph< Real, Device, Index > :: EllpackSymmetricGraph() -: rowLengths( 0 ), alignedRows( 0 ), rearranged( false ) -{ -}; - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -Index EllpackSymmetricGraph< Real, Device, Index >::getRowLengthsInt() const -{ - return this->rowLengths; -} - -template< typename Real, - typename Device, - typename Index > -Index EllpackSymmetricGraph< Real, Device, Index >::getAlignedRows() const -{ - return this->alignedRows; -} - -template< typename Real, - typename Device, - typename Index > -String EllpackSymmetricGraph< Real, Device, Index > :: getType() -{ - return String( "Matrices::EllpackSymmetricGraph< ") + - String( TNL::getType< Real >() ) + - String( ", " ) + - String( Device::getDeviceType() ) + - String( ", " ) + - String( TNL::getType< Index >() ) + - String( " >" ); -} - -template< typename Real, - typename Device, - typename Index > -String EllpackSymmetricGraph< Real, Device, Index >::getTypeVirtual() const -{ - return this->getType(); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::setDimensions( const IndexType rows, - const IndexType columns ) -{ - TNL_ASSERT( rows > 0 && columns > 0, - std::cerr << "rows = " << rows - << " columns = " << columns << std::endl ); - - this->rows = rows; - this->columns = columns; - - if( std::is_same< DeviceType, Devices::Cuda >::value ) - { - this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() ); - - if( this->rows - this->alignedRows > 0 ) - { - IndexType missingRows = this->rows - this->alignedRows; - missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() ); - this->alignedRows += missingRows; - -// this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() ); - } - } - else this->alignedRows = rows; - - if( this->rowLengths != 0 ) - allocateElements(); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) -{ - TNL_ASSERT( this->getRows() > 0, ); - TNL_ASSERT( this->getColumns() > 0, ); - //TNL_ASSERT( this->rowLengths > 0, - // std::cerr << "this->rowLengths = " << this->rowLengths ); - this->rowLengths = this->maxRowLength = max( rowLengths ); - this->permutationArray.setSize( this->getRows() ); - for( IndexType i = 0; i < this->getRows(); i++ ) - this->permutationArray.setElement( i, i ); - allocateElements(); -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -Index EllpackSymmetricGraph< Real, Device, Index >::getRowsOfColor( IndexType color ) const -{ - return this->colorPointers[ color + 1 ] - this->colorPointers[ color ]; -} - -/* -template< typename Real, - typename Device, - typename Index > -#ifdef HAVE_CUDA -__device__ __host__ -#endif -void EllpackSymmetricGraph< Real, Device, Index >::computeColorsVector( Containers::Vector< Index, Device, Index >& colorsVector ) -{ - this->numberOfColors = 0; - - for( IndexType i = this->getRows() - 1; i >= 0; i-- ) - { - // init color array - Containers::Vector< Index, Device, Index > usedColors; - usedColors.setSize( this->numberOfColors ); - for( IndexType j = 0; j < this->numberOfColors; j++ ) - usedColors.setElement( j, 0 ); - - // find all colors used in given row - - // optimization: - // load the whole row in sparse format - // traverse it while don't hit the padding index or end of the row - // for each nonzero element write -> usedColors.setElement( colorsVector.getElement( column ), 1 ) - IndexType* columns = new IndexType[ this->getRowLength( i ) ]; - RealType* values = new RealType[ this->getRowLength( i ) ]; - this->getRow( i, columns, values ); - for( IndexType j = 0; j < this->getRowLength( i ); j++ ) - { - // we are only interested in symmetric part of the matrix - if( columns[ j ] < i + 1 ) - continue; - - // if we hit padding index, there is no reason to continue iterations - if( columns[ j ] == this->getPaddingIndex() ) - break; - - usedColors.setElement( colorsVector.getElement( columns[ j ] ), 1 ); - } - delete [] columns; - delete [] values; - - - //for( IndexType j = i + 1; j < this->getColumns(); j++ ) - // if( this->getElement( i, j ) != 0.0 ) - // usedColors.setElement( colorsVector.getElement( j ), 1 ); - - // find unused color - bool found = false; - for( IndexType j = 0; j < this->numberOfColors; j++ ) - if( usedColors.getElement( j ) == 0 ) - { - colorsVector.setElement( i, j ); - found = true; - break; - } - if( !found ) - { - colorsVector.setElement( i, this->numberOfColors ); - this->numberOfColors++; - } - } -} -*/ - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::computePermutationArray() -{ - // init vector of colors and permutation array - Containers::Vector< Index, Device, Index > colorsVector; - colorsVector.setSize( this->getRows() ); - for( IndexType i = 0; i < this->getRows(); i++ ) - { - colorsVector.setElement( i, 0 ); - } - - // compute colors for each row - Matrix< Real, Device, Index >::computeColorsVector( colorsVector ); - - // init color pointers - this->colorPointers.setSize( this->getNumberOfColors() + 1 ); - - // compute permutation - IndexType position = 0; - for( IndexType color = 0; color < this->getNumberOfColors(); color++ ) - { - this->colorPointers.setElement( color, position ); - for (IndexType i = 0; i < this->getRows(); i++) - if ( colorsVector.getElement( i ) == color) - { - IndexType row1 = this->permutationArray.getElement( i ); - IndexType row2 = this->permutationArray.getElement( position ); - IndexType tmp = this->permutationArray.getElement( row1 ); - this->permutationArray.setElement( row1, this->permutationArray.getElement( row2 ) ); - this->permutationArray.setElement( row2, tmp ); - - tmp = colorsVector.getElement( position ); - colorsVector.setElement( position, colorsVector.getElement( i ) ); - colorsVector.setElement( i, tmp ); - position++; - } - } - - this->colorPointers.setElement( this->getNumberOfColors(), this->getRows() ); - - // destroy colors vector - colorsVector.reset(); - - this->inversePermutationArray.setSize( this->getRows() ); - for( IndexType row = 0; row < this->getRows(); row++ ) - this->inversePermutationArray.setElement( this->permutationArray.getElement( row ), row ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::verifyPermutationArray() -{ - for( IndexType i = 0; i < this->getRows(); i++ ) - if( this->permutationArray.getElement( i ) >= this->getRows() ) - { - std::cerr << "There is wrong data in permutationArray position " << i << std::endl; - break; - } -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetricGraph< Real, Device, Index >::rearrangeMatrix( bool verbose ) -{ - // first we need to know permutation - this->computePermutationArray(); - if( verbose ) - this->verifyPermutationArray(); - - // then we need to create new matrix - Containers::Vector< Real, Device, Index > valuesVector; - Containers::Vector< Index, Device, Index > columnsVector; - valuesVector.setSize( this->values.getSize() ); - columnsVector.setSize( this->columnIndexes.getSize() ); - valuesVector.setValue( 0.0 ); - columnsVector.setValue( this->getPaddingIndex() ); - - for( IndexType row = 0; row < this->getRows(); row++ ) - { - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtrOrig = DDCType::getRowBegin( *this, row ); - IndexType elementPtrNew = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) ); - IndexType rowEnd = DDCType::getRowEnd( *this, row ); - IndexType step = DDCType::getElementStep( *this ); - - for( IndexType i = 0; i < this->rowLengths; i++ ) - { - if( this->columnIndexes.getElement( elementPtrOrig ) <= row ) - { - valuesVector.setElement(elementPtrNew, this->values.getElement(elementPtrOrig)); - columnsVector.setElement(elementPtrNew, this->columnIndexes.getElement(elementPtrOrig)); - elementPtrNew += step; - } - elementPtrOrig += step; - } - } - - // reset original matrix - this->values.reset(); - this->columnIndexes.reset(); - - // deep copy new matrix - this->values.setSize( valuesVector.getSize() ); - this->columnIndexes.setSize( columnsVector.getSize() ); - this->values = valuesVector; - this->columnIndexes = columnsVector; - - // clear memory - valuesVector.reset(); - columnsVector.reset(); - - this->rearranged = true; - return true; -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -Containers::Vector< Index, Device, Index >& -EllpackSymmetricGraph< Real, Device, Index >::getPermutationArray() -{ - return this->permutationArray; -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -Containers::Vector< Index, Device, Index >& -EllpackSymmetricGraph< Real, Device, Index >::getInversePermutation() -{ - return this->inversePermutationArray; -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -Containers::Vector< Index, Device, Index >& -EllpackSymmetricGraph< Real, Device, Index >::getColorPointers() -{ - return this->colorPointers; -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::copyFromHostToCuda( EllpackSymmetricGraph< Real, Devices::Host, Index >& matrix ) -{ - // TODO: fix - //Sparse< Real, Device, Index >::copyFromHostToCuda( matrix ); - - this->rearranged = true; - this->rowLengths = matrix.getRowLengthsInt(); - this->alignedRows = matrix.getAlignedRows(); - Containers::Vector< Index, Devices::Host, Index >& colorPointers = matrix.getColorPointers(); - this->colorPointers.setSize( colorPointers.getSize() ); - for( IndexType i = 0; i < colorPointers.getSize(); i++ ) - this->colorPointers.setElement( i, colorPointers[ i ] ); - - Containers::Vector< Index,Devices::Host, Index >& permutationArray = matrix.getPermutationArray(); - this->permutationArray.setSize( permutationArray.getSize() ); - for( IndexType i = 0; i < permutationArray.getSize(); i++ ) - this->permutationArray.setElement( i, permutationArray[ i ] ); - - Containers::Vector< Index, Devices::Host, Index >& inversePermutation = matrix.getInversePermutation(); - this->inversePermutationArray.setSize( inversePermutation.getSize() ); - for( IndexType i = 0; i < inversePermutation.getSize(); i++ ) - this->inversePermutationArray.setElement( i, inversePermutation[ i ] ); - - for( IndexType i = 0; i < this->getRows(); i++ ) - for( IndexType j = 0; j <= i; j++ ) - if( matrix.getElement( i, j ) != 0.0 ) - this->setElementFast( i, j, matrix.getElement( i, j ) ); - - colorPointers.reset(); - permutationArray.reset(); -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetricGraph< Real, Device, Index >::setConstantRowLengths( const IndexType& rowLengths ) -{ - TNL_ASSERT( rowLengths > 0, std::cerr << " rowLengths = " << rowLengths ); - this->rowLengths = rowLengths; - if( this->rows > 0 ) - allocateElements(); - return true; -} - -template< typename Real, - typename Device, - typename Index > -Index EllpackSymmetricGraph< Real, Device, Index >::getRowLength( const IndexType row ) const -{ - return this->rowLengths; -} - -template< typename Real, - typename Device, - typename Index > - template< typename Real2, - typename Device2, - typename Index2 > -bool EllpackSymmetricGraph< Real, Device, Index >::setLike( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) -{ - if( ! Sparse< Real, Device, Index >::setLike( matrix ) || - ! this->permutationArray.setLike( matrix.permutationArray ) || - ! this->colorPointers.setLike( matrix.colorPointers ) ) - return false; - this->rowLengths = matrix.rowLengths; - this->numberOfColors = matrix.getNumberOfColors(); - return true; -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index > :: reset() -{ - Sparse< Real, Device, Index >::reset(); - this->permutationArray.reset(); - this->colorPointers.reset(); - this->rowLengths = 0; -} - -/*template< typename Real, - typename Device, - typename Index > - template< typename Matrix > -bool EllpackSymmetricGraph< Real, Device, Index >::copyFrom( const Matrix& matrix, - const CompressedRowLengthsVector& rowLengths ) -{ - return tnlMatrix< RealType, DeviceType, IndexType >::copyFrom( matrix, rowLengths ); -}*/ - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetricGraph< Real, Device, Index > :: setElementFast( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElementFast( row, column, value, 0.0 ); -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetricGraph< Real, Device, Index > :: setElement( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElement( row, column, value, 0.0 ); -} - - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetricGraph< Real, Device, Index > :: addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType i = DDCType::getRowBegin( *this, this->permutationArray[ row ] ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] ); - const IndexType step = DDCType::getElementStep( *this ); - - while( i < rowEnd && - this->columnIndexes[ i ] < column && - this->columnIndexes[ i ] != this->getPaddingIndex() ) i += step; - if( i == rowEnd ) - return false; - if( this->columnIndexes[ i ] == column ) - { - this->values[ i ] = thisElementMultiplicator * this->values[ i ] + value; - return true; - } - else - if( this->columnIndexes[ i ] == this->getPaddingIndex() ) // artificial zero - { - this->columnIndexes[ i ] = column; - this->values[ i ] = value; - } - else - { - Index j = rowEnd - step; - while( j > i ) - { - this->columnIndexes[ j ] = this->columnIndexes[ j - step ]; - this->values[ j ] = this->values[ j - step ]; - j -= step; - } - this->columnIndexes[ i ] = column; - this->values[ i ] = value; - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetricGraph< Real, Device, Index > :: addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType i = DDCType::getRowBegin( *this, this->permutationArray[ row ] ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] ); - const IndexType step = DDCType::getElementStep( *this ); - - while( i < rowEnd && - this->columnIndexes.getElement( i ) < column && - this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) i += step; - if( i == rowEnd ) - return false; - if( this->columnIndexes.getElement( i ) == column ) - { - this->values.setElement( i, thisElementMultiplicator * this->values.getElement( i ) + value ); - return true; - } - else - if( this->columnIndexes.getElement( i ) == this->getPaddingIndex() ) - { - this->columnIndexes.setElement( i, column ); - this->values.setElement( i, value ); - } - else - { - IndexType j = rowEnd - step; - while( j > i ) - { - this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) ); - this->values.setElement( j, this->values.getElement( j - step ) ); - j -= step; - } - this->columnIndexes.setElement( i, column ); - this->values.setElement( i, value ); - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetricGraph< Real, Device, Index > :: setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType elementPointer = DDCType::getRowBegin( *this, this->permutationArray[ row ] ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] ); - const IndexType step = DDCType::getElementStep( *this ); - - if( elements > this->rowLengths ) - return false; - for( Index i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes[ elementPointer ] = column; - this->values[ elementPointer ] = values[ i ]; - elementPointer += step; - } - for( Index i = elements; i < this->rowLengths; i++ ) - { - this->columnIndexes[ elementPointer ] = this->getPaddingIndex(); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetricGraph< Real, Device, Index > :: setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType elementPointer = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray.getElement( row ) ); - const IndexType step = DDCType::getElementStep( *this ); - - if( elements > this->rowLengths ) - return false; - - for( IndexType i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes.setElement( elementPointer, column ); - this->values.setElement( elementPointer, values[ i ] ); - elementPointer += step; - } - for( IndexType i = elements; i < this->rowLengths; i++ ) - { - this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() ); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetricGraph< Real, Device, Index > :: addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - // TODO: implement - return false; -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetricGraph< Real, Device, Index > :: addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - return this->addRowFast( row, columns, values, numberOfElements ); -} - - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -Real EllpackSymmetricGraph< Real, Device, Index >::getElementFast( const IndexType row, - const IndexType column ) const -{ - if( row < column ) - return this->getElementFast( column, row ); - - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray.getElement( row ) ); - const IndexType step = DDCType::getElementStep( *this ); - - while( elementPtr < rowEnd && - this->columnIndexes.getElement( elementPtr ) < column && - this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) elementPtr += step; - if( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) == column ) - return this->values.getElement( elementPtr ); - return 0.0; -} - -template< typename Real, - typename Device, - typename Index > -Real EllpackSymmetricGraph< Real, Device, Index >::getElement( const IndexType row, - const IndexType column ) const -{ - if( row < column ) - return this->getElement( column, row ); - - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray.getElement( row ) ); - const IndexType step = DDCType::getElementStep( *this ); - - while( elementPtr < rowEnd && - this->columnIndexes.getElement( elementPtr ) < column && - this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) - { - elementPtr += step; - } - if( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) == column ) - return this->values.getElement( elementPtr ); - return 0.0; -} - - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -void EllpackSymmetricGraph< Real, Device, Index >::getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray[ row ] ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] ); - const IndexType step = DDCType::getElementStep( *this ); - - for( IndexType i = 0; i < this->rowLengths; i++ ) - { - columns[ i ] = this->columnIndexes[ elementPtr ]; - values[ i ] = this->values[ elementPtr ]; - elementPtr += step; - } -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::getRow( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray[ row ] ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] ); - const IndexType step = DDCType::getElementStep( *this ); - - for( IndexType i = 0; i < this->rowLengths; i++ ) - { - columns[ i ] = this->columnIndexes.getElement( elementPtr ); - values[ i ] = this->values.getElement( elementPtr ); - elementPtr += step; - } -} - -template< typename Real, - typename Device, - typename Index > - template< typename Vector > -__cuda_callable__ -typename Vector::RealType EllpackSymmetricGraph< Real, Device, Index >::rowVectorProduct( const IndexType row, - const Vector& vector ) const -{ - IndexType i = DeviceDependentCode::getRowBegin( *this, row ); - const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row ); - const IndexType step = DeviceDependentCode::getElementStep( *this ); - - Real result = 0.0; - while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() ) - { - const Index column = this->columnIndexes[ i ]; - result += this->values[ i ] * vector[ column ]; - i += step; - } - return result; -} - -template< typename Real, - typename Device, - typename Index > - template< typename InVector, - typename OutVector > -void EllpackSymmetricGraph< Real, Device, Index >::vectorProduct( const InVector& inVector, - OutVector& outVector ) const -{ - DeviceDependentCode::vectorProduct( *this, inVector, outVector ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::save( File& file ) const -{ - Sparse< Real, Device, Index >::save( file); - file.save( &this->rowLengths ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::load( File& file ) -{ - Sparse< Real, Device, Index >::load( file); - file.load( &this->rowLengths ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::save( const String& fileName ) const -{ - Object::save( fileName ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::load( const String& fileName ) -{ - Object::load( fileName ); -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetricGraph< Real, Device, Index >::help( bool verbose ) -{ - if( !this->rearranged ) - return this->rearrangeMatrix( verbose ); - return true; -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::print( std::ostream& str ) const -{ - for( IndexType row = 0; row < this->getRows(); row++ ) - { - str <<"Row: " << row << " -> "; - IndexType i( row * this->rowLengths ); - const IndexType rowEnd( i + this->rowLengths ); - while( i < rowEnd && - this->columnIndexes.getElement( i ) < this->columns && - this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) - { - const Index column = this->columnIndexes.getElement( i ); - str << " Col:" << column << "->" << this->values.getElement( i ) << "\t"; - i++; - } - str << std::endl; - } -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::allocateElements() -{ - IndexType numberOfMatrixElements = this->alignedRows * this->rowLengths; - - TNL_ASSERT_TRUE( this->alignedRows != 0 && numberOfMatrixElements / this->alignedRows == this->rowLengths, - "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" ); - - Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths ); -} - -template< typename Real, - typename Device, - typename Index > -template< typename InVector, - typename OutVector > -void EllpackSymmetricGraph< Real, Device, Index >::vectorProductHost( const InVector& inVector, - OutVector& outVector ) const -{ - for( IndexType color = 0; color < this->getNumberOfColors(); color++ ) - { - // IndexType colorBegin = this->colorPointers[ color ]; - IndexType offset = this->colorPointers[ color ]; - IndexType colorEnd = this->colorPointers[ color + 1 ]; - for( IndexType j = 0; j < this->getRowsOfColor( color ); j++ ) - { - IndexType row = offset + j; - if( row >= colorEnd ) - break; - IndexType i = DeviceDependentCode::getRowBegin( *this, row ); - const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row ); - const IndexType step = DeviceDependentCode::getElementStep( *this ); - const IndexType rowMapping = this->inversePermutationArray[ row ]; - - while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() ) - { - const IndexType column = this->columnIndexes[ i ]; - outVector[ rowMapping ] += this->values[ i ] * inVector[ column ]; - if( rowMapping != column ) - outVector[ column ] += this->values[ i ] * inVector[ rowMapping ]; - i += step; - } - } - } -} - -template<> -class EllpackSymmetricGraphDeviceDependentCode< Devices::Host > -{ - public: - - typedef Devices::Host Device; - - template< typename Real, - typename Index > - static Index getRowBegin( const EllpackSymmetricGraph< Real, Device, Index >& matrix, - const Index row ) - { - return row * matrix.rowLengths; - } - - template< typename Real, - typename Index > - static Index getRowEnd( const EllpackSymmetricGraph< Real, Device, Index >& matrix, - const Index row ) - { - return ( row + 1 ) * matrix.rowLengths; - } - - template< typename Real, - typename Index > - static Index getElementStep( const EllpackSymmetricGraph< Real, Device, Index >& matrix ) - { - return 1; - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector > - static void vectorProduct( const EllpackSymmetricGraph< Real, Device, Index >& matrix, - const InVector& inVector, - OutVector& outVector ) - { - matrix.vectorProductHost( inVector, outVector ); - } -}; - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index > -template< typename InVector, - typename OutVector > -__cuda_callable__ -void EllpackSymmetricGraph< Real, Device, Index >::spmvCuda( const InVector& inVector, - OutVector& outVector, - const int globalIdx, - const int color ) const -{ - IndexType offset = this->colorPointers[ color ]; - const IndexType colorEnd = this->colorPointers[ color + 1 ]; - IndexType row = offset + globalIdx; - if( row >= colorEnd ) - return; - - IndexType i = DeviceDependentCode::getRowBegin( *this, row ); - const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row ); - const IndexType step = DeviceDependentCode::getElementStep( *this ); - const IndexType rowMapping = this->inversePermutationArray[ row ]; - - while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() ) - { - const IndexType column = this->columnIndexes[ i ]; - outVector[ rowMapping ] += this->values[ i ] * inVector[ column ]; - if( rowMapping != column ) - outVector[ column ] += this->values[ i ] * inVector[ rowMapping ]; - i += step; - } -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename InVector, - typename OutVector > -__global__ -void EllpackSymmetricGraphVectorProductCuda( const EllpackSymmetricGraph< Real, Devices::Cuda, Index >* matrix, - const InVector* inVector, - OutVector* outVector, - const int gridIdx, - const int color ) -{ - int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - matrix->spmvCuda( *inVector, *outVector, globalIdx, color ); -} -#endif - -template<> -class EllpackSymmetricGraphDeviceDependentCode< Devices::Cuda > -{ - public: - - typedef Devices::Cuda Device; - - template< typename Real, - typename Index > - __cuda_callable__ - static Index getRowBegin( const EllpackSymmetricGraph< Real, Device, Index >& matrix, - const Index row ) - { - return row; - } - - template< typename Real, - typename Index > - __cuda_callable__ - static Index getRowEnd( const EllpackSymmetricGraph< Real, Device, Index >& matrix, - const Index row ) - { - return row + getElementStep( matrix ) * matrix.rowLengths; - } - - template< typename Real, - typename Index > - __cuda_callable__ - static Index getElementStep( const EllpackSymmetricGraph< Real, Device, Index >& matrix ) - { - return matrix.alignedRows; - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector > - static void vectorProduct( const EllpackSymmetricGraph< Real, Device, Index >& matrix, - const InVector& inVector, - OutVector& outVector ) - { -#ifdef HAVE_CUDA - typedef EllpackSymmetricGraph< Real, Devices::Cuda, Index > Matrix; - typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - for( IndexType color = 0; color < matrix.getNumberOfColors(); color++ ) - { - IndexType rows = matrix.getRowsOfColor( color ); - const IndexType cudaBlocks = roundUpDivision( rows, cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - EllpackSymmetricGraphVectorProductCuda< Real, Index, InVector, OutVector > - <<< cudaGridSize, cudaBlockSize >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx, - color ); - } - } - - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_inVector ); - Cuda::freeFromDevice( kernel_outVector ); - TNL_CHECK_CUDA_DEVICE; -#endif - } -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetric_impl.h b/src/TNL/Matrices/Legacy/EllpackSymmetric_impl.h deleted file mode 100644 index 8bf42b79d..000000000 --- a/src/TNL/Matrices/Legacy/EllpackSymmetric_impl.h +++ /dev/null @@ -1,833 +0,0 @@ -/*************************************************************************** - EllpackSymmetric_impl.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Real, - typename Device, - typename Index > -EllpackSymmetric< Real, Device, Index > :: EllpackSymmetric() -: rowLengths( 0 ), alignedRows( 0 ) -{ -}; - -template< typename Real, - typename Device, - typename Index > -String EllpackSymmetric< Real, Device, Index > :: getType() -{ - return String( "Matrices::EllpackSymmetric< ") + - String( TNL::getType< Real >() ) + - String( ", " ) + - String( Device::getDeviceType() ) + - String( ", " ) + - String( TNL::getType< Index >() ) + - String( " >" ); -} - -template< typename Real, - typename Device, - typename Index > -String EllpackSymmetric< Real, Device, Index >::getTypeVirtual() const -{ - return this->getType(); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index >::setDimensions( const IndexType rows, - const IndexType columns ) -{ - TNL_ASSERT( rows > 0 && columns > 0, - std::cerr << "rows = " << rows - << " columns = " << columns <rows = rows; - this->columns = columns; - - if( std::is_same< DeviceType, Devices::Cuda >::value ) - { - this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() ); - - if( this->rows - this->alignedRows > 0 ) - { - IndexType missingRows = this->rows - this->alignedRows; - missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() ); - this->alignedRows += missingRows; - -// this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() ); - } - } - else this->alignedRows = rows; - - if( this->rowLengths != 0 ) - allocateElements(); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) -{ - TNL_ASSERT( this->getRows() > 0, ); - TNL_ASSERT( this->getColumns() > 0, ); - //TNL_ASSERT( this->rowLengths > 0, - // std::cerr << "this->rowLengths = " << this->rowLengths ); - this->rowLengths = this->maxRowLength = max( rowLengths ); - allocateElements(); -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetric< Real, Device, Index >::setConstantRowLengths( const IndexType& rowLengths ) -{ - TNL_ASSERT( rowLengths > 0, - std::cerr << " rowLengths = " << rowLengths ); - this->rowLengths = rowLengths; - if( this->rows > 0 ) - allocateElements(); - return true; -} - -template< typename Real, - typename Device, - typename Index > -Index EllpackSymmetric< Real, Device, Index >::getRowLength( const IndexType row ) const -{ - return this->rowLengths; -} - -template< typename Real, - typename Device, - typename Index > - template< typename Real2, - typename Device2, - typename Index2 > -bool EllpackSymmetric< Real, Device, Index >::setLike( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) -{ - if( ! Sparse< Real, Device, Index >::setLike( matrix ) ) - return false; - this->rowLengths = matrix.rowLengths; - return true; -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index > :: reset() -{ - Sparse< Real, Device, Index >::reset(); - this->rowLengths = 0; -} - -template< typename Real, - typename Device, - typename Index > - template< typename Real2, - typename Device2, - typename Index2 > -bool EllpackSymmetric< Real, Device, Index >::operator == ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const -{ - TNL_ASSERT( this->getRows() == matrix.getRows() && - this->getColumns() == matrix.getColumns(), - std::cerr << "this->getRows() = " << this->getRows() - << " matrix.getRows() = " << matrix.getRows() - << " this->getColumns() = " << this->getColumns() - << " matrix.getColumns() = " << matrix.getColumns() - << " this->getName() = " << this->getName() - << " matrix.getName() = " << matrix.getName() ); - // TODO: implement this - throw Exceptions::NotImplementedError( "EllpackSymmetric::operator== is not implemented." ); -} - -template< typename Real, - typename Device, - typename Index > - template< typename Real2, - typename Device2, - typename Index2 > -bool EllpackSymmetric< Real, Device, Index >::operator != ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const -{ - return ! ( ( *this ) == matrix ); -} - -/*template< typename Real, - typename Device, - typename Index > - template< typename Matrix > -bool EllpackSymmetric< Real, Device, Index >::copyFrom( const Matrix& matrix, - const CompressedRowLengthsVector& rowLengths ) -{ - return tnlMatrix< RealType, DeviceType, IndexType >::copyFrom( matrix, rowLengths ); -}*/ - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetric< Real, Device, Index > :: setElementFast( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElementFast( row, column, value, 0.0 ); -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetric< Real, Device, Index > :: setElement( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElement( row, column, value, 0.0 ); -} - - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetric< Real, Device, Index > :: addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - // TODO: return this back when CUDA kernels supportstd::cerr - /*TNL_ASSERT( row >= 0 && row < this->rows && - column >= 0 && column <= this->rows, - std::cerr << " row = " << row - << " column = " << column - << " this->rows = " << this->rows - << " this->columns = " << this-> columns );*/ - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType i = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - while( i < rowEnd && - this->columnIndexes[ i ] < column && - this->columnIndexes[ i ] != this->getPaddingIndex() ) i += step; - if( i == rowEnd ) - return false; - if( this->columnIndexes[ i ] == column ) - { - this->values[ i ] = thisElementMultiplicator * this->values[ i ] + value; - return true; - } - else - if( this->columnIndexes[ i ] == this->getPaddingIndex() ) // artificial zero - { - this->columnIndexes[ i ] = column; - this->values[ i ] = value; - } - else - { - Index j = rowEnd - step; - while( j > i ) - { - this->columnIndexes[ j ] = this->columnIndexes[ j - step ]; - this->values[ j ] = this->values[ j - step ]; - j -= step; - } - this->columnIndexes[ i ] = column; - this->values[ i ] = value; - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetric< Real, Device, Index > :: addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType i = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - while( i < rowEnd && - this->columnIndexes.getElement( i ) < column && - this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) i += step; - if( i == rowEnd ) - return false; - if( this->columnIndexes.getElement( i ) == column ) - { - this->values.setElement( i, thisElementMultiplicator * this->values.getElement( i ) + value ); - return true; - } - else - if( this->columnIndexes.getElement( i ) == this->getPaddingIndex() ) - { - this->columnIndexes.setElement( i, column ); - this->values.setElement( i, value ); - } - else - { - IndexType j = rowEnd - step; - while( j > i ) - { - this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) ); - this->values.setElement( j, this->values.getElement( j - step ) ); - j -= step; - } - this->columnIndexes.setElement( i, column ); - this->values.setElement( i, value ); - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetric< Real, Device, Index > :: setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType elementPointer = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - if( elements > this->rowLengths ) - return false; - for( Index i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes[ elementPointer ] = column; - this->values[ elementPointer ] = values[ i ]; - elementPointer += step; - } - for( Index i = elements; i < this->rowLengths; i++ ) - { - this->columnIndexes[ elementPointer ] = this->getPaddingIndex(); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetric< Real, Device, Index > :: setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType elementPointer = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - if( elements > this->rowLengths ) - return false; - - for( IndexType i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes.setElement( elementPointer, column ); - this->values.setElement( elementPointer, values[ i ] ); - elementPointer += step; - } - for( IndexType i = elements; i < this->rowLengths; i++ ) - { - this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() ); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetric< Real, Device, Index > :: addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - // TODO: implement - return false; -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetric< Real, Device, Index > :: addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - return this->addRowFast( row, columns, values, numberOfElements ); -} - - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -Real EllpackSymmetric< Real, Device, Index >::getElementFast( const IndexType row, - const IndexType column ) const -{ - if( row < column ) - return this->getElementFast( column, row ); - - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - while( elementPtr < rowEnd && - this->columnIndexes[ elementPtr ] < column && - this->columnIndexes[ elementPtr ] != this->getPaddingIndex() ) elementPtr += step; - if( elementPtr < rowEnd && this->columnIndexes[ elementPtr ] == column ) - return this->values[ elementPtr ]; - return 0.0; -} - -template< typename Real, - typename Device, - typename Index > -Real EllpackSymmetric< Real, Device, Index >::getElement( const IndexType row, - const IndexType column ) const -{ - if( row < column ) - return this->getElement( column, row ); - - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - while( elementPtr < rowEnd && - this->columnIndexes.getElement( elementPtr ) < column && - this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) elementPtr += step; - if( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) == column ) - return this->values.getElement( elementPtr ); - return 0.0; -} - - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -void EllpackSymmetric< Real, Device, Index >::getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - for( IndexType i = 0; i < this->rowLengths; i++ ) - { - columns[ i ] = this->columnIndexes[ elementPtr ]; - values[ i ] = this->values[ elementPtr ]; - elementPtr += step; - } -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index >::getRow( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - for( IndexType i = 0; i < this->rowLengths; i++ ) - { - columns[ i ] = this->columnIndexes.getElement( elementPtr ); - values[ i ] = this->values.getElement( elementPtr ); - elementPtr += step; - } -} - - - -template< typename Real, - typename Device, - typename Index > - template< typename InVector, - typename OutVector > -void EllpackSymmetric< Real, Device, Index >::vectorProduct( const InVector& inVector, - OutVector& outVector ) const -{ - DeviceDependentCode::vectorProduct( *this, inVector, outVector ); -} - -template< typename Real, - typename Device, - typename Index > - template< typename Real2, - typename Index2 > -void EllpackSymmetric< Real, Device, Index > :: addMatrix( const EllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator, - const RealType& thisMatrixMultiplicator ) -{ - throw Exceptions::NotImplementedError( "EllpackSymmetric::addMatrix is not implemented." ); - // TODO: implement -} - -template< typename Real, - typename Device, - typename Index > - template< typename Real2, - typename Index2 > -void EllpackSymmetric< Real, Device, Index >::getTransposition( const EllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator ) -{ - throw Exceptions::NotImplementedError( "EllpackSymmetric::getTransposition is not implemented." ); - // TODO: implement -} - -template< typename Real, - typename Device, - typename Index > - template< typename Vector > -bool EllpackSymmetric< Real, Device, Index > :: performSORIteration( const Vector& b, - const IndexType row, - Vector& x, - const RealType& omega ) const -{ - TNL_ASSERT( row >=0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << this->getName() <rowLengths ); - const IndexType rowEnd( i + this->rowLengths ); - IndexType column; - while( i < rowEnd && ( column = this->columnIndexes[ i ] ) < this->columns ) - { - if( column == row ) - diagonalValue = this->values.getElement( i ); - else - sum += this->values.getElement( row * this->diagonalsShift.getSize() + i ) * x. getElement( column ); - i++; - } - if( diagonalValue == ( Real ) 0.0 ) - { - std::cerr << "There is zero on the diagonal in " << row << "-th row of thge matrix " << this->getName() << ". I cannot perform SOR iteration." < -void EllpackSymmetric< Real, Device, Index >::save( File& file ) const -{ - Sparse< Real, Device, Index >::save( file); - file.save( &this->rowLengths ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index >::load( File& file ) -{ - Sparse< Real, Device, Index >::load( file); - file.load( &this->rowLengths ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index >::save( const String& fileName ) const -{ - Object::save( fileName ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index >::load( const String& fileName ) -{ - Object::load( fileName ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index >::print( std::ostream& str ) const -{ - for( IndexType row = 0; row < this->getRows(); row++ ) - { - str <<"Row: " << row << " -> "; - IndexType i( row * this->rowLengths ); - const IndexType rowEnd( i + this->rowLengths ); - while( i < rowEnd && - this->columnIndexes.getElement( i ) < this->columns && - this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) - { - const Index column = this->columnIndexes.getElement( i ); - str << " Col:" << column << "->" << this->values.getElement( i ) << "\t"; - i++; - } - str < -void EllpackSymmetric< Real, Device, Index >::allocateElements() -{ - IndexType numberOfMatrixElements = this->alignedRows * this->rowLengths; - - TNL_ASSERT_TRUE( this->alignedRows != 0 && numberOfMatrixElements / this->alignedRows == this->rowLengths, - "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" ); - - Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths ); -} - -template<> -class EllpackSymmetricDeviceDependentCode< Devices::Host > -{ - public: - - typedef Devices::Host Device; - - template< typename Real, - typename Index > - static Index getRowBegin( const EllpackSymmetric< Real, Device, Index >& matrix, - const Index row ) - { - return row * matrix.rowLengths; - } - - template< typename Real, - typename Index > - static Index getRowEnd( const EllpackSymmetric< Real, Device, Index >& matrix, - const Index row ) - { - //return row * matrix.rowLengths + row + 1; - return min(row * matrix.rowLengths + row + 1, ( row + 1 ) * matrix.rowLengths ); - } - - template< typename Real, - typename Index > - static Index getElementStep( const EllpackSymmetric< Real, Device, Index >& matrix ) - { - return 1; - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector > - static void vectorProduct( const EllpackSymmetric< Real, Device, Index >& matrix, - const InVector& inVector, - OutVector& outVector ) - { - matrix.vectorProductHost( inVector, outVector ); - } - -}; - -template< typename Real, - typename Device, - typename Index > -template< typename InVector, - typename OutVector > -void EllpackSymmetric< Real, Device, Index >::vectorProductHost( const InVector& inVector, - OutVector& outVector ) const -{ - for( Index row = 0; row < this->getRows(); row++ ) - { - IndexType i = DeviceDependentCode::getRowBegin( *this, row ); - const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row ); - const IndexType step = DeviceDependentCode::getElementStep( *this ); - - while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() ) - { - const IndexType column = this->columnIndexes[ i ]; - outVector[ row ] += this->values[ i ] * inVector[ column ]; - if( row != column ) - outVector[ column ] += this->values[ i ] * inVector[ row ]; - i += step; - } - } -}; - -template< typename Real, - typename Device, - typename Index > -template< typename Vector > -__cuda_callable__ -typename Vector::RealType EllpackSymmetric< Real, Device, Index >::rowVectorProduct( const IndexType row, - const Vector& vector ) const -{ - IndexType i = DeviceDependentCode::getRowBegin( *this, row ); - const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row ); - const IndexType step = DeviceDependentCode::getElementStep( *this ); - - Real result = 0.0; - while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() ) - { - const Index column = this->columnIndexes[ i ]; - result += this->values[ i ] * vector[ column ]; - i += step; - } - return result; -} - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index > -template< typename InVector, - typename OutVector > -__cuda_callable__ -void EllpackSymmetric< Real, Device, Index >::spmvCuda( const InVector& inVector, - OutVector& outVector, - int rowId ) const -{ - IndexType i = DeviceDependentCode::getRowBegin( *this, rowId ); - const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, rowId ); - const IndexType step = DeviceDependentCode::getElementStep( *this ); - - while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() ) - { - const IndexType column = this->columnIndexes[ i ]; - outVector[ rowId ] += this->values[ i ] * inVector[ column ]; - if( rowId != column ) - outVector[ column ] += this->values[ i ] * inVector[ rowId ]; - i += step; - } -}; -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename InVector, - typename OutVector > -__global__ -void EllpackSymmetricVectorProductCuda( const EllpackSymmetric< Real, Devices::Cuda, Index >* matrix, - const InVector* inVector, - OutVector* outVector, - const int gridIdx ) -{ - int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( globalIdx >= matrix->getRows() ) - return; - matrix->spmvCuda( *inVector, *outVector, globalIdx ); -}; -#endif - -template<> -class EllpackSymmetricDeviceDependentCode< Devices::Cuda > -{ - public: - - typedef Devices::Cuda Device; - - template< typename Real, - typename Index > - __cuda_callable__ - static Index getRowBegin( const EllpackSymmetric< Real, Device, Index >& matrix, - const Index row ) - { - return row; - } - - template< typename Real, - typename Index > - __cuda_callable__ - static Index getRowEnd( const EllpackSymmetric< Real, Device, Index >& matrix, - const Index row ) - { - // TODO: fix this: return row + getElementStep( matrix ) * matrix.rowLengths; - return min( row + getElementStep( matrix ) * matrix.rowLengths, row + ( row + 1 ) * getElementStep( matrix ) ); - } - - template< typename Real, - typename Index > - __cuda_callable__ - static Index getElementStep( const EllpackSymmetric< Real, Device, Index >& matrix ) - { - return matrix.alignedRows; - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector > - static void vectorProduct( const EllpackSymmetric< Real, Device, Index >& matrix, - const InVector& inVector, - OutVector& outVector ) - { -#ifdef HAVE_CUDA - typedef EllpackSymmetric< Real, Devices::Cuda, Index > Matrix; - typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - const int sharedMemory = cudaBlockSize.x * sizeof( Real ); - EllpackSymmetricVectorProductCuda< Real, Index, InVector, OutVector > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx ); - } - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_inVector ); - Cuda::freeFromDevice( kernel_outVector ); - TNL_CHECK_CUDA_DEVICE; -#endif - } -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric.h deleted file mode 100644 index 99ac3562e..000000000 --- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric.h +++ /dev/null @@ -1,210 +0,0 @@ -/*************************************************************************** - SlocedEllpackSymmetric.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Device > -class SlicedEllpackSymmetricDeviceDependentCode; - -template< typename Real = double, - typename Device = Devices::Host, - typename Index = int, - int SliceSize = 32 > -class SlicedEllpackSymmetric; - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int SliceSize > -__global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >* matrix, - typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths, - int gridIdx ); -#endif - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -class SlicedEllpackSymmetric : public Sparse< Real, Device, Index > -{ - public: - - typedef Real RealType; - typedef Device DeviceType; - typedef Index IndexType; - typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView; - typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; - - template< typename _Real = Real, - typename _Device = Device, - typename _Index = Index, - int _SliceSize = SliceSize > - using Self = SlicedEllpackSymmetric< _Real, _Device, _Index, _SliceSize >; - - SlicedEllpackSymmetric(); - - void setDimensions( const IndexType rows, - const IndexType columns ); - - void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ); - - IndexType getRowLength( const IndexType row ) const; - - template< typename Real2, typename Device2, typename Index2 > - bool setLike( const SlicedEllpackSymmetric< Real2, Device2, Index2, SliceSize >& matrix ); - - void reset(); - - template< typename Real2, typename Device2, typename Index2 > - bool operator == ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const; - - template< typename Real2, typename Device2, typename Index2 > - bool operator != ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const; - - __cuda_callable__ - bool setElementFast( const IndexType row, - const IndexType column, - const RealType& value ); - - bool setElement( const IndexType row, - const IndexType column, - const RealType& value ); - - __cuda_callable__ - bool addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - __cuda_callable__ - bool setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - bool setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - __cuda_callable__ - bool addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - __cuda_callable__ - RealType getElementFast( const IndexType row, - const IndexType column ) const; - - RealType getElement( const IndexType row, - const IndexType column ) const; - - - __cuda_callable__ - void getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const; - - void getRow( const IndexType row, - IndexType* columns, - RealType* values ) const; - - template< typename InVector, - typename OutVector > - __cuda_callable__ - void rowVectorProduct( const IndexType row, - const InVector& inVector, - OutVector& outVector ) const; - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const; - - template< typename InVector, - typename OutVector > - __cuda_callable__ - void spmvCuda( const InVector& inVector, - OutVector& outVector, - int globalIdx ) const; - - template< typename Real2, typename Index2 > - void addMatrix( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator = 1.0, - const RealType& thisMatrixMultiplicator = 1.0 ); - - template< typename Real2, typename Index2 > - void getTransposition( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator = 1.0 ); - - template< typename Vector > - bool performSORIteration( const Vector& b, - const IndexType row, - Vector& x, - const RealType& omega = 1.0 ) const; - - void save( File& file ) const; - - void load( File& file ); - - void save( const String& fileName ) const; - - void load( const String& fileName ); - - void print( std::ostream& str ) const; - - protected: - - Containers::Vector< Index, Device, Index > slicePointers, sliceRowLengths; - - typedef SlicedEllpackSymmetricDeviceDependentCode< DeviceType > DeviceDependentCode; - friend class SlicedEllpackSymmetricDeviceDependentCode< DeviceType >; -#ifdef HAVE_CUDA - /*friend __global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize >( SlicedEllpackMatrix< Real, Devices::Cuda, Index, SliceSize >* matrix, - const typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::RowLengthsVector* rowLengths, - int gridIdx ); - */ - // TODO: The friend declaration above does not work because of __global__ storage specifier. Therefore we declare the following method as public. Fix this, when possible. - - public: - __device__ void computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths, - const IndexType sliceIdx ); - -#endif - -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL - -#include diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph.h deleted file mode 100644 index b7ee87235..000000000 --- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph.h +++ /dev/null @@ -1,242 +0,0 @@ -/*************************************************************************** - SlicedEllpackSymmetricGraph.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Device > -class SlicedEllpackSymmetricGraphDeviceDependentCode; - -template< typename Real = double, - typename Device = Devices::Host, - typename Index = int, - int SliceSize = 32 > -class SlicedEllpackSymmetricGraph; - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int SliceSize > -__global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >* matrix, - typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths, - int gridIdx ); -#endif - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -class SlicedEllpackSymmetricGraph : public Sparse< Real, Device, Index > -{ - public: - - typedef Real RealType; - typedef Device DeviceType; - typedef Index IndexType; - typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView; - typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; - - template< typename _Real = Real, - typename _Device = Device, - typename _Index = Index, - int _SliceSize = SliceSize > - using Self = SlicedEllpackSymmetricGraph< _Real, _Device, _Index, _SliceSize >; - - SlicedEllpackSymmetricGraph(); - - void setDimensions( const IndexType rows, - const IndexType columns ); - - void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ); - - IndexType getRowLength( const IndexType row ) const; - - template< typename Real2, typename Device2, typename Index2 > - bool setLike( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2, SliceSize >& matrix ); - - void reset(); - - template< typename Real2, typename Device2, typename Index2 > - bool operator == ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const; - - template< typename Real2, typename Device2, typename Index2 > - bool operator != ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const; - - template< typename InVector, - typename OutVector > - void vectorProductHost( const InVector& inVector, OutVector& outVector ) const; - - __cuda_callable__ - bool setElementFast( const IndexType row, - const IndexType column, - const RealType& value ); - - bool setElement( const IndexType row, - const IndexType column, - const RealType& value ); - - __cuda_callable__ - bool addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - __cuda_callable__ - bool setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - bool setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - __cuda_callable__ - bool addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - __cuda_callable__ - RealType getElementFast( const IndexType row, - const IndexType column ) const; - - RealType getElement( const IndexType row, - const IndexType column ) const; - - __cuda_callable__ - void getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const; - - void getRow( const IndexType row, - IndexType* columns, - RealType* values ) const; - - template< typename Vector > - __cuda_callable__ - typename Vector::RealType rowVectorProduct( const IndexType row, - const Vector& vector ) const; - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const; - - template< typename Real2, typename Index2 > - void addMatrix( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator = 1.0, - const RealType& thisMatrixMultiplicator = 1.0 ); - - template< typename Real2, typename Index2 > - void getTransposition( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator = 1.0 ); - - template< typename Vector > - bool performSORIteration( const Vector& b, - const IndexType row, - Vector& x, - const RealType& omega = 1.0 ) const; - - Index getRealRowLength( const Index row ); - - Containers::Vector< Index, Device, Index > getRealRowLengths(); - - void save( File& file ) const; - - void load( File& file ); - - void save( const String& fileName ) const; - - void load( const String& fileName ); - - void print( std::ostream& str ) const; - - bool help( bool verbose = false ); - -#ifdef HAVE_CUDA - template< typename InVector, - typename OutVector > - __device__ - void spmvCuda( const InVector& inVector, - OutVector& outVector, - const int globalIdx, - const int color ) const; -#endif - - void copyFromHostToCuda( SlicedEllpackSymmetricGraph< Real, Devices::Host, Index, SliceSize >& matrix ); - - bool rearrangeMatrix( bool verbose = false ); - - void computePermutationArray(); - - Containers::Vector< Index, Device, Index > getSlicePointers(); - - Containers::Vector< Index, Device, Index > getSliceRowLengths(); - - Containers::Vector< Index, Device, Index > getPermutationArray(); - - Containers::Vector< Index, Device, Index > getInversePermutationArray(); - - Containers::Vector< Index, Device, Index > getColorPointers(); - - protected: - - Containers::Vector< Index, Device, Index > slicePointers, sliceRowLengths; - - typedef SlicedEllpackSymmetricGraphDeviceDependentCode< DeviceType > DeviceDependentCode; - friend class SlicedEllpackSymmetricGraphDeviceDependentCode< DeviceType >; - - Containers::Vector< Index, Device, Index > permutationArray; - Containers::Vector< Index, Device, Index > inversePermutationArray; - Containers::Vector< Index, Device, Index > colorPointers; - bool rearranged; -#ifdef HAVE_CUDA - /*friend __global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize >( SlicedEllpackMatrix< Real, Devices::Cuda, Index, SliceSize >* matrix, - const typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::RowLengthsVector* rowLengths, - int gridIdx ); - */ - // TODO: The friend declaration above does not work because of __global__ storage specifier. Therefore we declare the following method as public. Fix this, when possible. - - public: - __device__ void computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths, - const IndexType sliceIdx ); - -#endif - -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL - -#include - diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph_impl.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph_impl.h deleted file mode 100644 index 5ab2f77c1..000000000 --- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph_impl.h +++ /dev/null @@ -1,1316 +0,0 @@ -/*************************************************************************** - SlicedEllpackSymmetricGraph_impl.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::SlicedEllpackSymmetricGraph() -: rearranged( false ) -{ -}; - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -String SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getType() -{ - return String( "Matrices::SlicedEllpackSymmetricGraph< ") + - String( TNL::getType< Real >() ) + - String( ", " ) + - String( Device::getDeviceType() ) + - String( ", " ) + - String( TNL::getType< Index >() ) + - String( " >" ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -String SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getTypeVirtual() const -{ - return this->getType(); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setDimensions( const IndexType rows, - const IndexType columns ) -{ - TNL_ASSERT( rows > 0 && columns > 0, - std::cerr << "rows = " << rows - << " columns = " << columns <::setDimensions( rows, columns ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) -{ - TNL_ASSERT( this->getRows() > 0, ); - TNL_ASSERT( this->getColumns() > 0, ); - const IndexType slices = roundUpDivision( this->rows, SliceSize ); - this->sliceRowLengths.setSize( slices ); - this->slicePointers.setSize( slices + 1 ); - - this->permutationArray.setSize( this->getRows() ); - for( IndexType i = 0; i < this->getRows(); i++ ) - this->permutationArray.setElement( i, i ); - - Containers::Vector< Index, Device, Index > sliceRowLengths, slicePointers; - sliceRowLengths.setSize( slices ); - slicePointers.setSize( slices + 1 ); - // TODO: fix this - //DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths, sliceRowLengths, slicePointers ); - this->sliceRowLengths = sliceRowLengths; - this->slicePointers = slicePointers; - - this->maxRowLength = rowLengths.max(); - - this->slicePointers.computeExclusivePrefixSum(); - this->allocateMatrixElements( this->slicePointers.getElement( slices ) ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Index SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRowLength( const IndexType row ) const -{ - const IndexType slice = row / SliceSize; - return this->sliceRowLengths[ slice ]; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Device2, - typename Index2 > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setLike( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2, SliceSize >& matrix ) -{ - if( !Sparse< Real, Device, Index >::setLike( matrix ) || - ! this->slicePointers.setLike( matrix.slicePointers ) || - ! this->sliceRowLengths.setLike( matrix.sliceRowLengths ) ) - return false; - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::reset() -{ - Sparse< Real, Device, Index >::reset(); - this->slicePointers.reset(); - this->sliceRowLengths.reset(); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Device2, - typename Index2 > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::operator == ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const -{ - TNL_ASSERT( this->getRows() == matrix.getRows() && - this->getColumns() == matrix.getColumns(), - std::cerr << "this->getRows() = " << this->getRows() - << " matrix.getRows() = " << matrix.getRows() - << " this->getColumns() = " << this->getColumns() - << " matrix.getColumns() = " << matrix.getColumns() - << " this->getName() = " << this->getName() - << " matrix.getName() = " << matrix.getName() ); - // TODO: implement this - throw Exceptions::NotImplementedError( "SlicedEllpackSymmetricGraph::operator== is not implemented." ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Device2, - typename Index2 > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::operator != ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const -{ - return ! ( ( *this ) == matrix ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setElementFast( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElementFast( row, column, value, 0.0 ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setElement( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElement( row, column, value, 0.0 ); -} - - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - TNL_ASSERT( row >= 0 && row < this->rows && - column >= 0 && column <= this->rows, - std::cerr << " row = " << row - << " column = " << column - << " this->rows = " << this->rows - << " this->columns = " << this-> columns ); - - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes.getElement( elementPtr ) ) < column && - col != this->getPaddingIndex() ) elementPtr += step; - if( elementPtr == rowEnd ) - return false; - if( col == column ) - { - this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value ); - return true; - } - if( col == this->getPaddingIndex() ) - { - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; - } - IndexType j = rowEnd - step; - while( j > elementPtr ) - { - this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) ); - this->values.setElement( j, this->values.getElement( j - step ) ); - j -= step; - } - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - TNL_ASSERT( row >= 0 && row < this->rows && - column >= 0 && column <= this->rows, - std::cerr << " row = " << row - << " column = " << column - << " this->rows = " << this->rows - << " this->columns = " << this-> columns ); - - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes.getElement( elementPtr ) ) < column && - col != this->getPaddingIndex() ) elementPtr += step; - if( elementPtr == rowEnd ) - return false; - if( col == column ) - { - this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value ); - return true; - } - if( col == this->getPaddingIndex() ) - { - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; - } - IndexType j = rowEnd - step; - while( j > elementPtr ) - { - this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) ); - this->values.setElement( j, this->values.getElement( j - step ) ); - j -= step; - } - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - const IndexType sliceIdx = this->permutationArray.getElement( row ) / SliceSize; - const IndexType rowLength = this->sliceRowLengths[ sliceIdx ]; - if( elements > rowLength ) - return false; - - Index elementPointer, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, this->permutationArray.getElement( row ), elementPointer, rowEnd, step ); - - for( IndexType i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes[ elementPointer ] = columnIndexes[ i ]; - this->values[ elementPointer ] = values[ i ]; - elementPointer += step; - } - for( IndexType i = elements; i < rowLength; i++ ) - { - this->columnIndexes[ elementPointer ] = this->getPaddingIndex(); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - const IndexType sliceIdx = this->permutationArray.getElement( row ) / SliceSize; - const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx ); - if( elements > rowLength ) - return false; - - Index elementPointer, rowEnd, step; - DeviceDependentCode::initRowTraverse( *this, this->permutationArray.getElement( row ), elementPointer, rowEnd, step ); - - for( IndexType i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes.setElement( elementPointer, column ); - this->values.setElement( elementPointer, values[ i ] ); - elementPointer += step; - } - for( IndexType i = elements; i < rowLength; i++ ) - { - this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() ); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - // TODO: implement - return false; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - // TODO: implement - return false; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -Real SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getElementFast( const IndexType row, - const IndexType column ) const -{ - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes[ elementPtr ] ) < column && - col != this->getPaddingIndex() ) - elementPtr += step; - if( elementPtr < rowEnd && col == column ) - return this->values[ elementPtr ]; - return 0.0; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Real SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getElement( const IndexType row, - const IndexType column ) const -{ - if( row < column ) - return this->getElement( column, row ); - - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes.getElement( elementPtr ) ) < column && - col != this->getPaddingIndex() ) - elementPtr += step; - if( elementPtr < rowEnd && col == column ) - return this->values.getElement( elementPtr ); - return 0.0; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - Index elementPtr, rowEnd, step, i( 0 ); - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - while( elementPtr < rowEnd ) - { - columns[ i ] = this->columnIndexes[ elementPtr ]; - values[ i ] = this->values[ elementPtr ]; - elementPtr += step; - i++; - } -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRow( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - Index elementPtr, rowEnd, step, i( 0 ); - DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step ); - - while( elementPtr < rowEnd ) - { - columns[ i ] = this->columnIndexes.getElement( elementPtr ); - values[ i ] = this->values.getElement( elementPtr ); - elementPtr += step; - i++; - } -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Vector > -__cuda_callable__ -typename Vector::RealType SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::rowVectorProduct( const IndexType row, - const Vector& vector ) const -{ - Real result = 0.0; - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - IndexType column; - while( elementPtr < rowEnd && - ( column = this->columnIndexes[ elementPtr ] ) < this->columns && - column != this->getPaddingIndex() ) - { - result += this->values[ elementPtr ] * vector[ column ]; - elementPtr += step; - } - return result; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename InVector, - typename OutVector > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::vectorProduct( const InVector& inVector, - OutVector& outVector ) const -{ - DeviceDependentCode::vectorProduct( *this, inVector, outVector ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Index2 > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::addMatrix( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator, - const RealType& thisMatrixMultiplicator ) -{ - throw Exceptions::NotImplementedError( "SlicedEllpackSymmetricGraph::addMatrix is not implemented." ); - // TODO: implement -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Index2 > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getTransposition( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator ) -{ - throw Exceptions::NotImplementedError( "SlicedEllpackSymmetricGraph::getTransposition is not implemented." ); - // TODO: implement -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Vector > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::performSORIteration( const Vector& b, - const IndexType row, - Vector& x, - const RealType& omega ) const -{ - TNL_ASSERT( row >=0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << this->getName() <permutationArray.getElement( row ) / SliceSize; - const IndexType rowLength = this->sliceRowLengths[ sliceIdx ]; - IndexType elementPtr = this->slicePointers[ sliceIdx ] + - rowLength * ( this->permutationArray.getElement( row ) - sliceIdx * SliceSize ); - const IndexType rowEnd( elementPtr + rowLength ); - IndexType column; - while( elementPtr < rowEnd && ( column = this->columnIndexes[ elementPtr ] ) < this->columns ) - { - if( column == this->permutationArray.getElement( row ) ) - diagonalValue = this->values.getElement( elementPtr ); - else - sum += this->values.getElement( this->permutationArray.getElement( row ) * this->diagonalsShift.getSize() + elementPtr ) * x. getElement( column ); - elementPtr++; - } - if( diagonalValue == ( Real ) 0.0 ) - { - std::cerr << "There is zero on the diagonal in " << this->permutationArray.getElement( row ) << "-th row of thge matrix " << this->getName() << ". I cannot perform SOR iteration." <permutationArray.getElement( row ), x[ this->permutationArray.getElement( row ) ] + omega / diagonalValue * ( b[ this->permutationArray.getElement( row ) ] - sum ) ); - return true; -} - - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::save( File& file ) const -{ - Sparse< Real, Device, Index >::save( file ); - file << this->slicePointers << this->sliceRowLengths; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::load( File& file ) -{ - Sparse< Real, Device, Index >::load( file ); - file >> this->slicePointers >> this->sliceRowLengths; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::save( const String& fileName ) const -{ - Object::save( fileName ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::load( const String& fileName ) -{ - Object::load( fileName ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::print( std::ostream& str ) const -{ - for( IndexType row = 0; row < this->getRows(); row++ ) - { - str <<"Row: " << row << " -> "; - const IndexType sliceIdx = this->permutationArray.getElement( row ) / SliceSize; - const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx ); - IndexType elementPtr = this->slicePointers.getElement( sliceIdx ) + - rowLength * ( this->permutationArray.getElement( row ) - sliceIdx * SliceSize ); - const IndexType rowEnd( elementPtr + rowLength ); - while( elementPtr < rowEnd && - this->columnIndexes.getElement( elementPtr ) < this->columns && - this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) - { - const Index column = this->columnIndexes.getElement( elementPtr ); - str << " Col:" << column << "->" << this->values.getElement( elementPtr ) << "\t"; - elementPtr++; - } - str < -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::computePermutationArray() -{ - Containers::Vector< Index, Device, Index > colorsVector; - colorsVector.setSize( this->getRows() ); - for( IndexType i = 0; i < this->getRows(); i++ ) - { - colorsVector.setElement( i, 0 ); - } - - // compute colors for each row - Matrix< Real, Device, Index >::computeColorsVector( colorsVector ); - - // init color pointers - this->colorPointers.setSize( this->getNumberOfColors() + 1 ); - - // compute permutation - IndexType position = 0; - for( IndexType color = 0; color < this->getNumberOfColors(); color++ ) - { - this->colorPointers.setElement( color, position ); - for (IndexType i = 0; i < this->getRows(); i++) - if ( colorsVector.getElement( i ) == color) - { - IndexType row1 = this->permutationArray.getElement( i ); - IndexType row2 = this->permutationArray.getElement( position ); - IndexType tmp = this->permutationArray.getElement( row1 ); - this->permutationArray.setElement( row1, this->permutationArray.getElement( row2 ) ); - this->permutationArray.setElement( row2, tmp ); - - tmp = colorsVector.getElement( position ); - colorsVector.setElement( position, colorsVector.getElement( i ) ); - colorsVector.setElement( i, tmp ); - position++; - } - } - - this->colorPointers.setElement( this->getNumberOfColors(), this->getRows() ); - - this->inversePermutationArray.setSize( this->getRows() ); - for( IndexType i = 0; i < this->getRows(); i++ ) - this->inversePermutationArray.setElement( this->permutationArray.getElement( i ), i ); - - // destroy colors vector - colorsVector.reset(); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Index SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRealRowLength( const Index row ) -{ - const Index sliceIdx = row / SliceSize; - const Index slicePointer = this->slicePointers.getElement( sliceIdx ); - const Index rowLength = this->sliceRowLengths.getElement( sliceIdx ); - - Index rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize ); - Index rowEnd = rowBegin + rowLength; - Index length = 0; - for( Index i = rowBegin; i < rowEnd; i++ ) - if( this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) - length++; - else - break; - - return length; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRealRowLengths() -{ - Containers::Vector< Index, Device, Index > rowLengths; - rowLengths.setSize( this->getRows() ); - for( IndexType row = 0; row < this->getRows(); row++ ) - rowLengths.setElement( row, this->getRealRowLength( row ) ); - - return rowLengths; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::rearrangeMatrix( bool verbose ) -{ - this->computePermutationArray(); - - // now based on new permutation array we need to recompute row lengths in slices - const IndexType slices = roundUpDivision( this->rows, SliceSize ); - Containers::Vector< Index, Device, Index > sliceRowLengths, slicePointers, rowLengths; - sliceRowLengths.setSize( slices ); - slicePointers.setSize( slices + 1 ); - rowLengths.setSize( this->getRows() ); - rowLengths = this->getRealRowLengths(); - // TODO: fix this - //DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths, sliceRowLengths, slicePointers ); - - slicePointers.computeExclusivePrefixSum(); - - // this->testRowLengths( rowLengths, sliceRowLengths ); - - // return this->allocateMatrixElements( this->slicePointers.getElement( slices ) ); - Containers::Vector< Real, Device, Index > valuesVector; - Containers::Vector< Index, Device, Index > columnsVector; - valuesVector.setSize( slicePointers.getElement( slices ) ); - columnsVector.setSize( slicePointers.getElement( slices ) ); - columnsVector.setValue( this->getPaddingIndex() ); - valuesVector.setValue( 0.0 ); - - for( IndexType slice = 0; slice < slices; slice++ ) - { - IndexType step = 1; - IndexType slicePointerOrig = this->slicePointers.getElement( slice ); - IndexType rowLengthOrig = this->sliceRowLengths.getElement( slice ); - for( IndexType row = slice * SliceSize; row < (slice + 1) * SliceSize && row < this->getRows(); row++ ) - { - IndexType rowBegin = slicePointerOrig + rowLengthOrig * ( row - slice * SliceSize ); - IndexType elementPointer = rowBegin; - - IndexType sliceNew = this->permutationArray.getElement( row ) / SliceSize; - IndexType slicePointerNew = slicePointers.getElement( sliceNew ); - IndexType rowLengthNew = sliceRowLengths.getElement( sliceNew ); - IndexType elementPointerNew = slicePointerNew + rowLengthNew * ( this->permutationArray.getElement( row ) - sliceNew * SliceSize ); - - for( IndexType i = 0; i < rowLengthOrig; i++ ) - { - if( this->columnIndexes.getElement( elementPointer ) != this->getPaddingIndex() ) - { - valuesVector.setElement(elementPointerNew, this->values.getElement(elementPointer)); - columnsVector.setElement(elementPointerNew, this->columnIndexes.getElement(elementPointer)); - elementPointer += step; - } - elementPointerNew += step; - } - } - } - - // reset original matrix - this->values.reset(); - this->columnIndexes.reset(); - this->slicePointers.reset(); - this->sliceRowLengths.reset(); - - this->slicePointers.setSize( slicePointers.getSize() ); - this->sliceRowLengths.setSize( sliceRowLengths.getSize() ); - - this->sliceRowLengths = sliceRowLengths; - this->slicePointers = slicePointers; - - // deep copy new matrix - this->values.setSize( valuesVector.getSize() ); - this->columnIndexes.setSize( columnsVector.getSize() ); - this->values = valuesVector; - this->columnIndexes = columnsVector; - - // clear memory - valuesVector.reset(); - columnsVector.reset(); - slicePointers.reset(); - sliceRowLengths.reset(); - - this->rearranged = true; - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::help( bool verbose ) -{ - if( !this->rearranged ) - this->rearrangeMatrix( verbose ); - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getSlicePointers() -{ - return this->slicePointers; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getSliceRowLengths() -{ - return this->sliceRowLengths; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getPermutationArray() -{ - return this->permutationArray; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getInversePermutationArray() -{ - return this->inversePermutationArray; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getColorPointers() -{ - return this->colorPointers; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::copyFromHostToCuda( SlicedEllpackSymmetricGraph& matrix ) -{ - Sparse< Real, Device, Index >::copyFromHostToCuda( matrix ); - - this->rearranged = true; - - Containers::Vector< Index, Device, Index > colorPointers = matrix.getColorPointers(); - this->colorPointers.setSize( colorPointers.getSize() ); - for( IndexType i = 0; i < colorPointers.getSize(); i++ ) - this->colorPointers.setElement( i, colorPointers[ i ] ); - - Containers::Vector< Index, Device, Index > slicePointers = matrix.getSlicePointers(); - this->slicePointers.setSize( slicePointers.getSize() ); - for( IndexType i = 0; i < slicePointers.getSize(); i++ ) - this->slicePointers.setElement( i, slicePointers[ i ] ); - - Containers::Vector< Index, Device, Index > sliceRowLengths = matrix.getSliceRowLengths(); - this->sliceRowLengths.setSize( sliceRowLengths.getSize() ); - for( IndexType i = 0; i < sliceRowLengths.getSize(); i++ ) - this->sliceRowLengths.setElement( i, sliceRowLengths[ i ] ); - - Containers::Vector< Index, Device, Index > permutationArray = matrix.getPermutationArray(); - this->permutationArray.setSize( permutationArray.getSize() ); - for( IndexType i = 0; i < permutationArray.getSize(); i++ ) - this->permutationArray.setElement( i, permutationArray[ i ] ); - - Containers::Vector< Index, Device, Index > inversePermutation = matrix.getInversePermutationArray(); - this->inversePermutationArray.setSize( inversePermutation.getize() ); - for( IndexType i = 0; i < inversePermutation.getSize(); i++ ) - this->inversePermutationArray.setElement( i, inversePermutation[ i ] ); - - for( IndexType i = 0; i < this->getRows(); i++ ) - for( IndexType j = 0; j <= i; j++ ) - { - if( matrix.getElement( i, j ) != 0.0 ) - this->setElementFast( i, j, matrix.getElement( i, j ) ); - } - - colorPointers.reset(); - slicePointers.reset(); - sliceRowLengths.reset(); - permutationArray.reset(); - inversePermutation.reset(); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -template< typename InVector, - typename OutVector > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::vectorProductHost( const InVector& inVector, - OutVector& outVector ) const -{ - // simulated cuda SPMV on CPU - for( IndexType i = 0; i < this->getNumberOfColors(); i++ ) - { - IndexType offset = this->colorPointers[ i ]; - IndexType stop = this->colorPointers[ i + 1 ]; - IndexType inSliceIdx = offset % SliceSize; - IndexType sliceOffset = offset - inSliceIdx; - IndexType length = this->colorPointers[ i + 1 ] - this->colorPointers[ i ] + inSliceIdx; - IndexType cudaBlockSize = 256; - IndexType blocks = roundUpDivision( length, cudaBlockSize ); - for( IndexType blockIdx = 0; blockIdx < blocks; blockIdx++ ) - { - for( IndexType warpIdx = 0; warpIdx < 8; warpIdx++ ) - { - IndexType warpSize = 32; - for (IndexType threadIdx = 0; threadIdx < warpSize; threadIdx++) { - IndexType row = blockIdx * cudaBlockSize + warpIdx * warpSize + threadIdx + sliceOffset; - if (row >= stop || row < offset) - continue; - IndexType sliceIdx = row / SliceSize; - IndexType sliceLength = this->sliceRowLengths[sliceIdx]; - IndexType begin = this->slicePointers[sliceIdx] + sliceLength * threadIdx; - IndexType rowMapping = this->inversePermutationArray.getElement(row); - for (IndexType elementPtr = begin; elementPtr < begin + sliceLength; elementPtr++) { - IndexType column = this->columnIndexes[elementPtr]; - if (column == this->getPaddingIndex()) - break; - outVector[rowMapping] += inVector[column] * this->values[elementPtr]; - if (rowMapping != column) - { - outVector[column] += inVector[rowMapping] * this->values[elementPtr]; - } - } - } - } - } - } -} - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__device__ void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths, - const IndexType sliceIdx ) -{ - Index rowIdx = sliceIdx * SliceSize; - Index rowInSliceIdx( 0 ); - Index maxRowLength( 0 ); - if( rowIdx >= this->getRows() ) - return; - while( rowInSliceIdx < SliceSize && rowIdx < this->getRows() ) - { - maxRowLength = Max( maxRowLength, rowLengths[ rowIdx ] ); - rowIdx++; - rowInSliceIdx++; - } - this->sliceRowLengths[ sliceIdx ] = maxRowLength; - this->slicePointers[ sliceIdx ] = maxRowLength * SliceSize; - if( threadIdx.x == 0 ) - this->slicePointers[ this->slicePointers.getSize() - 1 ] = 0; - -} -#endif - -template<> -class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Host > -{ - public: - - typedef Devices::Host Device; - - template< typename Real, - typename Index, - int SliceSize > - static void initRowTraverse( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize; - const Index slicePointer = matrix.slicePointers.getElement( sliceIdx ); - const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx ); - - rowBegin = slicePointer + rowLength * ( matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize ); - rowEnd = rowBegin + rowLength; - step = 1; - } - - template< typename Real, - typename Index, - int SliceSize > - static void initRowTraverseFast( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize; - const Index slicePointer = matrix.slicePointers[ sliceIdx ]; - const Index rowLength = matrix.sliceRowLengths[ sliceIdx ]; - - rowBegin = slicePointer + rowLength * ( matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize ); - rowEnd = rowBegin + rowLength; - step = 1; - } - - - template< typename Real, - typename Index, - int SliceSize > - static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - typename SlicedEllpackSymmetricGraph< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths, - Containers::Vector< Index, Device, Index >& sliceRowLengths, - Containers::Vector< Index, Device, Index >& slicePointers ) - { - /*Index row( 0 ), slice( 0 ), sliceRowLength( 0 ); - while( row < matrix.getRows() ) - { - sliceRowLength = Max( rowLengths.getElement( matrix.permutationArray.getElement( row++ ) ), sliceRowLength ); - if( row % SliceSize == 0 ) - { - sliceRowLengths.setElement( slice, sliceRowLength ); - slicePointers.setElement( slice++, sliceRowLength * SliceSize ); - sliceRowLength = 0; - } - } - if( row % SliceSize != 0 ) - { - sliceRowLengths.setElement( slice, sliceRowLength ); - slicePointers.setElement( slice++, sliceRowLength * SliceSize ); - } - slicePointers.setElement( slicePointers.getSize() - 1, 0 );*/ - - Index sliceRowLength( 0 ); - Index numberOSlices = roundUpDivision( matrix.getRows(), SliceSize ); - Containers::Vector< Index, Device, Index > rowMapToSlice; - rowMapToSlice.setSize( SliceSize ); - for( Index slice = 0; slice < numberOSlices; slice++ ) - { - rowMapToSlice.setValue( -1 ); - Index elementPtr = 0; - for( Index row = 0; row < matrix.getRows() && elementPtr < SliceSize; row++ ) - { - if( matrix.permutationArray.getElement( row ) >= slice * SliceSize && - matrix.permutationArray.getElement( row ) < ( slice + 1 ) * SliceSize ) - { - rowMapToSlice.setElement( elementPtr, row ); - elementPtr++; - } - } - - // TODO: pridej sem nejaky logger! - - Index i = 0; - for( ; i < SliceSize; i++ ) - // sliceRowLength = Max( rowLengths.getElement( matrix.permutationArray.getElement( rowMapToSlice.getElement( row ) ) ), sliceRowLength ); - { - if( rowMapToSlice.getElement( i ) < 0 ) - break; - sliceRowLength = Max( rowLengths.getElement( rowMapToSlice.getElement( i ) ), sliceRowLength ); - } - if( i % SliceSize == 0 || rowMapToSlice.getElement( i ) < 0 ) - { - sliceRowLengths.setElement( slice, sliceRowLength ); - slicePointers.setElement( slice, sliceRowLength * SliceSize ); - sliceRowLength = 0; - } - } - slicePointers.setElement( slicePointers.getSize() - 1, 0 ); - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector, - int SliceSize > - static void vectorProduct( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - const InVector& inVector, - OutVector& outVector ) - { - matrix.vectorProductHost( inVector, outVector ); - } - -}; - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int SliceSize > -__global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >* matrix, - typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVector rowLengths, - int gridIdx ) -{ - const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; - matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx ); -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int SliceSize > -template< typename InVector, - typename OutVector > -__device__ -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::spmvCuda( const InVector& inVector, - OutVector& outVector, - const int globalIdx, - const int color ) const -{ - /*const IndexType offset = this->colorPointers[ i ]; - const IndexType stop = this->colorPointers[ i + 1 ]; - if( globalIdx >= stop || globalIdx < offset ) - return;*/ - - IndexType inSliceIdx = threadIdx.x % SliceSize; - const IndexType sliceIdx = globalIdx / SliceSize; - const IndexType sliceLength = this->sliceRowLengths[ sliceIdx ]; - const IndexType begin = this->slicePointers[ sliceIdx ] + inSliceIdx * sliceLength; - const IndexType rowMapping = this->inversePermutationArray[ globalIdx ]; - for( IndexType elementPtr = begin; elementPtr < begin + sliceLength; elementPtr++ ) - { - IndexType column = this->columnIndexes[ elementPtr ]; - if( column == this->getPaddingIndex() ) - break; - - outVector[ rowMapping ] += inVector[ column ] * this->values[ elementPtr ]; - if( rowMapping != column ) - { - outVector[ column ] += inVector[ rowMapping ] * this->values[ elementPtr ]; - } - } -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int SliceSize, - typename InVector, - typename OutVector > -__global__ -void SlicedEllpackSymmetricGraphVectorProductCuda( const SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >& matrix, - const InVector* inVector, - OutVector* outVector, - const int gridIdx, - const int color, - const int sliceOffset ) -{ - int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + sliceOffset; - matrix->smvCuda( *inVector, *outVector, globalIdx, color ); -} -#endif - -template<> -class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda > -{ - public: - - typedef Devices::Cuda Device; - - template< typename Real, - typename Index, - int SliceSize > - static void initRowTraverse( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize; - const Index slicePointer = matrix.slicePointers.getElement( sliceIdx ); - const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx ); - - rowBegin = slicePointer + matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize; - rowEnd = rowBegin + rowLength * SliceSize; - step = SliceSize; - } - - template< typename Real, - typename Index, - int SliceSize > - __cuda_callable__ - static void initRowTraverseFast( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize; - const Index slicePointer = matrix.slicePointers[ sliceIdx ]; - const Index rowLength = matrix.sliceRowLengths[ sliceIdx ]; - - rowBegin = slicePointer + matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize; - rowEnd = rowBegin + rowLength * SliceSize; - step = SliceSize; - - } - - template< typename Real, - typename Index, - int SliceSize > - static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - typename SlicedEllpackSymmetricGraph< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths, - Containers::Vector< Index, Device, Index >& sliceRowLengths, - Containers::Vector< Index, Device, Index >& slicePointers ) - { -#ifdef HAVE_CUDA - typedef SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > Matrix; - typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_matrix = Cuda::passToDevice( matrix ); - const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>> - ( kernel_matrix, - rowLengths, - gridIdx ); - } - Cuda::freeFromDevice( kernel_matrix ); - TNL_CHECK_CUDA_DEVICE; -#endif - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector, - int SliceSize > - static void vectorProduct( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - const InVector& inVector, - OutVector& outVector ) - { - // TODO: tohle -#ifdef HAVE_CUDA - typedef SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize > Matrix; - typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - for( IndexType color = 0; color < matrix.getNumberOfColors(); color++ ) - { - IndexType offset = matrix.colorPointers.getElement( color ); //can be computed in kernel - // IndexType rowStop = matrix.colorPointers.getElement( color + 1 ); can be computed in kernel - IndexType inSliceOffset = offset % SliceSize; - // TODO: inSliceIdx is undefined - //IndexType rows = matrix.colorPointers.getElement( color + 1 ) - matrix.colorPointers.getElement( color ) + inSliceIdx; - // TODO: rows id undefined - /*const IndexType cudaBlocks = roundUpDivision( rows, cudaBlockSize.x ); - const IndexType cudaGrids = rondUpDivision( cudaBlocks, Cuda::getMaxGridSize ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - // TODO: this cannot be used here and i is undefined - //IndexType offset = this->colorPointers[ i ]; - IndexType inSliceIdx = offset % SliceSize; - IndexType sliceOffset = offset - inSliceIdx; - SlicedEllpackSymmetricGraphVectorProductCuda< Real, Index, InVector, OutVector > - <<< cudaGridSize, cudaBlockSize >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx, - color, - sliceOffset ); - }*/ - } - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_inVector ); - Cuda::freeFromDevice( kernel_outVector ); - TNL_CHECK_CUDA_DEVICE; -#endif - } - -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric_impl.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric_impl.h deleted file mode 100644 index 46475ac20..000000000 --- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric_impl.h +++ /dev/null @@ -1,930 +0,0 @@ -/*************************************************************************** - SlocedEllpackSymmetric_impl.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::SlicedEllpackSymmetric() -{ -}; - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -String SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getType() -{ - return String( "Matrices::SlicedEllpackSymmetric< ") + - String( TNL::getType< Real >() ) + - String( ", " ) + - String( Device :: getDeviceType() ) + - String( ", " ) + - String( TNL::getType< Index >() ) + - String( " >" ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -String SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getTypeVirtual() const -{ - return this->getType(); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setDimensions( const IndexType rows, - const IndexType columns ) -{ - TNL_ASSERT( rows > 0 && columns > 0, - std::cerr << "rows = " << rows - << " columns = " << columns <::setDimensions( rows, columns ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) -{ - TNL_ASSERT( this->getRows() > 0, ); - TNL_ASSERT( this->getColumns() > 0, ); - const IndexType slices = roundUpDivision( this->rows, SliceSize ); - this->sliceRowLengths.setSize( slices ); - this->slicePointers.setSize( slices + 1 ); - - // TODO: Uncomment the next line and fix the compilation - //DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths ); - - throw std::runtime_error("code fix required"); - - this->maxRowLength = max( rowLengths ); - - this->slicePointers.template scan< Algorithms::ScanType::Exclusive >(); - this->allocateMatrixElements( this->slicePointers.getElement( slices ) ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Index SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getRowLength( const IndexType row ) const -{ - const IndexType slice = roundUpDivision( row, SliceSize ); - return this->sliceRowLengths[ slice ]; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Device2, - typename Index2 > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setLike( const SlicedEllpackSymmetric< Real2, Device2, Index2, SliceSize >& matrix ) -{ - if( !Sparse< Real, Device, Index >::setLike( matrix ) || - ! this->slicePointers.setLike( matrix.slicePointers ) || - ! this->sliceRowLengths.setLike( matrix.sliceRowLengths ) ) - return false; - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::reset() -{ - Sparse< Real, Device, Index >::reset(); - this->slicePointers.reset(); - this->sliceRowLengths.reset(); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Device2, - typename Index2 > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::operator == ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const -{ - TNL_ASSERT( this->getRows() == matrix.getRows() && - this->getColumns() == matrix.getColumns(), - std::cerr << "this->getRows() = " << this->getRows() - << " matrix.getRows() = " << matrix.getRows() - << " this->getColumns() = " << this->getColumns() - << " matrix.getColumns() = " << matrix.getColumns() - << " this->getName() = " << this->getName() - << " matrix.getName() = " << matrix.getName() ); - // TODO: implement this - throw Exceptions::NotImplementedError( "SlicedEllpackSymmetric::operator== is not implemented." ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Device2, - typename Index2 > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::operator != ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const -{ - return ! ( ( *this ) == matrix ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setElementFast( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElementFast( row, column, value, 0.0 ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setElement( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElement( row, column, value, 0.0 ); -} - - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - TNL_ASSERT( row >= 0 && row < this->rows && - column >= 0 && column <= this->rows, - std::cerr << " row = " << row - << " column = " << column - << " this->rows = " << this->rows - << " this->columns = " << this-> columns ); - - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes[ elementPtr ] ) < column && - col != this->getPaddingIndex() ) elementPtr += step; - if( elementPtr == rowEnd ) - return false; - if( col == column ) - { - this->values[ elementPtr ] = thisElementMultiplicator * this->values[ elementPtr ] + value; - return true; - } - if( col == this->getPaddingIndex() ) - { - this->columnIndexes[ elementPtr ] = column; - this->values[ elementPtr ] = value; - return true; - } - IndexType j = rowEnd - step; - while( j > elementPtr ) - { - this->columnIndexes[ j ] = this->columnIndexes[ j - step ]; - this->values[ j ] = this->values[ j - step ]; - j -= step; - } - this->columnIndexes[ elementPtr ] = column; - this->values[ elementPtr ] = value; - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - TNL_ASSERT( row >= 0 && row < this->rows && - column >= 0 && column <= this->rows, - std::cerr << " row = " << row - << " column = " << column - << " this->rows = " << this->rows - << " this->columns = " << this-> columns ); - - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes.getElement( elementPtr ) ) < column && - col != this->getPaddingIndex() ) elementPtr += step; - if( elementPtr == rowEnd ) - return false; - if( col == column ) - { - this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value ); - return true; - } - if( col == this->getPaddingIndex() ) - { - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; - } - IndexType j = rowEnd - step; - while( j > elementPtr ) - { - this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) ); - this->values.setElement( j, this->values.getElement( j - step ) ); - j -= step; - } - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - const IndexType sliceIdx = row / SliceSize; - const IndexType rowLength = this->sliceRowLengths[ sliceIdx ]; - if( elements > rowLength ) - return false; - - Index elementPointer, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, row, elementPointer, rowEnd, step ); - - for( IndexType i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes[ elementPointer ] = columnIndexes[ i ]; - this->values[ elementPointer ] = values[ i ]; - elementPointer += step; - } - for( IndexType i = elements; i < rowLength; i++ ) - { - this->columnIndexes[ elementPointer ] = this->getPaddingIndex(); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - const IndexType sliceIdx = row / SliceSize; - const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx ); - if( elements > rowLength ) - return false; - - Index elementPointer, rowEnd, step; - DeviceDependentCode::initRowTraverse( *this, row, elementPointer, rowEnd, step ); - - for( IndexType i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes.setElement( elementPointer, column ); - this->values.setElement( elementPointer, values[ i ] ); - elementPointer += step; - } - for( IndexType i = elements; i < rowLength; i++ ) - { - this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() ); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - // TODO: implement - return false; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - // TODO: implement - return false; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -Real SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getElementFast( const IndexType row, - const IndexType column ) const -{ - if( row < column ) - return this->getElementFast( column, row ); - - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes[ elementPtr ] ) < column && - col != this->getPaddingIndex() ) - elementPtr += step; - if( elementPtr < rowEnd && col == column ) - return this->values[ elementPtr ]; - return 0.0; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Real SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getElement( const IndexType row, - const IndexType column ) const -{ - if( row < column ) - return this->getElement( column, row ); - - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes.getElement( elementPtr ) ) < column && - col != this->getPaddingIndex() ) - elementPtr += step; - if( elementPtr < rowEnd && col == column ) - return this->values.getElement( elementPtr ); - return 0.0; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - Index elementPtr, rowEnd, step, i( 0 ); - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - while( elementPtr < rowEnd ) - { - columns[ i ] = this->columnIndexes[ elementPtr ]; - values[ i ] = this->values[ elementPtr ]; - elementPtr += step; - i++; - } -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getRow( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - Index elementPtr, rowEnd, step, i( 0 ); - DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step ); - - while( elementPtr < rowEnd ) - { - columns[ i ] = this->columnIndexes.getElement( elementPtr ); - values[ i ] = this->values.getElement( elementPtr ); - elementPtr += step; - i++; - } -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -template< typename InVector, - typename OutVector > -__cuda_callable__ -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::rowVectorProduct( const IndexType row, - const InVector& inVector, - OutVector& outVector ) const -{ - Real result = 0.0; - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - IndexType column; - while( elementPtr < rowEnd && - ( column = this->columnIndexes[ elementPtr ] ) < this->columns && - column != this->getPaddingIndex() ) - { - result += this->values[ elementPtr ] * inVector[ column ]; - if( row != column ) - outVector[ column ] += this->values[ elementPtr ] * inVector[ row ]; - elementPtr += step; - } - outVector[ row ] += result; -} - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int SliceSize > -template< typename InVector, - typename OutVector > -__device__ -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::spmvCuda( const InVector& inVector, - OutVector& outVector, - int rowIdx ) const -{ - if( rowIdx >= this->getRows() ) - return; - - Real result = 0.0; - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, rowIdx, elementPtr, rowEnd, step ); - IndexType column; - while( elementPtr < rowEnd && - ( column = this->columnIndexes[ elementPtr ] ) < this->columns && - column != this->getPaddingIndex() ) - { - result += this->values[ elementPtr ] * inVector[ column ]; - if( rowIdx != column ) - outVector[ column ] += this->values[ elementPtr ] * inVector[ rowIdx ]; - elementPtr += step; - } - outVector[ rowIdx ] += result; -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int SliceSize, - typename InVector, - typename OutVector > -__global__ -void SlicedEllpackSymmetricVectorProductCudaKernel( -const SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >* matrix, - const InVector* inVector, - OutVector* outVector, - int gridIdx ) -{ - int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - matrix->spmvCuda( *inVector, *outVector, rowIdx ); -} -#endif - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename InVector, - typename OutVector > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::vectorProduct( const InVector& inVector, - OutVector& outVector ) const -{ - DeviceDependentCode::vectorProduct( *this, inVector, outVector ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Index2 > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::addMatrix( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator, - const RealType& thisMatrixMultiplicator ) -{ - throw Exceptions::NotImplementedError( "SlicedEllpackSymmetric::addMatrix is not implemented." ); - // TODO: implement -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Index2 > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getTransposition( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator ) -{ - throw Exceptions::NotImplementedError( "SlicedEllpackSymmetric::getTransposition is not implemented." ); - // TODO: implement -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Vector > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::performSORIteration( const Vector& b, - const IndexType row, - Vector& x, - const RealType& omega ) const -{ - TNL_ASSERT( row >=0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << this->getName() <sliceRowLengths[ sliceIdx ]; - IndexType elementPtr = this->slicePointers[ sliceIdx ] + - rowLength * ( row - sliceIdx * SliceSize ); - const IndexType rowEnd( elementPtr + rowLength ); - IndexType column; - while( elementPtr < rowEnd && ( column = this->columnIndexes[ elementPtr ] ) < this->columns ) - { - if( column == row ) - diagonalValue = this->values.getElement( elementPtr ); - else - sum += this->values.getElement( row * this->diagonalsShift.getSize() + elementPtr ) * x. getElement( column ); - elementPtr++; - } - if( diagonalValue == ( Real ) 0.0 ) - { - std::cerr << "There is zero on the diagonal in " << row << "-th row of thge matrix " << this->getName() << ". I cannot perform SOR iteration." < -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::save( File& file ) const -{ - Sparse< Real, Device, Index >::save( file ); - file << this->slicePointers << this->sliceRowLengths; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::load( File& file ) -{ - Sparse< Real, Device, Index >::load( file ); - file >> this->slicePointers >> this->sliceRowLengths; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::save( const String& fileName ) const -{ - Object::save( fileName ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::load( const String& fileName ) -{ - Object::load( fileName ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::print( std::ostream& str ) const -{ - for( IndexType row = 0; row < this->getRows(); row++ ) - { - str <<"Row: " << row << " -> "; - const IndexType sliceIdx = row / SliceSize; - const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx ); - IndexType elementPtr = this->slicePointers.getElement( sliceIdx ) + - rowLength * ( row - sliceIdx * SliceSize ); - const IndexType rowEnd( elementPtr + rowLength ); - while( elementPtr < rowEnd && - this->columnIndexes.getElement( elementPtr ) < this->columns && - this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) - { - const Index column = this->columnIndexes.getElement( elementPtr ); - str << " Col:" << column << "->" << this->values.getElement( elementPtr ) << "\t"; - elementPtr++; - } - str < -__device__ void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths, - const IndexType sliceIdx ) -{ - Index rowIdx = sliceIdx * SliceSize; - Index rowInSliceIdx( 0 ); - Index maxRowLength( 0 ); - if( rowIdx >= this->getRows() ) - return; - while( rowInSliceIdx < SliceSize && rowIdx < this->getRows() ) - { - maxRowLength = Max( maxRowLength, rowLengths[ rowIdx ] ); - rowIdx++; - rowInSliceIdx++; - } - this->sliceRowLengths[ sliceIdx ] = maxRowLength; - this->slicePointers[ sliceIdx ] = maxRowLength * SliceSize; - if( threadIdx.x == 0 ) - this->slicePointers[ this->slicePointers.getSize() - 1 ] = 0; - -} -#endif - -template<> -class SlicedEllpackSymmetricDeviceDependentCode< Devices::Host > -{ - public: - - typedef Devices::Host Device; - - template< typename Real, - typename Index, - int SliceSize > - static void initRowTraverse( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = row / SliceSize; - const Index slicePointer = matrix.slicePointers.getElement( sliceIdx ); - const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx ); - - rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize ); - rowEnd = rowBegin + rowLength; - step = 1; - } - - template< typename Real, - typename Index, - int SliceSize > - __cuda_callable__ - static void initRowTraverseFast( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = row / SliceSize; - const Index slicePointer = matrix.slicePointers[ sliceIdx ]; - const Index rowLength = matrix.sliceRowLengths[ sliceIdx ]; - - rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize ); - rowEnd = rowBegin + rowLength; - step = 1; - } - - - template< typename Real, - typename Index, - int SliceSize > - static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - typename SlicedEllpackSymmetric< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths ) - { - Index row( 0 ), slice( 0 ), sliceRowLength( 0 ); - while( row < matrix.getRows() ) - { - sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength ); - if( row % SliceSize == 0 ) - { - matrix.sliceRowLengths.setElement( slice, sliceRowLength ); - matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize ); - sliceRowLength = 0; - } - } - if( row % SliceSize != 0 ) - { - matrix.sliceRowLengths.setElement( slice, sliceRowLength ); - matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize ); - } - matrix.slicePointers.setElement( matrix.slicePointers.getSize() - 1, 0 ); - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector, - int SliceSize > - static void vectorProduct( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - const InVector& inVector, - OutVector& outVector ) - { - for( Index row = 0; row < matrix.getRows(); row++ ) - { - matrix.rowVectorProduct( row, inVector, outVector ); - } - } - -}; - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int SliceSize > -__global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >* matrix, - typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths, - int gridIdx ) -{ - const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; - matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx ); -} -#endif - -template<> -class SlicedEllpackSymmetricDeviceDependentCode< Devices::Cuda > -{ - public: - - typedef Devices::Cuda Device; - - template< typename Real, - typename Index, - int SliceSize > - static void initRowTraverse( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = row / SliceSize; - const Index slicePointer = matrix.slicePointers.getElement( sliceIdx ); - const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx ); - - rowBegin = slicePointer + row - sliceIdx * SliceSize; - rowEnd = rowBegin + rowLength * SliceSize; - step = SliceSize; - } - - template< typename Real, - typename Index, - int SliceSize > - __cuda_callable__ - static void initRowTraverseFast( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = row / SliceSize; - const Index slicePointer = matrix.slicePointers[ sliceIdx ]; - const Index rowLength = matrix.sliceRowLengths[ sliceIdx ]; - - rowBegin = slicePointer + row - sliceIdx * SliceSize; - rowEnd = rowBegin + rowLength * SliceSize; - step = SliceSize; - - } - - template< typename Real, - typename Index, - int SliceSize > - static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - typename SlicedEllpackSymmetric< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths ) - { -#ifdef HAVE_CUDA - typedef SlicedEllpackSymmetric< Real, Device, Index, SliceSize > Matrix; - typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_matrix = Cuda::passToDevice( matrix ); - const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>> - ( kernel_matrix, - rowLengths, - gridIdx ); - } - Cuda::freeFromDevice( kernel_matrix ); - TNL_CHECK_CUDA_DEVICE; -#endif - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector, - int SliceSize > - static void vectorProduct( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - const InVector& inVector, - OutVector& outVector ) - { -#ifdef HAVE_CUDA - typedef SlicedEllpackSymmetric< Real, Device, Index, SliceSize > Matrix; - typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - SlicedEllpackSymmetricVectorProductCudaKernel< Real, Index, SliceSize, InVector, OutVector > - <<< cudaGridSize, cudaBlockSize >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx ); - } - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_inVector ); - Cuda::freeFromDevice( kernel_outVector ); - TNL_CHECK_CUDA_DEVICE; -#endif - } - -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL -- GitLab From d2ff1d4bdf74041e8de3332a72487fe3de523b44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 5 Aug 2020 19:49:49 +0200 Subject: [PATCH 55/57] Deleted old SpMV benchmark. --- .../SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp | 14 - .../SpMV/OldSpMV/tnl-benchmark-old-spmv.cu | 12 - .../SpMV/OldSpMV/tnl-benchmark-old-spmv.h | 925 ------------------ .../SpMV/OldSpMV/tnlCusparseCSRMatrix.h | 162 --- 4 files changed, 1113 deletions(-) delete mode 100644 src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp delete mode 100644 src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu delete mode 100644 src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h delete mode 100644 src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp deleted file mode 100644 index c9cd17cda..000000000 --- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp +++ /dev/null @@ -1,14 +0,0 @@ -/*************************************************************************** - tnl-benchmark-spmv.cpp - description - ------------------- - begin : Jun 5, 2014 - copyright : (C) 2014 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - - -#include "tnl-benchmark-old-spmv.h" - - diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu deleted file mode 100644 index 433af970b..000000000 --- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu +++ /dev/null @@ -1,12 +0,0 @@ -/*************************************************************************** - tnl-benchmark-spmv.cu - description - ------------------- - begin : Jun 5, 2014 - copyright : (C) 2014 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - - -#include "tnl-benchmark-old-spmv.h" diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h deleted file mode 100644 index 455c7d412..000000000 --- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h +++ /dev/null @@ -1,925 +0,0 @@ -/*************************************************************************** - tnl-benchmark-spmv.h - description - ------------------- - begin : Jun 5, 2014 - copyright : (C) 2014 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#ifdef NOT_USED_ANYMORE - -#pragma once - -#include -#include -#include -#ifdef HAVE_CUDA -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "tnlCusparseCSRMatrix.h" - -using namespace std; -using namespace TNL; -using namespace TNL::Matrices; - -void setupConfig( Config::ConfigDescription& config ) -{ - config.addDelimiter ( "General settings:" ); - config.addRequiredEntry< String >( "test" , "Test to be performed." ); - config.addEntryEnum< String >( "mtx" ); - config.addEntryEnum< String >( "tnl" ); - config.addRequiredEntry< String >( "input-file" , "Input file name." ); - config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log"); - config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); - config.addEntry< double >( "stop-time", "Seconds to iterate the SpMV operation.", 1.0 ); - config.addEntry< int >( "verbose", "Verbose mode.", 1 ); -} - -bool initLogFile( std::fstream& logFile, const String& fileName ) -{ - if( access( fileName.getString(), F_OK ) == -1 ) - { - logFile.open( fileName.getString(), std::ios::out ); - if( ! logFile ) - return false; - const String fillingColoring = " : COLORING 0 #FFF8DC 20 #FFFF00 40 #FFD700 60 #FF8C0 80 #FF0000 100"; - const String speedupColoring = " : COLORING #0099FF 1 #FFFFFF 2 #00FF99 4 #33FF99 8 #33FF22 16 #FF9900"; - const String paddingColoring = " : COLORING #FFFFFF 1 #FFFFCC 10 #FFFF99 100 #FFFF66 1000 #FFFF33 10000 #FFFF00"; - logFile << "#Matrix file " << std::endl; - logFile << "#Rows" << std::endl; - logFile << "#Columns" << std::endl; - logFile << "#Non-zero elements" << std::endl; - logFile << "#Filling (in %)" << fillingColoring << std::endl; - logFile << "#CSR Format" << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << std::endl; -#ifdef HAVE_CUDA - logFile << "# Cusparse CSR" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - cusparse-csr-speedup.txt" << std::endl; - logFile << "# CUDA" << std::endl; - logFile << "# Scalar" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-scalar-cuda-speedup.txt" << std::endl; - logFile << "# Vector" << std::endl; - logFile << "# Warp Size 1" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-1-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 2" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-2-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 4" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-4-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 8" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-8-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 16" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-16-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 32" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-32-cuda-speedup.txt" << std::endl; - logFile << "# Hybrid" << std::endl; - logFile << "# Split 2" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-2-cuda-speedup.txt" << std::endl; - logFile << "# Split 4" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-4-cuda-speedup.txt" << std::endl; - logFile << "# Split 8" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-8-cuda-speedup.txt" << std::endl; - logFile << "# Split 16" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-16-cuda-speedup.txt" << std::endl; - logFile << "# Split 32" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-32-cuda-speedup.txt" << std::endl; - logFile << "# Split 64" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-64-cuda-speedup.txt" << std::endl; -#endif - logFile << "#Ellpack Format" << std::endl; - logFile << "# Padding (in %)" << paddingColoring << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - ellpack-host-speedup.txt" << std::endl; -#ifdef HAVE_CUDA - logFile << "# CUDA" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - ellpack-cuda-speedup.txt" << std::endl; -#endif - logFile << "#SlicedEllpack Format" << std::endl; - logFile << "# Padding (in %)" << paddingColoring << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - sliced-ellpack-host-speedup.txt" << std::endl; -#ifdef HAVE_CUDA - logFile << "# CUDA" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - sliced-ellpack-cuda-speedup.txt" << std::endl; -#endif - logFile << "#ChunkedEllpack Format" << std::endl; - logFile << "# Padding (in %)" << paddingColoring << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - chunked-ellpack-host-speedup.txt" << std::endl; -#ifdef HAVE_CUDA - logFile << "# CUDA" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - chunked-ellpack-cuda-speedup.txt" << std::endl; -#endif - return true; - } - logFile.open( fileName.getString(), std::ios::out | std::ios::app ); - //logFile << std::setprecision( 2 ); - if( ! logFile ) - return false; - return true; -} - -template< typename Matrix > -void printMatrixInfo( const String& inputFileName, - const Matrix& matrix, - std::ostream& str ) -{ - str << " Rows: " << std::setw( 8 ) << matrix.getRows(); - str << " Columns: " << std::setw( 8 ) << matrix.getColumns(); - str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements(); - const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements(); - str << " Filling: " << std::setw( 5 ) << 100.0 * fillingRatio << "%" << std::endl; - str << std::setw( 25 ) << "Format" - << std::setw( 15 ) << "Padding" - << std::setw( 15 ) << "Time" - << std::setw( 15 ) << "GFLOPS" - << std::setw( 15 ) << "Throughput" - << std::setw( 15 ) << "Speedup" << std::endl; -} - -template< typename Matrix > -bool writeMatrixInfo( const String& inputFileName, - const Matrix& matrix, - std::ostream& logFile ) -{ - logFile << std::endl; - logFile << inputFileName << std::endl; - logFile << " " << matrix.getRows() << std::endl; - logFile << " " << matrix.getColumns() << std::endl; - logFile << " " << matrix.getNumberOfNonzeroMatrixElements() << std::endl; - const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements(); - logFile << " " << 100.0 * fillingRatio << std::endl; - logFile << std::flush; - if( ! logFile.good() ) - return false; - return true; -} - -double computeGflops( const long int nonzeroElements, - const int iterations, - const double& time ) -{ - return ( double ) ( 2 * iterations * nonzeroElements ) / time * 1.0e-9; -} - -template< typename Real > -double computeThroughput( const long int nonzeroElements, - const int iterations, - const int rows, - const double& time ) -{ - return ( double ) ( ( 2 * nonzeroElements + rows ) * iterations ) * sizeof( Real ) / time * 1.0e-9; -} - -template< typename Matrix, - typename Vector > -double benchmarkMatrix( const Matrix& matrix, - const Vector& x, - Vector& b, - const long int nonzeroElements, - const char* format, - const double& stopTime, - const double& baseline, - int verbose, - std::fstream& logFile ) -{ - Timer timer; - timer.start(); - double time( 0.0 ); - int iterations( 0 ); - while( time < stopTime ) - { - matrix.vectorProduct( x, b ); -#ifdef HAVE_CUDA - if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - time = timer.getRealTime(); - iterations++; - } - const double gflops = computeGflops( nonzeroElements, iterations, time ); - const double throughput = computeThroughput< typename Matrix::RealType >( nonzeroElements, iterations, matrix.getRows(), time ); - const long int allocatedElements = matrix.getNumberOfMatrixElements(); - const double padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - if( verbose ) - { - std::cout << std::setw( 25 ) << format - << std::setw( 15 ) << padding - << std::setw( 15 ) << time - << std::setw( 15 ) << gflops - << std::setw( 15 ) << throughput; - if( baseline ) - std::cout << std::setw( 15 ) << gflops / baseline << std::endl; - else - std::cout << std::setw( 15 ) << "N/A" << std::endl; - } - logFile << " " << gflops << std::endl; - logFile << " " << throughput << std::endl; - if( baseline ) - logFile << gflops / baseline << std::endl; - else - logFile << "N/A" << std::endl; - return gflops; -} - -void writeTestFailed( std::fstream& logFile, - int repeat ) -{ - for( int i = 0; i < repeat; i++ ) - logFile << "N/A" << std::endl; -} - -template< typename Real > -bool setupBenchmark( const Config::ParameterContainer& parameters ) -{ - const String& test = parameters.getParameter< String >( "test" ); - const String& inputFileName = parameters.getParameter< String >( "input-file" ); - const String& logFileName = parameters.getParameter< String >( "log-file" ); - const int verbose = parameters.getParameter< int >( "verbose" ); - const double stopTime = parameters.getParameter< double >( "stop-time" ); - std::fstream logFile; - if( ! initLogFile( logFile, logFileName ) ) - { - std::cerr << "I am not able to open the file " << logFileName << "." << std::endl; - return false; - } - if( test == "mtx" ) - { - typedef Matrices::CSR< Real, Devices::Host, int > CSRType; - CSRType csrMatrix; - try - { - if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) ) - { - std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl; - logFile << std::endl; - logFile << inputFileName << std::endl; - logFile << "Benchmark failed: Unable to read the matrix." << std::endl; - return false; - } - } - catch( std::bad_alloc ) - { - std::cerr << "Not enough memory to read the matrix." << std::endl; - logFile << std::endl; - logFile << inputFileName << std::endl; - logFile << "Benchmark failed: Not enough memory." << std::endl; - return false; - } - if( verbose ) - printMatrixInfo( inputFileName, csrMatrix,std::cout ); - if( ! writeMatrixInfo( inputFileName, csrMatrix, logFile ) ) - { - std::cerr << "I am not able to write new matrix to the log file." << std::endl; - return false; - } - const int rows = csrMatrix.getRows(); - const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements(); - Containers::Vector< int, Devices::Host, int > rowLengthsHost; - rowLengthsHost.setSize( rows ); - for( int row = 0; row < rows; row++ ) - rowLengthsHost[ row ] = csrMatrix.getRowLength( row ); - - typedef Containers::Vector< Real, Devices::Host, int > HostVector; - HostVector hostX, hostB; - hostX.setSize( csrMatrix.getColumns() ); - hostX.setValue( 1.0 ); - hostB.setSize( csrMatrix.getRows() ); -#ifdef HAVE_CUDA - typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; - CudaVector cudaX, cudaB; - Containers::Vector< int, Devices::Cuda, int > rowLengthsCuda; - cudaX.setSize( csrMatrix.getColumns() ); - cudaX.setValue( 1.0 ); - cudaB.setSize( csrMatrix.getRows() ); - rowLengthsCuda.setSize( csrMatrix.getRows() ); - rowLengthsCuda = rowLengthsHost; - cusparseHandle_t cusparseHandle; - cusparseCreate( &cusparseHandle ); -#endif - const double baseline = benchmarkMatrix( csrMatrix, - hostX, - hostB, - nonzeroElements, - "CSR Host", - stopTime, - 0.0, - verbose, - logFile ); -#ifdef HAVE_CUDA - typedef CSR< Real, Devices::Cuda, int > CSRCudaType; - CSRCudaType cudaCSR; - //cout << "Copying matrix to GPU... "; - cudaCSR = csrMatrix; - TNL::CusparseCSR< Real > cusparseCSR; - cusparseCSR.init( cudaCSR, &cusparseHandle ); - benchmarkMatrix( cusparseCSR, - cudaX, - cudaB, - nonzeroElements, - "Cusparse CSR", - stopTime, - baseline, - verbose, - logFile ); - cusparseDestroy( cusparseHandle ); - - std::cout << " done. \r"; - /*cudaCSR.setCudaKernelType( CSRCudaType::scalar ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Scalar", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaKernelType( CSRCudaType::vector ); - cudaCSR.setCudaWarpSize( 1 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 1", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 2 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 2", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 4 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 4", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 8 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 8", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 16 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 16", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 32 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 32", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaKernelType( CSRCudaType::hybrid ); - cudaCSR.setHybridModeSplit( 2 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 2", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 4 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 4", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 8 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 8", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 16 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 16", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 32 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 32", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 64 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 64", - stopTime, - baseline, - verbose, - logFile );*/ - cudaCSR.reset(); -#endif - - long int allocatedElements; - double padding; - typedef Ellpack< Real, Devices::Host, int > EllpackType; - EllpackType ellpackMatrix; - Matrices::copySparseMatrix( ellpackMatrix, csrMatrix ); - allocatedElements = ellpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding << std::endl; - benchmarkMatrix( ellpackMatrix, - hostX, - hostB, - nonzeroElements, - "Ellpack Host", - stopTime, - baseline, - verbose, - logFile ); -#ifdef HAVE_CUDA - typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType; - EllpackCudaType cudaEllpack; - std::cout << "Copying matrix to GPU... "; - cudaEllpack = ellpackMatrix; - std::cout << " done. \r"; - benchmarkMatrix( cudaEllpack, - cudaX, - cudaB, - nonzeroElements, - "Ellpack Cuda", - stopTime, - baseline, - verbose, - logFile ); - cudaEllpack.reset(); -#endif - ellpackMatrix.reset(); - - typedef Matrices::EllpackSymmetric< Real, Devices::Host, int > EllpackSymmetricType; - EllpackSymmetricType EllpackSymmetric; - if( ! MatrixReader< EllpackSymmetricType >::readMtxFile( inputFileName, EllpackSymmetric, verbose, true ) ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = EllpackSymmetric.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < EllpackSymmetricCudaType; - EllpackSymmetricCudaType cudaEllpackSymmetric; - std::cout << "Copying matrix to GPU... "; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = EllpackSymmetric.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - - // TODO: fix this - //if( ! cudaEllpackSymmetric.copyFrom( EllpackSymmetric, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." < SlicedEllpackMatrixType; - SlicedEllpackMatrixType slicedEllpackMatrix; - if( ! Matrices::MatrixReader< SlicedEllpackMatrixType >::readMtxFile( inputFileName, slicedEllpackMatrix, verbose ) ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = slicedEllpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100; - logFile << " " << padding < SlicedEllpackMatrixCudaType; - SlicedEllpackMatrixCudaType cudaSlicedEllpackMatrix; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = slicedEllpackMatrix.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - // TODO: fix - //if( ! cudaSlicedEllpackMatrix.copyFrom( slicedEllpackMatrix, rowLengthsCuda ) ) - { - std::cerr << "Nejde zkopirovat" < ChunkedEllpackType; - ChunkedEllpackType chunkedEllpack; - Matrices::copySparseMatrix( chunkedEllpack, csrMatrix ); - allocatedElements = chunkedEllpack.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding << std::endl; - benchmarkMatrix( chunkedEllpack, - hostX, - hostB, - nonzeroElements, - "ChunkedEllpack Host", - stopTime, - baseline, - verbose, - logFile ); - -#ifdef HAVE_CUDA - typedef Matrices::ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType; - ChunkedEllpackCudaType cudaChunkedEllpack; - std::cout << "Copying matrix to GPU... "; - cudaChunkedEllpack = chunkedEllpack; - std::cout << " done. \r"; - benchmarkMatrix( cudaChunkedEllpack, - cudaX, - cudaB, - nonzeroElements, - "ChunkedEllpack Cuda", - stopTime, - baseline, - verbose, - logFile ); - cudaChunkedEllpack.reset(); -#endif - - typedef Matrices::BiEllpack< Real, Devices::Host, int > BiEllpackMatrixType; - BiEllpackMatrixType biEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - // copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - /*if( ! biEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) ) - writeTestFailed( logFile, 7 ); - else*/ - { - allocatedElements = biEllpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < BiEllpackMatrixCudaType; - BiEllpackMatrixCudaType cudaBiEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - // copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - std::cout << "Copying matrix to GPU... "; - /*if( ! cudaBiEllpackMatrix.copyFrom( biEllpackMatrix, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." < SlicedEllpackSymmetricType; - SlicedEllpackSymmetricType slicedEllpackSymmetric; - if( ! Matrices::MatrixReader< SlicedEllpackSymmetricType >::readMtxFile( inputFileName, slicedEllpackSymmetric, verbose, true ) ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = slicedEllpackSymmetric.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < SlicedEllpackSymmetricCudaType; - SlicedEllpackSymmetricCudaType cudaSlicedEllpackSymmetric; - std::cout << "Copying matrix to GPU... "; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = slicedEllpackSymmetric.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - // TODO: fiox the nest line - //if( ! cudaSlicedEllpackSymmetric.copyFrom( slicedEllpackSymmetric, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." < EllpackSymmetricGraphMatrixType; - EllpackSymmetricGraphMatrixType EllpackSymmetricGraphMatrix; - if( ! Matrices::MatrixReader< EllpackSymmetricGraphMatrixType >::readMtxFile( inputFileName, EllpackSymmetricGraphMatrix, verbose, true ) || - ! EllpackSymmetricGraphMatrix.help() ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = EllpackSymmetricGraphMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < EllpackSymmetricGraphMatrixCudaType; - EllpackSymmetricGraphMatrixCudaType cudaEllpackSymmetricGraphMatrix; - std::cout << "Copying matrix to GPU... "; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = EllpackSymmetricGraphMatrix.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - // TODO: fix it - //if( ! cudaEllpackSymmetricGraphMatrix.copyFrom( EllpackSymmetricGraphMatrix, rowLengthsCuda ) ) - { - writeTestFailed( logFile, 3 ); - } - //else if( ! cudaEllpackSymmetricGraphMatrix.help() ) - { - writeTestFailed( logFile, 3 ); - } - //else - { - std::cout << " done. \r"; - benchmarkMatrix( cudaEllpackSymmetricGraphMatrix, - cudaX, - cudaB, - nonzeroElements, - "Ellpack Graph Cuda", - stopTime, - baseline, - verbose, - logFile ); - } - cudaEllpackSymmetricGraphMatrix.reset(); -#endif - } - - - typedef Matrices::AdEllpack< Real, Devices::Host, int > AdEllpackMatrixType; - AdEllpackMatrixType adEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - /*if( ! adEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) ) - writeTestFailed( logFile, 7 ); - else*/ - { - allocatedElements = adEllpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < AdEllpackMatrixCudaType; - AdEllpackMatrixCudaType cudaAdEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - std::cout << "Copying matrix to GPU... "; - /*if( ! cudaAdEllpackMatrix.copyFrom( csrMatrix, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." <( "precision" ); - if( precision == "float" ) - if( ! setupBenchmark< float >( parameters ) ) - return EXIT_FAILURE; - if( precision == "double" ) - if( ! setupBenchmark< double >( parameters ) ) - return EXIT_FAILURE; - return EXIT_SUCCESS; -} - -#endif \ No newline at end of file diff --git a/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h b/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h deleted file mode 100644 index fbef4f9a2..000000000 --- a/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h +++ /dev/null @@ -1,162 +0,0 @@ -/*************************************************************************** - tnlCusparseCSR.h - description - ------------------- - begin : Jul 3, 2014 - copyright : (C) 2014 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#ifdef NOT_USED_ANYMORE - -#include -#include -#ifdef HAVE_CUDA -#include -#endif - -namespace TNL { - -template< typename Real > -class CusparseCSRBase -{ - public: - typedef Real RealType; - typedef Devices::Cuda DeviceType; - typedef Matrices::CSR< RealType, Devices::Cuda, int > MatrixType; - - CusparseCSRBase() - : matrix( 0 ) - { - }; - -#ifdef HAVE_CUDA - void init( const MatrixType& matrix, - cusparseHandle_t* cusparseHandle ) - { - this->matrix = &matrix; - this->cusparseHandle = cusparseHandle; - cusparseCreateMatDescr( & this->matrixDescriptor ); - }; -#endif - - int getRows() const - { - return matrix->getRows(); - } - - int getColumns() const - { - return matrix->getColumns(); - } - - int getNumberOfMatrixElements() const - { - return matrix->getNumberOfMatrixElements(); - } - - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const - { - TNL_ASSERT_TRUE( matrix, "matrix was not initialized" ); -#ifdef HAVE_CUDA - cusparseDcsrmv( *( this->cusparseHandle ), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->matrix->getRows(), - this->matrix->getColumns(), - this->matrix->values.getSize(), - 1.0, - this->matrixDescriptor, - this->matrix->values.getData(), - this->matrix->rowPointers.getData(), - this->matrix->columnIndexes.getData(), - inVector.getData(), - 1.0, - outVector.getData() ); -#endif - } - - protected: - - const MatrixType* matrix; -#ifdef HAVE_CUDA - cusparseHandle_t* cusparseHandle; - - cusparseMatDescr_t matrixDescriptor; -#endif -}; - - -template< typename Real > -class CusparseCSR -{}; - -template<> -class CusparseCSR< double > : public CusparseCSRBase< double > -{ - public: - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const - { - TNL_ASSERT_TRUE( matrix, "matrix was not initialized" ); -#ifdef HAVE_CUDA - double d = 1.0; - double* alpha = &d; - cusparseDcsrmv( *( this->cusparseHandle ), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->matrix->getRows(), - this->matrix->getColumns(), - this->matrix->getValues().getSize(), - alpha, - this->matrixDescriptor, - this->matrix->getValues().getData(), - this->matrix->getRowPointers().getData(), - this->matrix->getColumnIndexes().getData(), - inVector.getData(), - alpha, - outVector.getData() ); -#endif - } -}; - -template<> -class CusparseCSR< float > : public CusparseCSRBase< float > -{ - public: - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const - { - TNL_ASSERT_TRUE( matrix, "matrix was not initialized" ); -#ifdef HAVE_CUDA - float d = 1.0; - float* alpha = &d; - cusparseScsrmv( *( this->cusparseHandle ), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->matrix->getRows(), - this->matrix->getColumns(), - this->matrix->getValues().getSize(), - alpha, - this->matrixDescriptor, - this->matrix->getValues().getData(), - this->matrix->getRowPointers().getData(), - this->matrix->getColumnIndexes().getData(), - inVector.getData(), - alpha, - outVector.getData() ); -#endif - } -}; - -} // namespace TNL - -#endif \ No newline at end of file -- GitLab From cf8c88697054dbce7d85f1a2013648468adc8e94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 7 Aug 2020 17:56:23 +0200 Subject: [PATCH 56/57] Legacy sparse matrix formats moved to benchmarks. --- src/Benchmarks/BLAS/spmv.h | 6 +++--- .../SpMV/ReferenceFormats}/Legacy/BiEllpack.h | 4 ++-- .../SpMV/ReferenceFormats}/Legacy/BiEllpack_impl.h | 2 +- .../SpMV/ReferenceFormats}/Legacy/ChunkedEllpack.h | 4 ++-- .../ReferenceFormats}/Legacy/ChunkedEllpack_impl.h | 2 +- .../SpMV/ReferenceFormats}/Legacy/Ellpack.h | 4 ++-- .../SpMV/ReferenceFormats}/Legacy/Ellpack_impl.h | 2 +- .../SpMV/ReferenceFormats}/Legacy/SlicedEllpack.h | 4 ++-- .../SpMV/ReferenceFormats}/Legacy/SlicedEllpack_impl.h | 2 +- .../SpMV/ReferenceFormats}/Legacy/Sparse.h | 4 ++-- .../SpMV/ReferenceFormats}/Legacy/SparseRow.h | 2 +- .../SpMV/ReferenceFormats}/Legacy/SparseRow_impl.h | 2 +- .../SpMV/ReferenceFormats}/Legacy/Sparse_impl.h | 0 .../SpMV/{ => ReferenceFormats}/cusparseCSRMatrix.h | 0 src/Benchmarks/SpMV/spmv-legacy.h | 10 +++++----- src/TNL/Matrices/Legacy/AdEllpack.h | 2 +- src/TNL/Matrices/Legacy/CSR.h | 2 +- src/TNL/Matrices/MatrixInfo.h | 8 ++++---- src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h | 4 ++-- src/UnitTests/Matrices/DenseMatrixCopyTest.h | 4 ++-- src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp | 4 ++-- .../Matrices/Legacy/SparseMatrixTest_BiEllpack.h | 2 +- .../Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h | 2 +- .../Matrices/Legacy/SparseMatrixTest_Ellpack.h | 2 +- .../Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h | 2 +- src/UnitTests/Matrices/SparseMatrixCopyTest.h | 4 ++-- 26 files changed, 42 insertions(+), 42 deletions(-) rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/BiEllpack.h (98%) rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/BiEllpack_impl.h (99%) rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/ChunkedEllpack.h (98%) rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/ChunkedEllpack_impl.h (99%) rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/Ellpack.h (98%) rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/Ellpack_impl.h (99%) rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/SlicedEllpack.h (98%) rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/SlicedEllpack_impl.h (99%) rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/Sparse.h (93%) rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/SparseRow.h (97%) rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/SparseRow_impl.h (98%) rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/Sparse_impl.h (100%) rename src/Benchmarks/SpMV/{ => ReferenceFormats}/cusparseCSRMatrix.h (100%) diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h index c013e6bfe..85cb4b731 100644 --- a/src/Benchmarks/BLAS/spmv.h +++ b/src/Benchmarks/BLAS/spmv.h @@ -16,9 +16,9 @@ #include #include -#include -#include -#include +#include +#include +#include namespace TNL { namespace Benchmarks { diff --git a/src/TNL/Matrices/Legacy/BiEllpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h similarity index 98% rename from src/TNL/Matrices/Legacy/BiEllpack.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h index 3f7b06a58..dd173cea1 100644 --- a/src/TNL/Matrices/Legacy/BiEllpack.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h @@ -18,7 +18,7 @@ #pragma once -#include +#include #include namespace TNL { @@ -221,5 +221,5 @@ private: } //namespace Matrices } // namespace TNL -#include +#include diff --git a/src/TNL/Matrices/Legacy/BiEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h similarity index 99% rename from src/TNL/Matrices/Legacy/BiEllpack_impl.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h index 1bb393bb9..afda8c2a5 100644 --- a/src/TNL/Matrices/Legacy/BiEllpack_impl.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h @@ -11,7 +11,7 @@ #pragma once -#include +#include #include #include #include diff --git a/src/TNL/Matrices/Legacy/ChunkedEllpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h similarity index 98% rename from src/TNL/Matrices/Legacy/ChunkedEllpack.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h index 93ba63ebf..10fce9f71 100644 --- a/src/TNL/Matrices/Legacy/ChunkedEllpack.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h @@ -22,7 +22,7 @@ #pragma once -#include +#include #include namespace TNL { @@ -354,5 +354,5 @@ protected: } // namespace Matrices } // namespace TNL -#include +#include diff --git a/src/TNL/Matrices/Legacy/ChunkedEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h similarity index 99% rename from src/TNL/Matrices/Legacy/ChunkedEllpack_impl.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h index ec05515fd..99c3ef547 100644 --- a/src/TNL/Matrices/Legacy/ChunkedEllpack_impl.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h @@ -10,7 +10,7 @@ #pragma once -#include +#include #include #include #include diff --git a/src/TNL/Matrices/Legacy/Ellpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h similarity index 98% rename from src/TNL/Matrices/Legacy/Ellpack.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h index af730ccd2..7ddb4bb04 100644 --- a/src/TNL/Matrices/Legacy/Ellpack.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h @@ -10,7 +10,7 @@ #pragma once -#include +#include #include namespace TNL { @@ -212,4 +212,4 @@ protected: } // namespace Matrices } // namespace TNL -#include +#include diff --git a/src/TNL/Matrices/Legacy/Ellpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack_impl.h similarity index 99% rename from src/TNL/Matrices/Legacy/Ellpack_impl.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack_impl.h index 39e27f8f9..1ca524701 100644 --- a/src/TNL/Matrices/Legacy/Ellpack_impl.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack_impl.h @@ -10,7 +10,7 @@ #pragma once -#include +#include #include #include #include diff --git a/src/TNL/Matrices/Legacy/SlicedEllpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h similarity index 98% rename from src/TNL/Matrices/Legacy/SlicedEllpack.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h index 88ab6ae32..e0bcd3c75 100644 --- a/src/TNL/Matrices/Legacy/SlicedEllpack.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h @@ -21,7 +21,7 @@ #pragma once -#include +#include #include namespace TNL { @@ -240,4 +240,4 @@ public: } // namespace Matrices } // namespace TNL -#include +#include diff --git a/src/TNL/Matrices/Legacy/SlicedEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h similarity index 99% rename from src/TNL/Matrices/Legacy/SlicedEllpack_impl.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h index fa99206e2..6bd8b87aa 100644 --- a/src/TNL/Matrices/Legacy/SlicedEllpack_impl.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h @@ -10,7 +10,7 @@ #pragma once -#include +#include #include #include #include diff --git a/src/TNL/Matrices/Legacy/Sparse.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h similarity index 93% rename from src/TNL/Matrices/Legacy/Sparse.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h index 275c7a9bc..5f75efe18 100644 --- a/src/TNL/Matrices/Legacy/Sparse.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h @@ -11,7 +11,7 @@ #pragma once #include -#include +#include namespace TNL { namespace Matrices { @@ -66,5 +66,5 @@ class Sparse : public Matrix< Real, Device, Index > } // namespace Matrices } // namespace TNL -#include +#include #include diff --git a/src/TNL/Matrices/Legacy/SparseRow.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h similarity index 97% rename from src/TNL/Matrices/Legacy/SparseRow.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h index eb7a461fb..0b5ff29d9 100644 --- a/src/TNL/Matrices/Legacy/SparseRow.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h @@ -100,4 +100,4 @@ std::ostream& operator<<( std::ostream& str, const SparseRow< Real, Index >& row } // namespace Matrices } // namespace TNL -#include +#include diff --git a/src/TNL/Matrices/Legacy/SparseRow_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow_impl.h similarity index 98% rename from src/TNL/Matrices/Legacy/SparseRow_impl.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow_impl.h index e34f3a847..f538bbb86 100644 --- a/src/TNL/Matrices/Legacy/SparseRow_impl.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow_impl.h @@ -10,7 +10,7 @@ #pragma once -#include +#include #include // Following includes are here to enable usage of std::vector and std::cout. To avoid having to include Device type (HOW would this be done anyway) diff --git a/src/TNL/Matrices/Legacy/Sparse_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h similarity index 100% rename from src/TNL/Matrices/Legacy/Sparse_impl.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h diff --git a/src/Benchmarks/SpMV/cusparseCSRMatrix.h b/src/Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrix.h similarity index 100% rename from src/Benchmarks/SpMV/cusparseCSRMatrix.h rename to src/Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrix.h diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h index 838165039..ec0fd0018 100644 --- a/src/Benchmarks/SpMV/spmv-legacy.h +++ b/src/Benchmarks/SpMV/spmv-legacy.h @@ -19,11 +19,11 @@ #include #include -#include -#include -#include +#include +#include +#include #include -#include +#include #include #include @@ -37,7 +37,7 @@ #include using namespace TNL::Matrices; -#include "cusparseCSRMatrix.h" +#include namespace TNL { namespace Benchmarks { diff --git a/src/TNL/Matrices/Legacy/AdEllpack.h b/src/TNL/Matrices/Legacy/AdEllpack.h index 260bdc4ac..f1a023007 100644 --- a/src/TNL/Matrices/Legacy/AdEllpack.h +++ b/src/TNL/Matrices/Legacy/AdEllpack.h @@ -18,7 +18,7 @@ #pragma once -#include +#include #include namespace TNL { diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 818e51883..d7a9092cf 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -10,7 +10,7 @@ #pragma once -#include +#include #include #include diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h index 432584d27..2715d2f6e 100644 --- a/src/TNL/Matrices/MatrixInfo.h +++ b/src/TNL/Matrices/MatrixInfo.h @@ -19,10 +19,10 @@ #include #include #include -#include -#include -#include -#include +#include +#include +#include +#include namespace TNL { /** diff --git a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h index 69d427b84..c61f7fda7 100644 --- a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h +++ b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h @@ -9,8 +9,8 @@ /* See Copyright Notice in tnl/Copyright */ #include -#include -#include +#include +#include #include #include diff --git a/src/UnitTests/Matrices/DenseMatrixCopyTest.h b/src/UnitTests/Matrices/DenseMatrixCopyTest.h index 9e63a6f6c..d86eb57f5 100644 --- a/src/UnitTests/Matrices/DenseMatrixCopyTest.h +++ b/src/UnitTests/Matrices/DenseMatrixCopyTest.h @@ -9,8 +9,8 @@ /* See Copyright Notice in tnl/Copyright */ #include -#include -#include +#include +#include #include #include diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp index df6f4441a..ab67b8374 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp @@ -15,9 +15,9 @@ #include // Temporary, until test_OperatorEquals doesn't work for all formats. -#include +#include #include -#include +#include #ifdef HAVE_GTEST #include diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h index cdac8af6e..d0277e27c 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h @@ -8,7 +8,7 @@ /* See Copyright Notice in tnl/Copyright */ -#include +#include #include "SparseMatrixTest.hpp" #include diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h index d633abdbf..f0ee7c079 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h @@ -8,7 +8,7 @@ /* See Copyright Notice in tnl/Copyright */ -#include +#include #include "SparseMatrixTest.hpp" #include diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h index dd86d6316..8376654cd 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h @@ -8,7 +8,7 @@ /* See Copyright Notice in tnl/Copyright */ -#include +#include #include "SparseMatrixTest.hpp" #include diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h index 168f482ea..9ffba7504 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h @@ -8,7 +8,7 @@ /* See Copyright Notice in tnl/Copyright */ -#include +#include #include "SparseMatrixTest.hpp" diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h index dcaca61f0..f5bdd7e3f 100644 --- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h +++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h @@ -9,8 +9,8 @@ /* See Copyright Notice in tnl/Copyright */ #include -#include -#include +#include +#include #include #include -- GitLab From 033549c4917105e66dc85cd7131cb1ec0609bff7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 10 Aug 2020 09:15:27 +0200 Subject: [PATCH 57/57] Fixed sparse matrix headers including in PyTNL. --- src/Python/pytnl/tnl/SparseMatrix.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Python/pytnl/tnl/SparseMatrix.cpp b/src/Python/pytnl/tnl/SparseMatrix.cpp index f4b1772a7..b5e99c275 100644 --- a/src/Python/pytnl/tnl/SparseMatrix.cpp +++ b/src/Python/pytnl/tnl/SparseMatrix.cpp @@ -4,8 +4,8 @@ #include "SparseMatrix.h" #include -#include -#include +#include +#include using CSR_host = TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >; using CSR_cuda = TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int >; -- GitLab