Skip to content
Snippets Groups Projects
Commit 536a6526 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Adding Adaptive CSR kernel.

parent 60161718
No related branches found
No related tags found
1 merge request: !83 "To/matrices"
......@@ -145,6 +145,11 @@ template< typename Device,
typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
using CSRHybrid = CSR< Device, Index, CSRKernelHybrid< Index, Device >, IndexAllocator >;
// CSR segments variant dispatching to the adaptive SpMV kernel, which picks a
// per-block strategy (stream / vector / long-row) based on row lengths.
template< typename Device,
typename Index,
typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
using CSRAdaptive = CSR< Device, Index, CSRKernelAdaptive< Index, Device >, IndexAllocator >;
template< typename Device,
typename Index,
typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
......
......@@ -20,15 +20,136 @@ namespace TNL {
namespace Algorithms {
namespace Segments {
// Strategy tag for one Block of the adaptive CSR kernel.
// The tag is stored in the top byte of Block::index[1] (see union Block), so
// LONG must be 0: the LONG variant also stores the warp-in-row number in
// index[1], and a non-zero tag value would corrupt that field.
enum class Type {
/* LONG = 0!!! Non zero value rewrites index[1] */
LONG = 0,
STREAM = 1,
VECTOR = 2
};
// Work descriptor for one warp of the adaptive CSR SpMV kernel.
// The union overlays three views of the same 8 bytes (Index == 4 B) or
// 16 bytes (Index == 8 B):
//   index[0]       — first row of the block (all variants)
//   index[1]       — warp number within a single long row (LONG variant)
//   twobytes[2/4]  — maxID - minID, i.e. number of matrix elements in the block
//   twobytes[3/5]  — nextRow - row (STREAM variant; top 2 bits hold the type
//                    flags, so the kernel masks this field with 0x3FFF)
//   byte[7/15]     — type tag: 0b1000000 = STREAM, 0b10000000 = VECTOR,
//                    all bits clear = LONG
// NOTE(review): byte[7/15] and twobytes[2/4..3/5] alias the high bytes of
// index[1] — the positions assume a little-endian layout, and reading a union
// member other than the last one written is formally UB in C++ (works as
// type punning on the targeted compilers). TODO confirm endianness assumption.
template<typename Index>
union Block {
// STREAM/VECTOR/LONG constructor: tag only, plus optional warp-in-row index.
Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept {
this->index[0] = row;
this->index[1] = index;
this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;
}
// STREAM/VECTOR constructor with element count; also records the row span
// for STREAM blocks. The |= on the tag byte relies on index[1] = 0 above.
Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept {
this->index[0] = row;
this->index[1] = 0;
this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID;
if (type == Type::STREAM)
this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row;
if (type == Type::STREAM)
this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000;
else if (type == Type::VECTOR)
this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000;
}
Block() = default;
Index index[2]; // index[0] is row pointer, index[1] is index in warp
uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
//twobytes[3/5] is nextRow - row
};
#ifdef HAVE_CUDA
/**
 * Adaptive CSR SpMV kernel: outVector = A * inVector, one warp per Block.
 *
 * Each Block (preprocessed on the host) selects one of three strategies via
 * the tag byte:
 *   STREAM   — many short rows: stream their products into shared memory,
 *              then each lane reduces one row sequentially.
 *   VECTOR   — one medium row: the warp reduces it with warp shuffles.
 *   LONG     — one very long row split over several warps; each warp reduces
 *              its slice and accumulates into the output with atomicAdd.
 *
 * @param blocksSize number of usable blocks (caller passes size - 1; the
 *                   last block is a sentinel and must not be processed)
 * @param gridID     index of this grid when the launch is split into several
 *                   grids of at most MAX_X_DIM threads each (MAX_X_DIM is a
 *                   project constant — presumably the CUDA grid x-dim limit)
 */
template< typename Real,
          typename Index,
          int warpSize,
          int WARPS,
          int SHARED_PER_WARP,
          int MAX_ELEM_PER_WARP >
__global__
void SpMVCSRAdaptive( const Real *inVector,
                      Real *outVector,
                      const Index* rowPointers,
                      const Index* columnIndexes,
                      const Real* values,
                      const Block<Index> *blocks,
                      Index blocksSize,
                      Index gridID) {
   __shared__ Real shared[WARPS][SHARED_PER_WARP];
   // Global thread index across the whole (possibly multi-grid) launch.
   const Index threadID = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
   // Fix: the original named this local `blockIdx`, shadowing the CUDA
   // built-in `blockIdx` that is read on the previous line.
   const Index blockID = threadID / warpSize; // one warp per block
   if (blockID >= blocksSize)
      return;
   Real result = 0.0;
   const Index laneID = threadIdx.x & 31; // & is cheaper than %; assumes warpSize == 32
   Block<Index> block = blocks[blockID];
   const Index minID = rowPointers[block.index[0]/* minRow */];
   Index i, to, maxID;
   if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) {
      /////////////////////////////////////* CSR STREAM *//////////////
      const Index warpID = threadIdx.x / 32;
      maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
      /* Stream the products of this block's elements into shared memory. */
      for (i = laneID + minID; i < maxID; i += warpSize)
         shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]];
      // Fix: lanes below read shared-memory slots written by OTHER lanes;
      // with independent thread scheduling (Volta+) this needs an explicit
      // intra-warp barrier.
      __syncwarp();
      const Index maxRow = block.index[0]/* minRow */ +
         /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
      /* Each lane reduces one row of preprocessed products. */
      for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) {
         to = rowPointers[i + 1] - minID; // end of preprocessed data
         result = 0;
         /* Scalar reduction */
         for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID)
            result += shared[warpID][sharedID];
         outVector[i] = result; // Write result
      }
   } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) {
      /////////////////////////////////////* CSR VECTOR *//////////////
      maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
      for (i = minID + laneID; i < maxID; i += warpSize)
         result += values[i] * inVector[columnIndexes[i]];
      /* Parallel reduction across the warp. */
      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
      if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result
   } else {
      /////////////////////////////////////* CSR VECTOR L */////////////
      /* Number of elements processed by previous warps of the same row. */
      const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP;
      to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP;
      maxID = rowPointers[block.index[0]/* minRow */ + 1];
      if (to > maxID) to = maxID;
      for (i = minID + offset + laneID; i < to; i += warpSize)
         result += values[i] * inVector[columnIndexes[i]];
      /* Parallel reduction across the warp. */
      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
      /* Several warps contribute to the same row ⇒ atomic accumulation. */
      if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result);
   }
}
#endif
template< typename Index,
typename Device >
struct CSRAdaptiveKernelView
struct CSRKernelAdaptiveView
{
using IndexType = Index;
using DeviceType = Device;
using ViewType = CSRAdaptiveKernelView< Index, Device >;
using ConstViewType = CSRAdaptiveKernelView< Index, Device >;
using ViewType = CSRKernelAdaptiveView< Index, Device >;
using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
ViewType getView() { return *this; };
......@@ -49,22 +170,96 @@ struct CSRAdaptiveKernelView
const Real& zero,
Args... args ) const
{
Index blocks;
const Index threads = matrix.THREADS_ADAPTIVE;
/* Fill blocks */
size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block
/* Execute kernels on device */
for (Index grid = 0; neededThreads != 0; ++grid) {
if (MAX_X_DIM * threads >= neededThreads) {
blocks = roundUpDivision(neededThreads, threads);
neededThreads = 0;
} else {
blocks = MAX_X_DIM;
neededThreads -= MAX_X_DIM * threads;
}
SpMVCSRAdaptive< Real, Index, warpSize,
matrix.WARPS,
matrix.SHARED_PER_WARP,
matrix.MAX_ELEMENTS_PER_WARP_ADAPT >
<<<blocks, threads>>>(
inVector,
outVector,
matrix.getRowPointers().getData(),
matrix.getColumnIndexes().getData(),
matrix.getValues().getData(),
matrix.blocks.getData(),
matrix.blocks.getSize() - 1, // last block shouldn't be used
grid
);
}
}
};
template< typename Index,
typename Device >
struct CSRAdaptiveKernel
struct CSRKernelAdaptive
{
using IndexType = Index;
using DeviceType = Device;
using ViewType = CSRAdaptiveKernel< Index, Device >;
using ConstViewType = CSRAdaptiveKernel< Index, Device >;
using ViewType = CSRKernelAdaptiveView< Index, Device >;
using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
// Threads per CUDA block for the adaptive kernel (fewer for 8-byte indices,
// which double the per-element shared-memory footprint of Block metadata).
static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
/* Shared memory budget (in bytes) per CUDA block for the CSR Adaptive kernel. */
static constexpr Index SHARED_PER_BLOCK = 24576;
/* Number of elements that fit in the shared-memory budget (sized for double). */
static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);
/* Number of warps per CUDA block for CSR Adaptive. */
static constexpr Index WARPS = THREADS_ADAPTIVE / 32;
/* Number of shared-memory elements available to one warp. */
static constexpr Index SHARED_PER_WARP = SHARED / WARPS;
/**
 * Scan the CSR row offsets starting at `start` and decide where the next
 * kernel block ends and which strategy it should use.
 *
 * @param start   first row of the candidate block
 * @param offsets CSR row-pointer array (size entries; offsets[i+1]-offsets[i]
 *                is the element count of row i)
 * @param size    number of entries in `offsets` (rows + 1)
 * @param type    out: chosen strategy for the block
 * @param sum     out: number of matrix elements accumulated into the block
 * @return index of the first row NOT belonging to this block
 */
template< typename Offsets >
Index findLimit(const Index start,
                const Offsets& offsets,
                const Index size,
                Type &type,
                Index &sum) {
   sum = 0;
   for (Index current = start; current < size - 1; ++current) {
      Index elements = offsets.getElement(current + 1) -
                       offsets.getElement(current);
      sum += elements;
      // Fix: originally `matrix.SHARED_PER_WARP` / `matrix.MAX_ELEMENTS_PER_WARP_ADAPT`,
      // but no `matrix` object is in scope here — the constants are members
      // of this kernel struct itself.
      if (sum > SHARED_PER_WARP) {
         if (current - start > 0) { // extra row
            // Several rows fit before overflowing the shared-memory budget:
            // emit them as a STREAM block, excluding the overflowing row.
            type = Type::STREAM;
            return current;
         } else { // one long row
            // A single row already exceeds the budget; a moderately long row
            // becomes VECTOR, a very long one LONG (split over many warps).
            if (sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT)
               type = Type::VECTOR;
            else
               type = Type::LONG;
            return current + 1;
         }
      }
   }
   type = Type::STREAM;
   return size - 1; // return last row pointer
}
template< typename Offsets >
void init( const Offsets& offsets )
{
/*const Index rows = offsets.getSize();
const Index rows = offsets.getSize();
Index sum, start = 0, nextStart = 0;
// Fill blocks
......@@ -100,7 +295,6 @@ struct CSRAdaptiveKernel
this->blocks.setSize(inBlock.size());
for (size_t i = 0; i < inBlock.size(); ++i)
this->blocks.setElement(i, inBlock[i]);
*/
};
ViewType getView() { return view; };
......
......@@ -17,7 +17,7 @@
#include <TNL/Algorithms/Segments/CSRKernelScalar.h>
#include <TNL/Algorithms/Segments/CSRKernelVector.h>
#include <TNL/Algorithms/Segments/CSRKernelHybrid.h>
#include <TNL/Algorithms/Segments/CSRKernels.h>
#include <TNL/Algorithms/Segments/CSRKernelAdaptive.h>
namespace TNL {
namespace Algorithms {
......@@ -144,6 +144,10 @@ template< typename Device,
typename Index >
using CSRViewHybrid = CSRView< Device, Index, CSRKernelHybrid< Index, Device > >;
// Non-owning CSR view bound to the adaptive SpMV kernel.
template< typename Device,
typename Index >
using CSRViewAdaptive = CSRView< Device, Index, CSRKernelAdaptive< Index, Device > >;
template< typename Device,
typename Index >
using CSRViewDefault = CSRViewScalar< Device, Index >;
......
......@@ -9,6 +9,7 @@ set( COMMON_TESTS
SparseMatrixTest_CSRScalar
SparseMatrixTest_CSRVector
SparseMatrixTest_CSRHybrid
SparseMatrixTest_CSRAdaptive
SparseMatrixTest_Ellpack
SparseMatrixTest_SlicedEllpack
SparseMatrixTest_ChunkedEllpack
......
/***************************************************************************
SparseMatrixTest_CSRHybrid.cpp - description
-------------------
begin : Jan 23, 2021
copyright : (C) 2021 by Tomas Oberhuber et al.
email : tomas.oberhuber@fjfi.cvut.cz
***************************************************************************/
/* See Copyright Notice in tnl/Copyright */
#include "SparseMatrixTest_CSRHybrid.h"
/***************************************************************************
SparseMatrixTest_CSRHybrid.cu - description
-------------------
begin : Jan 23, 2021
copyright : (C) 2021 by Tomas Oberhuber et al.
email : tomas.oberhuber@fjfi.cvut.cz
***************************************************************************/
/* See Copyright Notice in tnl/Copyright */
#include "SparseMatrixTest_CSRHybrid.h"
/***************************************************************************
SparseMatrixTest_CSRHybrid.h - description
-------------------
begin : Jan 23, 2021
copyright : (C) 2021 by Tomas Oberhuber et al.
email : tomas.oberhuber@fjfi.cvut.cz
***************************************************************************/
/* See Copyright Notice in tnl/Copyright */
// NOTE(review): this commit registers SparseMatrixTest_CSRAdaptive in CMake,
// yet the test translation units added here instantiate CSRHybrid segments
// and use a CSRHybrid save/load file name throughout. Verify whether this is
// the intended Hybrid test or a copy-paste that should read CSRAdaptive.
#include <iostream>
#include <TNL/Algorithms/Segments/CSR.h>
#include <TNL/Matrices/SparseMatrix.h>
#ifdef HAVE_GTEST
#include <gtest/gtest.h>
const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRHybrid_segments";
// types for which MatrixTest is instantiated
using MatrixTypes = ::testing::Types
<
TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
#ifdef HAVE_CUDA
,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
#endif
>;
#endif
#include "SparseMatrixTest.h"
#include "../main.h"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment