Commit 316819ca authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Debuging CSR Adaptive kernel.

parent 536a6526
Loading
Loading
Loading
Loading
+318 −128
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@
#include <TNL/Containers/VectorView.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
#include <TNL/Algorithms/Segments/CSRKernelScalar.h>

namespace TNL {
   namespace Algorithms {
@@ -28,14 +29,17 @@ enum class Type {
};

template< typename Index >
union Block {
   Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept {
union Block
{
   Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept
   {
      this->index[0] = row;
      this->index[1] = index;
      this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;
   }

   Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept {
   Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept
   {
      this->index[0] = row;
      this->index[1] = 0;
      this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID;
@@ -51,93 +55,177 @@ union Block {

   Block() = default;

   Type getType() const
   {
      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 )
         return Type::STREAM;
      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 )
         return Type::VECTOR;
      return Type::LONG;
   }

   Index getFirstRow() const
   {
      return index[ 0 ];
   }

   Index getRowsInBlock() const
   {
      return twobytes[ sizeof(Index) == 4 ? 2 : 4 ];
   }

   void print( std::ostream& str ) const
   {
      Type type = this->getType();
      str << "Type: ";
      switch( type )
      {
         case Type::STREAM:
            str << " Stream ";
            break;
         case Type::VECTOR:
            str << " Vector ";
            break;
         case Type::LONG:
            str << " Long ";
            break;
      }
      str << " first row: " << getFirstRow();
      str << " rows per block: " << getRowsInBlock();
      str << " index in warp: " << index[ 1 ];
   }
   Index index[2]; // index[0] is row pointer, index[1] is index in warp
   uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
   uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
                                                //twobytes[3/5] is nextRow - row
};

template< typename Index >
std::ostream& operator<< ( std::ostream& str, const Block< Index >& block )
{
   block.print( str );
   return str;
}

#ifdef HAVE_CUDA

template< typename Real,
          typename Index,
          int warpSize,
template< int warpSize,
          int WARPS,
          int SHARED_PER_WARP,
          int MAX_ELEM_PER_WARP >
__global__
void SpMVCSRAdaptive( const Real *inVector,
                      Real *outVector,
                      const Index* rowPointers,
                      const Index* columnIndexes,
                      const Real* values,
                      const Block<Index> *blocks,
          int MAX_ELEM_PER_WARP,
          typename Offsets,
          typename Index,
          typename Fetch,
          typename Reduction,
          typename ResultKeeper,
          typename Real,
          typename... Args >
__global__ void
segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks,
                                    Index blocksSize,
                      Index gridID) {
                                    int gridIdx,
                                    Offsets offsets,
                                    Index first,
                                    Index last,
                                    Fetch fetch,
                                    Reduction reduce,
                                    ResultKeeper keep,
                                    Real zero,
                                    Args... args )
{
   __shared__ Real shared[WARPS][SHARED_PER_WARP];
   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
   constexpr size_t MAX_X_DIM = 2147483647;
   const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
   const Index blockIdx = index / warpSize;
   if (blockIdx >= blocksSize)
      return;

   Real result = 0.0;
   Real result = zero;
   bool compute( true );
   const Index laneID = threadIdx.x & 31; // & is cheaper than %
   Block<Index> block = blocks[blockIdx];
   const Index minID = rowPointers[block.index[0]/* minRow */];
   const Index minID = offsets[block.index[0]/* minRow */];
   Index i, to, maxID;
   if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) {
      /////////////////////////////////////* CSR STREAM *//////////////

   if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000)
   {
      /****
       * CSR Stream: Copy first all data into shared memory
       */

      const Index warpID = threadIdx.x / 32;
      maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];

      /* Stream data to shared memory */
      for (i = laneID + minID; i < maxID; i += warpSize)
         shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]];
      for( Index globalIdx = laneID + minID; globalIdx < maxID; globalIdx += warpSize )
      {
         shared[warpID][i - minID] = //fetch( globalIdx, compute );
            details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
         printf( "Stream: Fetch at %d -> %f \n", globalIdx, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ) );
            // TODO:: fix this
         //values[i] * inVector[columnIndexes[i]];
      }

      const Index maxRow = block.index[0]/* minRow */ +
         /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
      /* Calculate result */
      for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) {
         to = rowPointers[i + 1] - minID; // end of preprocessed data
         result = 0;
      for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize)
      {
         to = offsets[i + 1] - minID; // end of preprocessed data
         result = zero;
         /* Scalar reduction */
         for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID)
            result += shared[warpID][sharedID];
         for( Index sharedID = offsets[ i ] - minID; sharedID < to; ++sharedID)
            result = reduce( result, shared[warpID][sharedID] );

         outVector[i] = result; // Write result
         printf( "Stream: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result );
         keep( i, result );
         //outVector[i] = result; // Write result
      }
   } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) {
   }
   else //if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000)
   {
      printf( "Vector: threadIdx = %d \n", threadIdx );
      /////////////////////////////////////* CSR VECTOR *//////////////
      maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
      const Index segmentIdx = block.index[0];

      for (i = minID + laneID; i < maxID; i += warpSize)
         result += values[i] * inVector[columnIndexes[i]];
      for( Index globalIdx = minID + laneID; globalIdx < maxID; globalIdx += warpSize )
         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx
         //values[i] * inVector[columnIndexes[i]];

      /* Parallel reduction */
      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
      if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result
   } else {
      /////////////////////////////////////* CSR VECTOR L */////////////
      /* Number of elements processed by previous warps */
      const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP;
      to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP;
      maxID = rowPointers[block.index[0]/* minRow */ + 1];
      if (to > maxID) to = maxID;
      for (i = minID + offset + laneID; i < to; i += warpSize)
         result += values[i] * inVector[columnIndexes[i]];
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  8 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  4 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  2 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  1 ) );
      if( laneID == 0 )
      {
         printf( "Vector: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result );
         keep( segmentIdx, result );
          //outVector[block.index[0]/* minRow */] = result; // Write result
      }
   }/*
   else
   {
      ///////////////////////////////////// CSR VECTOR L /////////////
      // Number of elements processed by previous warps
      const Index offset = block.index[1] * MAX_ELEM_PER_WARP;
      to = minID + (block.index[1]  + 1) * MAX_ELEM_PER_WARP;
      maxID = offsets[block.index[0] + 1];
      if( to > maxID )
         to = maxID;
      for( Index globalIdx = minID + offset + laneID; globalIdx < to; globalIdx += warpSize )
         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
         //result += values[i] * inVector[columnIndexes[i]];

      /* Parallel reduction */
      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
      if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result);
   }
      if (laneID == 0) atomicAdd(&outVector[block.index[0] ], result);
   }*/
}
#endif

@@ -150,6 +238,20 @@ struct CSRKernelAdaptiveView
   using DeviceType = Device;
   using ViewType = CSRKernelAdaptiveView< Index, Device >;
   using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
   using BlocksType = TNL::Containers::Vector< Block< Index >, Device, Index >;
   using BlocksView = typename BlocksType::ViewType;

   CSRKernelAdaptiveView() = default;

   CSRKernelAdaptiveView( BlocksType& blocks )
   {
      this->blocks.bind( blocks );
   };

   void setBlocks( BlocksType& blocks )
   {
      this->blocks.bind( blocks );
   }

   ViewType getView() { return *this; };

@@ -170,38 +272,102 @@ struct CSRKernelAdaptiveView
                        const Real& zero,
                        Args... args ) const
   {
#ifdef HAVE_CUDA
      if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() )
      {
         TNL::Algorithms::Segments::CSRKernelScalar< Index, Device >::
            segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
         return;
      }

      this->printBlocks();
      static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
      //static constexpr Index THREADS_SCALAR = 128;
      static constexpr Index THREADS_VECTOR = 128;
      static constexpr Index THREADS_LIGHT = 128;

      /* Max length of row to process one warp for CSR Light, MultiVector */
      static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

            Index blocks;
   const Index threads = matrix.THREADS_ADAPTIVE;
      /* Max length of row to process one warp for CSR Adaptive */
      static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;

      /* How many shared memory use per block in CSR Adaptive kernel */
      static constexpr Index SHARED_PER_BLOCK = 24576;

      /* Number of elements in shared memory */
      static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);

      /* Number of warps in block for CSR Adaptive */
      static constexpr Index WARPS = THREADS_ADAPTIVE / 32;

      /* Number of elements in shared memory per one warp */
      static constexpr Index SHARED_PER_WARP = SHARED / WARPS;

      constexpr int warpSize = 32;

      Index blocksCount;

      const Index threads = THREADS_ADAPTIVE;
      constexpr size_t MAX_X_DIM = 2147483647;

      /* Fill blocks */
   size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block
      size_t neededThreads = blocks.getSize() * warpSize; // one warp per block
      /* Execute kernels on device */
   for (Index grid = 0; neededThreads != 0; ++grid) {
      if (MAX_X_DIM * threads >= neededThreads) {
         blocks = roundUpDivision(neededThreads, threads);
      for (Index gridIdx = 0; neededThreads != 0; gridIdx++ )
      {
         if (MAX_X_DIM * threads >= neededThreads)
         {
            blocksCount = roundUpDivision(neededThreads, threads);
            neededThreads = 0;
      } else {
         blocks = MAX_X_DIM;
         }
         else
         {
            blocksCount = MAX_X_DIM;
            neededThreads -= MAX_X_DIM * threads;
         }

      SpMVCSRAdaptive< Real, Index, warpSize,
            matrix.WARPS,
            matrix.SHARED_PER_WARP, 
            matrix.MAX_ELEMENTS_PER_WARP_ADAPT >
         <<<blocks, threads>>>(
               inVector,
               outVector,
               matrix.getRowPointers().getData(),
               matrix.getColumnIndexes().getData(),
               matrix.getValues().getData(),
               matrix.blocks.getData(),
               matrix.blocks.getSize() - 1, // last block shouldn't be used
               grid
      );
         segmentsReductionCSRAdaptiveKernel<
               warpSize,
               WARPS,
               SHARED_PER_WARP,
               MAX_ELEMENTS_PER_WARP_ADAPT,
               OffsetsView,
               Index, Fetch, Reduction, ResultKeeper, Real, Args... >
            <<<blocksCount, threads>>>(
               blocks.getData(),
               blocks.getSize() - 1, // last block shouldn't be used
               gridIdx,
               offsets,
               first,
               last,
               fetch,
               reduction,
               keeper,
               zero,
               args... );
      }
#endif
   }

   CSRKernelAdaptiveView& operator=( const CSRKernelAdaptiveView< Index, Device >& kernelView )
   {
      this->blocks.bind( kernelView.blocks );
      return *this;
   }

   void printBlocks() const
   {
      for( Index i = 0; i < this->blocks.getSize(); i++ )
      {
         auto block = blocks.getElement( i );
         std::cout << "Block " << i << " : " << block << std::endl;
      }

   }

   protected:
      BlocksView blocks;
};

template< typename Index,
@@ -212,6 +378,9 @@ struct CSRKernelAdaptive
    using DeviceType = Device;
    using ViewType = CSRKernelAdaptiveView< Index, Device >;
    using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
    using BlocksType = typename ViewType::BlocksType;
    using BlocksView = typename BlocksType::ViewType;


    static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;

@@ -227,23 +396,35 @@ struct CSRKernelAdaptive
   /* Number of elements in shared memory per one warp */
   static constexpr Index SHARED_PER_WARP = SHARED / WARPS;

   /* Max length of row to process one warp for CSR Light, MultiVector */
   static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

   /* Max length of row to process one warp for CSR Adaptive */
   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;

   template< typename Offsets >
   Index findLimit(const Index start,
                const Offsets& offsets,
                const Index size,
                Type &type,
                Index &sum) {
                Index &sum)
   {
      sum = 0;
    for (Index current = start; current < size - 1; ++current) {
      for (Index current = start; current < size - 1; ++current)
      {
         Index elements = offsets.getElement(current + 1) -
                           offsets.getElement(current);
         sum += elements;
        if (sum > matrix.SHARED_PER_WARP) {
            if (current - start > 0) { // extra row
         if (sum >SHARED_PER_WARP)
         {
            if (current - start > 0)
            { // extra row
               type = Type::STREAM;
               return current;
            } else {                  // one long row
                if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP_ADAPT)
            }
            else
            {                  // one long row
               if (sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT)
               type = Type::VECTOR;
               else
               type = Type::LONG;
@@ -251,7 +432,6 @@ struct CSRKernelAdaptive
            }
         }
      }

      type = Type::STREAM;
      return size - 1; // return last row pointer
    }
@@ -269,8 +449,7 @@ struct CSRKernelAdaptive
        while (nextStart != rows - 1)
        {
            Type type;
            nextStart = findLimit<Real, Index, Device, KernelType>(
                start, *this, rows, type, sum );
            nextStart = findLimit( start, offsets, rows, type, sum );

            if (type == Type::LONG)
            {
@@ -284,8 +463,8 @@ struct CSRKernelAdaptive
            {
                inBlock.emplace_back(start, type,
                    nextStart,
                    this->rowPointers.getElement(nextStart),
                    this->rowPointers.getElement(start) );
                    offsets.getElement(nextStart),
                    offsets.getElement(start) );
            }
            start = nextStart;
        }
@@ -295,11 +474,19 @@ struct CSRKernelAdaptive
        this->blocks.setSize(inBlock.size());
        for (size_t i = 0; i < inBlock.size(); ++i)
            this->blocks.setElement(i, inBlock[i]);

         this->view.setBlocks( blocks );
    };

    ViewType getView() { return view; };
   void reset()
   {
      this->blocks.reset();
      this->view.setBlocks( blocks );
   }

   ViewType getView() { return this->view; };

    ConstViewType getConstView() const { return ConstViewType(); };
   ConstViewType getConstView() const { return this->view; };

   template< typename OffsetsView,
              typename Fetch,
@@ -319,6 +506,9 @@ struct CSRKernelAdaptive
      view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
   }

   protected:
      BlocksType blocks;

      ViewType view;
};

+2 −2
Original line number Diff line number Diff line
/***************************************************************************
                          SparseMatrixTest_CSRHybrid.cpp -  description
                          SparseMatrixTest_CSRAdaptive.cpp -  description
                             -------------------
    begin                : Jan 23, 2021
    copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -8,4 +8,4 @@

/* See Copyright Notice in tnl/Copyright */

#include "SparseMatrixTest_CSRHybrid.h"
#include "SparseMatrixTest_CSRAdaptive.h"
+2 −2
Original line number Diff line number Diff line
/***************************************************************************
                          SparseMatrixTest_CSRHybrid.cu -  description
                          SparseMatrixTest_CSRAdaptive.cu -  description
                             -------------------
    begin                : Jan 23, 2021
    copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -8,4 +8,4 @@

/* See Copyright Notice in tnl/Copyright */

#include "SparseMatrixTest_CSRHybrid.h"
#include "SparseMatrixTest_CSRAdaptive.h"
+18 −18
Original line number Diff line number Diff line
/***************************************************************************
                          SparseMatrixTest_CSRHybrid.h -  description
                          SparseMatrixTest_CSRAdaptive.h -  description
                             -------------------
    begin                : Jan 23, 2021
    copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -15,28 +15,28 @@
#ifdef HAVE_GTEST
#include <gtest/gtest.h>

const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRHybrid_segments";
const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRAdaptive_segments";

// types for which MatrixTest is instantiated
using MatrixTypes = ::testing::Types
<
    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >
#ifdef HAVE_CUDA
   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    //TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    //TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    //TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    //TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    //TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    //TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >
#endif
>;