Commit 8c1c80e2 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

[WIP] New lambda function based reduction compiles but does not work in CUDA.

parent 5c983e49
Loading
Loading
Loading
Loading
+0 −2
Original line number Diff line number Diff line
@@ -86,8 +86,6 @@ struct ArrayAssignment< Array, T, false >

};



} // namespace Algorithms
} // namespace Containers
} // namespace TNL
+26 −8
Original line number Diff line number Diff line
@@ -205,8 +205,19 @@ compareMemory( const Element1* destination,
{
   TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." );
   TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
   Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities;
   return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );

   Element1* d;
   cudaMalloc( ( void** ) &d, size * sizeof( Element1 ) );
   auto fetch = [=] __cuda_callable__ ( Index i ) { return  d[ 0 ]; }; //( destination[ i ] == source[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; };
   return Reduction< Devices::Cuda >::reduce(
      size,
      reduction, //[=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; },
      fetch, //[=] __cuda_callable__ ( Index i ) { return  destination[ i ]; },
      true );

   /*Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities;
   return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );*/
}

template< typename Element,
@@ -219,11 +230,11 @@ containsValue( const Element* data,
{
   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
   TNL_ASSERT_GE( size, 0, "" );

   if( size == 0 ) return false;
   //Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue;
   //reductionContainsValue.setValue( value );
   auto fetch = [=] __cuda_callable__ ( Index i ) { return  ( data[ i ] == value ); };
   return Reduction< Devices::Cuda >::reduce( size, TNL::sum, fetch, 0.0  );
   auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a || b; };
   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false );
}

template< typename Element,
@@ -237,9 +248,16 @@ containsOnlyValue( const Element* data,
   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
   TNL_ASSERT_GE( size, 0, "" );
   if( size == 0 ) return false;
   Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue;

   if( size == 0 ) return false;
   auto fetch = [=] __cuda_callable__ ( Index i ) { return  ( data[ i ] == value ); };
   auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; };
   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true );

   
   /*Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue;
   reductionContainsOnlyValue.setValue( value );
   return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );
   return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );*/
}


+79 −0
Original line number Diff line number Diff line
/***************************************************************************
                          CommonVectorOperations.h  -  description
                             -------------------
    begin                : Apr 12, 2019
    copyright            : (C) 2019 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

namespace TNL {
namespace Containers {
namespace Algorithms {

template< typename Device >
struct CommonVectorOperations
{
   using DeviceType = Device;
   
   template< typename Vector, typename ResultType = typename Vector::RealType >
   static ResultType getVectorMax( const Vector& v );

   template< typename Vector, typename ResultType = typename Vector::RealType >
   static ResultType getVectorMin( const Vector& v );

   template< typename Vector, typename ResultType = typename Vector::RealType >
   static ResultType getVectorAbsMax( const Vector& v );

   template< typename Vector, typename ResultType = typename Vector::RealType >
   static ResultType getVectorAbsMin( const Vector& v );

   template< typename Vector, typename ResultType = typename Vector::RealType >
   static ResultType getVectorL1Norm( const Vector& v );

   template< typename Vector, typename ResultType = typename Vector::RealType >
   static ResultType getVectorL2Norm( const Vector& v );

   template< typename Vector, typename ResultType = typename Vector::RealType, typename Scalar >
   static ResultType getVectorLpNorm( const Vector& v, const Scalar p );

   template< typename Vector, typename ResultType = typename Vector::RealType >
   static ResultType getVectorSum( const Vector& v );

   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
   static ResultType getVectorDifferenceMax( const Vector1& v1, const Vector2& v2 );

   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
   static ResultType getVectorDifferenceMin( const Vector1& v1, const Vector2& v2 );

   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
   static ResultType getVectorDifferenceAbsMax( const Vector1& v1, const Vector2& v2 );

   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
   static ResultType getVectorDifferenceAbsMin( const Vector1& v1, const Vector2& v2 );

   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
   static ResultType getVectorDifferenceL1Norm( const Vector1& v1, const Vector2& v2 );

   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
   static ResultType getVectorDifferenceL2Norm( const Vector1& v1, const Vector2& v2 );

   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType, typename Scalar >
   static ResultType getVectorDifferenceLpNorm( const Vector1& v1, const Vector2& v2, const Scalar p );

   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
   static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 );

   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
   static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 );

};

} // namespace Algorithms
} // namespace Containers
} // namespace TNL

#include <TNL/Containers/Algorithms/CommonVectorOperations.hpp>
+29 −33
Original line number Diff line number Diff line
@@ -40,20 +40,18 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi
#endif

template< int blockSize,
   typename Real,
   typename Result,
   typename DataFetcher,
   typename Reduction,
   typename Index,
   typename Result = decltype( std::declval< DataFetcher >( 0,0 ) ) >
   typename Index >
__global__ void
__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
CudaReductionKernel( const Real& zero,
CudaReductionKernel( const Result zero,
                     const DataFetcher& dataFetcher,
                     const Reduction& reduction,
                     const Index size,
                     Result* output )
{
   using RealType = Real;
   using IndexType = Index;
   using ResultType = Result;
   
@@ -69,11 +67,12 @@ CudaReductionKernel( const Real& zero,
   const IndexType gridSize = blockDim.x * gridDim.x;

   sdata[ tid ] = zero;

   /***
    * Read data into the shared memory. We start with the
    * sequential reduction.
    */
   while( gid + 4 * gridSize < size )
   /*while( gid + 4 * gridSize < size )
   {
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
@@ -86,15 +85,14 @@ CudaReductionKernel( const Real& zero,
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
      gid += 2 * gridSize;
   }
   }*/
   while( gid < size )
   {
      sdata[ tid ] =  reduction( sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = dataFetcher( gid ); //reduction( sdata[ tid ], dataFetcher( gid ) );
      gid += gridSize;
   }
   __syncthreads();


   return;
   //printf( "1: tid %d data %f \n", tid, sdata[ tid ] );

   //return;
@@ -104,19 +102,19 @@ CudaReductionKernel( const Real& zero,
   if( blockSize >= 1024 )
   {
      if( tid < 512 )
         reduction( sdata[ tid ], sdata[ tid + 512 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] );
      __syncthreads();
   }
   if( blockSize >= 512 )
   {
      if( tid < 256 )
         reduction( sdata[ tid ], sdata[ tid + 256 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] );
      __syncthreads();
   }
   if( blockSize >= 256 )
   {
      if( tid < 128 )
         reduction( sdata[ tid ], sdata[ tid + 128 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] );
      __syncthreads();
      //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
   }
@@ -124,7 +122,7 @@ CudaReductionKernel( const Real& zero,
   if( blockSize >= 128 )
   {
      if( tid <  64 )
         reduction( sdata[ tid ], sdata[ tid + 64 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] );
      __syncthreads();
      //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
   }
@@ -138,34 +136,34 @@ CudaReductionKernel( const Real& zero,
      volatile ResultType* vsdata = sdata;
      if( blockSize >= 64 )
      {
         reduction( vsdata[ tid ], vsdata[ tid + 32 ] );
         vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 32 ] );
         //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
      }
      // TODO: If blocksize == 32, the following does not work
      // We do not check if tid < 16. Fix it!!!
      if( blockSize >= 32 )
      {
         reduction( vsdata[ tid ], vsdata[ tid + 16 ] );
         vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 16 ] );
         //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >= 16 )
      {
         reduction( vsdata[ tid ], vsdata[ tid + 8 ] );
         vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 8 ] );
         //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  8 )
      {
         reduction( vsdata[ tid ], vsdata[ tid + 4 ] );
         vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 4 ] );
         //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  4 )
      {
         reduction( vsdata[ tid ], vsdata[ tid + 2 ] );
         vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 2 ] );
         //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  2 )
      {
         reduction( vsdata[ tid ], vsdata[ tid + 1 ] );
         vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 1 ] );
         //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
      }
   }
@@ -181,8 +179,7 @@ CudaReductionKernel( const Real& zero,

}

template< typename Real,
   typename DataFetcher,
template< typename DataFetcher,
   typename Reduction,
   typename Index,
   typename Result >
@@ -190,10 +187,9 @@ int
CudaReductionKernelLauncher( const Index size,
                             const Reduction& reduction,
                             const DataFetcher& dataFetcher,
                             const Real& zero,
                             const Result& zero,
                             Result*& output )
{
   using RealType = Real;
   using IndexType = Index;
   using ResultType = Result;

@@ -236,49 +232,49 @@ CudaReductionKernelLauncher( const Index size,
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case 256:
         cudaFuncSetCacheConfig(CudaReductionKernel< 256, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel< 256 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case 128:
         cudaFuncSetCacheConfig(CudaReductionKernel< 128, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel< 128 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case  64:
         cudaFuncSetCacheConfig(CudaReductionKernel<  64, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel<  64 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case  32:
         cudaFuncSetCacheConfig(CudaReductionKernel<  32, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel<  32 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case  16:
         cudaFuncSetCacheConfig(CudaReductionKernel<  16, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel<  16 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
     case   8:
         cudaFuncSetCacheConfig(CudaReductionKernel<   8, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel<   8 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case   4:
         cudaFuncSetCacheConfig(CudaReductionKernel<   4, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel<   4 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case   2:
         cudaFuncSetCacheConfig(CudaReductionKernel<   2, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel<   2 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
+6 −6
Original line number Diff line number Diff line
@@ -30,9 +30,9 @@ class Reduction< Devices::Cuda >
{
public:
   template< typename Index,
             typename Result,
             typename ReductionOperation,
             typename DataFetcher,
             typename Result = decltype( DataFetcher::operator() ) >
             typename DataFetcher >
   static Result
   reduce( const Index size,
           ReductionOperation& reduction,
@@ -45,9 +45,9 @@ class Reduction< Devices::Host >
{
public:
   template< typename Index,
             typename Result,
             typename ReductionOperation,
             typename DataFetcher,
             typename Result = decltype( DataFetcher::operator() ) >
             typename DataFetcher >
   static Result
   reduce( const Index size,
           ReductionOperation& reduction,
@@ -60,9 +60,9 @@ class Reduction< Devices::MIC >
{
public:
   template< typename Index,
             typename Result,
             typename ReductionOperation,
             typename DataFetcher,
             typename Result = decltype( DataFetcher::operator() ) >
             typename DataFetcher >
   static Result
   reduce( const Index size,
           ReductionOperation& reduction,
Loading