Commit 8b4397ce authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Refactoring the reduction on the CUDA device.

parent 13573312
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -25,10 +25,10 @@ cd Debug
${CMAKE} .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR}
make -j${CPUS} #VERBOSE=1
make -j${CPUS} test
#make -j${CPUS} install
make -j${CPUS} install

cd ../Release
${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR}
make -j${CPUS} #VERBOSE=1
make -j${CPUS} test
#make -j${CPUS} install
make -j${CPUS} install
+1 −2
Original line number Diff line number Diff line
@@ -24,7 +24,6 @@ set (headers tnlArray.h
   		    tnlTimerCPU.h  
   		    tnlTimerRT.h  
   		    tnlTuple.h  
   	       tnlCudaSupport.h 
   		    mfilename.h 
   		    mfuncs.h 
   		    mpi-supp.h 
+4 −1
Original line number Diff line number Diff line
set( headers device-check.h
    )
             reduction.h
             reduction-operations.h )

INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/core/cuda )
 No newline at end of file
+292 −21
Original line number Diff line number Diff line
@@ -22,11 +22,7 @@
#include <cuda.h>
#include <core/mfuncs.h>

enum tnlTupleOperation {  tnlParallelReductionMax,
                          tnlParallelReductionAbsMin,
                          tnlParallelReductionAbsMax,
                          tnlParallelReductionAbsSum,
                          tnlParallelReductionLpNorm,
enum tnlTupleOperation {  tnlParallelReductionLpNorm,
                          tnlParallelReductionSdot };


@@ -109,30 +105,39 @@ class tnlParallelReductionSum

   typedef Real RealType;
   typedef Index IndexType;
   typedef Real ResultType;

   ResultType initialValueOnHost( const IndexType idx,
                                  const RealType* data1,
                                  const RealType* data2 ) const
   {
      return data1[ idx ];
   };

   RealType reduceOnHost( const RealType& data1,
                          const RealType& data2 ) const
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
                            const RealType* data2 ) const
   {
      return data1 + data2;
   }
      return current + data1[ idx ];
   };

   __device__ RealType initialValueOnDevice( const IndexType idx1,
   __device__ ResultType initialValueOnDevice( const IndexType idx1,
                                               const IndexType idx2,
                                               const RealType* data1,
                                               const RealType* data2 ) const
   {
      return data1[ idx1 ] + data1[ idx2 ];
   }
   };

   __device__ RealType initialValueOnDevice( const IndexType idx1,
   __device__ ResultType initialValueOnDevice( const IndexType idx1,
                                               const RealType* data1,
                                               const RealType* data2 ) const
   {
      return data1[ idx1 ];
   };

   __device__ RealType firstReductionOnDevice( const IndexType idx1,
   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
                                                 const IndexType idx2,
                                                 const IndexType idx3,
                                                 const RealType* data1,
@@ -142,7 +147,7 @@ class tnlParallelReductionSum
      return data1[ idx1 ] + data2[ idx2 ] + data2[ idx3 ];
   };

   __device__ RealType firstReductionOnDevice( const IndexType idx1,
   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
                                                 const IndexType idx2,
                                                 const RealType* data1,
                                                 const RealType* data2,
@@ -151,9 +156,9 @@ class tnlParallelReductionSum
      return data1[ idx1 ] + data2[ idx2 ];
   };

   __device__ RealType commonReductionOnDevice( const IndexType idx1,
   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                const RealType* data ) const
                                                  const ResultType* data ) const
   {
      return data[ idx1 ] + data[ idx2 ];
   };
@@ -167,12 +172,20 @@ class tnlParallelReductionMin
   typedef Real RealType;
   typedef Index IndexType;

   RealType initialValueOnHost( const IndexType idx,
                                const RealType* data1,
                                const RealType* data2 ) const
   {
      return data1[ idx ];
   };

   RealType reduceOnHost( const RealType& data1,
                          const RealType& data2 ) const
   RealType reduceOnHost( const IndexType idx,
                          const RealType& current,
                          const RealType* data1,
                          const RealType* data2 ) const
   {
      return Min( data1, data2 );
   }
      return Min( current, data1[ idx ] );
   };

   __device__ RealType initialValueOnDevice( const IndexType idx1,
                                             const IndexType idx2,
@@ -216,7 +229,265 @@ class tnlParallelReductionMin
   };
};

template< typename Real, typename Index >
class tnlParallelReductionMax
{
   public:

   typedef Real RealType;
   typedef Index IndexType;

   RealType initialValueOnHost( const IndexType idx,
                                const RealType* data1,
                                const RealType* data2 ) const
   {
      return data1[ idx ];
   };

   RealType reduceOnHost( const IndexType idx,
                          const RealType& current,
                          const RealType* data1,
                          const RealType* data2 ) const
   {
      return Max( current, data1[ idx ] );
   };

   __device__ RealType initialValueOnDevice( const IndexType idx1,
                                             const IndexType idx2,
                                             const RealType* data1,
                                             const RealType* data2 ) const
   {
      return tnlCudaMax( data1[ idx1 ], data1[ idx2 ] );
   }

   __device__ RealType initialValueOnDevice( const IndexType idx1,
                                             const RealType* data1,
                                             const RealType* data2 ) const
   {
      return data1[ idx1 ];
   };

   __device__ RealType firstReductionOnDevice( const IndexType idx1,
                                               const IndexType idx2,
                                               const IndexType idx3,
                                               const RealType* data1,
                                               const RealType* data2,
                                               const RealType* data3 ) const
   {
      return tnlCudaMax( data1[ idx1 ], tnlCudaMax( data2[ idx2 ], data2[ idx3 ] ) );
   };

   __device__ RealType firstReductionOnDevice( const IndexType idx1,
                                               const IndexType idx2,
                                               const RealType* data1,
                                               const RealType* data2,
                                               const RealType* data3 ) const
   {
      return tnlCudaMax( data1[ idx1 ], data2[ idx2 ] );
   };

   __device__ RealType commonReductionOnDevice( const IndexType idx1,
                                                const IndexType idx2,
                                                const RealType* data ) const
   {
      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
   };
};

template< typename Real, typename Index >
class tnlParallelReductionAbsSum
{
   public:

   typedef Real RealType;
   typedef Index IndexType;

   RealType initialValueOnHost( const IndexType idx,
                                const RealType* data1,
                                const RealType* data2 ) const
   {
      return tnlAbs( data1[ idx ] );
   };

   RealType reduceOnHost( const IndexType idx,
                          const RealType& current,
                          const RealType* data1,
                          const RealType* data2 ) const
   {
      return current + tnlAbs( data1[ idx ] );
   };

   __device__ RealType initialValueOnDevice( const IndexType idx1,
                                             const IndexType idx2,
                                             const RealType* data1,
                                             const RealType* data2 ) const
   {
      return tnlCudaAbs( data1[ idx1 ] ) + tnlCudaAbs( data1[ idx2 ] );
   };

   __device__ RealType initialValueOnDevice( const IndexType idx1,
                                             const RealType* data1,
                                             const RealType* data2 ) const
   {
      return tnlCudaAbs( data1[ idx1 ] );
   };

   __device__ RealType firstReductionOnDevice( const IndexType idx1,
                                               const IndexType idx2,
                                               const IndexType idx3,
                                               const RealType* data1,
                                               const RealType* data2,
                                               const RealType* data3 ) const
   {
      return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] ) + tnlCudaAbs( data2[ idx3 ] );
   };

   __device__ RealType firstReductionOnDevice( const IndexType idx1,
                                               const IndexType idx2,
                                               const RealType* data1,
                                               const RealType* data2,
                                               const RealType* data3 ) const
   {
      return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] );
   };

   __device__ RealType commonReductionOnDevice( const IndexType idx1,
                                                const IndexType idx2,
                                                const RealType* data ) const
   {
      return data[ idx1 ] + data[ idx2 ];
   };
};

template< typename Real, typename Index >
class tnlParallelReductionAbsMin
{
   public:

   typedef Real RealType;
   typedef Index IndexType;

   RealType initialValueOnHost( const IndexType idx,
                                const RealType* data1,
                                const RealType* data2 ) const
   {
      return tnlAbs( data1[ idx ] );
   };

   RealType reduceOnHost( const IndexType idx,
                          const RealType& current,
                          const RealType* data1,
                          const RealType* data2 ) const
   {
      return Min( current, tnlAbs( data1[ idx ] ) );
   };

   __device__ RealType initialValueOnDevice( const IndexType idx1,
                                             const IndexType idx2,
                                             const RealType* data1,
                                             const RealType* data2 ) const
   {
      return tnlCudaMin( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
   }

   __device__ RealType initialValueOnDevice( const IndexType idx1,
                                             const RealType* data1,
                                             const RealType* data2 ) const
   {
      return tnlCudaAbs( data1[ idx1 ] );
   };

   __device__ RealType firstReductionOnDevice( const IndexType idx1,
                                               const IndexType idx2,
                                               const IndexType idx3,
                                               const RealType* data1,
                                               const RealType* data2,
                                               const RealType* data3 ) const
   {
      return tnlCudaMin( data1[ idx1 ], tnlCudaMin(  tnlCudaAbs( data2[ idx2 ] ),  tnlCudaAbs( data2[ idx3 ] ) ) );
   };

   __device__ RealType firstReductionOnDevice( const IndexType idx1,
                                               const IndexType idx2,
                                               const RealType* data1,
                                               const RealType* data2,
                                               const RealType* data3 ) const
   {
      return tnlCudaMin( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
   };

   __device__ RealType commonReductionOnDevice( const IndexType idx1,
                                                const IndexType idx2,
                                                const RealType* data ) const
   {
      return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
   };
};

template< typename Real, typename Index >
class tnlParallelReductionAbsMax
{
   public:

   typedef Real RealType;
   typedef Index IndexType;

   RealType initialValueOnHost( const IndexType idx,
                                const RealType* data1,
                                const RealType* data2 ) const
   {
      return tnlAbs( data1[ idx ] );
   };

   RealType reduceOnHost( const IndexType idx,
                          const RealType& current,
                          const RealType* data1,
                          const RealType* data2 ) const
   {
      return Max( current, tnlAbs( data1[ idx ] ) );
   };

   __device__ RealType initialValueOnDevice( const IndexType idx1,
                                             const IndexType idx2,
                                             const RealType* data1,
                                             const RealType* data2 ) const
   {
      return tnlCudaMax( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
   }

   __device__ RealType initialValueOnDevice( const IndexType idx1,
                                             const RealType* data1,
                                             const RealType* data2 ) const
   {
      return tnlCudaAbs( data1[ idx1 ] );
   };

   __device__ RealType firstReductionOnDevice( const IndexType idx1,
                                               const IndexType idx2,
                                               const IndexType idx3,
                                               const RealType* data1,
                                               const RealType* data2,
                                               const RealType* data3 ) const
   {
      return tnlCudaMax( data1[ idx1 ], tnlCudaMax( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) );
   };

   __device__ RealType firstReductionOnDevice( const IndexType idx1,
                                               const IndexType idx2,
                                               const RealType* data1,
                                               const RealType* data2,
                                               const RealType* data3 ) const
   {
      return tnlCudaMax( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
   };

   __device__ RealType commonReductionOnDevice( const IndexType idx1,
                                                const IndexType idx2,
                                                const RealType* data ) const
   {
      return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
   };
};


#include <implementation/core/cuda/reduction-operations_impl.h>
+1 −1
Original line number Diff line number Diff line
@@ -2,7 +2,7 @@ ADD_SUBDIRECTORY( cuda )

SET( headers cuda-long-vector-kernels.h
             vector-operations.h
             memory-functions.h
             memory-operations.h
             tnlArray_impl.h
             tnlHost_impl.h
             tnlLogger_impl.h
Loading