Commit 6ae56ba0 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Fixing the CUDA parallel reduction.

parent 7ed08fd0
Loading
Loading
Loading
Loading
+13 −4
Original line number Diff line number Diff line
@@ -28,9 +28,15 @@ else()
    AddCompilerFlag( "-O3 -DNDEBUG" )
endif()

if( WITH_TEMPLATE_EXPLICIT_INSTANTIATION STREQUAL "yes" )
   AddCompilerFlag( "-DTEMPLATE_EXPLICIT_INSTANTIATION" )
endif()   

if( WITH_CUDA STREQUAL "yes" )
   AddCompilerFlag( "-DHAVE_NOT_CXX11" )
else()
   AddCompilerFlag( "-std=gnu++0x" )
#AddCompilerFlag( "-DHAVE_NOT_CXX11" )
endif()      


#####
@@ -288,7 +294,10 @@ if(WIN32 AND NOT UNIX)
  set(CPACK_NSIS_MODIFY_PATH ON)
else(WIN32 AND NOT UNIX)
#  set(CPACK_STRIP_FILES "bin/MyExecutable")
  set(CPACK_SOURCE_STRIP_FILES "")
endif(WIN32 AND NOT UNIX)

# Both entries have to be passed in a single set() call: a second set() on the
# same variable replaces the previous value, which would leave only "Release".
# NOTE(review): if the goal is to keep the Debug/Release build directories out
# of the source package, CPACK_SOURCE_IGNORE_FILES may be the intended
# variable - confirm before relying on this.
set(CPACK_SOURCE_STRIP_FILES "Debug" "Release")

#set(CPACK_PACKAGE_EXECUTABLES "MyExecutable" "My Executable")
INCLUDE( CPack )
+15 −4
Original line number Diff line number Diff line
@@ -2,14 +2,14 @@

TARGET=TNL
INSTALL_PREFIX=${HOME}/local
WITH_CUDA=no
WITH_CUDA=yes
WITH_CUSPARSE=no
CUDA_ARCHITECTURE=2.0
TEMPLATE_EXPLICIT_INSTANTIATION=yes
VERBOSE=1

CMAKE="cmake"
CPUS=`grep -c processor /proc/cpuinfo`
CPUS=1

echo "Building $TARGET using $CPUS processors."

@@ -23,13 +23,24 @@ then
fi

cd Debug
${CMAKE} .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR}
${CMAKE} .. -DCMAKE_BUILD_TYPE=Debug \
            -DCMAKE_INSTALL_PREFIX=${HOME}/local \
            -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} \
            -DWITH_CUDA=${WITH_CUDA} \
            -DWITH_CUSPARSE=${WITH_CUSPARSE} \
            -DPETSC_DIR=${PETSC_DIR} \
            -DWITH_TEMPLATE_EXPLICIT_INSTANTIATION=${TEMPLATE_EXPLICIT_INSTANTIATION}
make -j${CPUS} #VERBOSE=1
make -j${CPUS} test
make -j${CPUS} install

cd ../Release
${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR}
${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local \
            -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} \
            -DWITH_CUDA=${WITH_CUDA} \
            -DWITH_CUSPARSE=${WITH_CUSPARSE} \
            -DPETSC_DIR=${PETSC_DIR} \
            -DWITH_TEMPLATE_EXPLICIT_INSTANTIATION=${TEMPLATE_EXPLICIT_INSTANTIATION}
make -j${CPUS} #VERBOSE=1
make -j${CPUS} test
make -j${CPUS} install
+17 −11
Original line number Diff line number Diff line
@@ -93,6 +93,12 @@ __device__ inline double tnlCudaAbs( const double& a )
{
   return fabs( a );
}

/****
 * Device-side power function usable with mixed argument types.
 *
 * Both the base x and the exponent power are converted to double, the
 * double-precision pow() is evaluated and the result is converted back
 * to the type of the base (Type1).
 */
template< typename Type1, typename Type2 >
__device__ Type1 tnlCudaPow( const Type1& x, const Type2& power )
{
   const double base = static_cast< double >( x );
   const double exponent = static_cast< double >( power );
   return static_cast< Type1 >( pow( base, exponent ) );
}
#endif

template< typename Real, typename Index >
@@ -683,14 +689,14 @@ class tnlParallelReductionLpNorm
                                               const RealType* data1,
                                               const RealType* data2 ) const
   {
      return pow( tnlCudaAbs( data1[ idx1 ] ), p ) + pow( tnlCudaAbs( data1[ idx2 ] ), p );
      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] ), p ) + tnlCudaPow( tnlCudaAbs( data1[ idx2 ] ), p );
   }

   __device__ ResultType initialValueOnDevice( const IndexType idx1,
                                               const RealType* data1,
                                               const RealType* data2 ) const
   {
      return pow( tnlCudaAbs( data1[ idx1 ] ), p );
      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] ), p );
   };

   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
@@ -701,8 +707,8 @@ class tnlParallelReductionLpNorm
                                                 const RealType* data3 ) const
   {
      return data1[ idx1 ] +
             pow( tnlCudaAbs( data2[ idx2 ] ), p ) +
             pow( tnlCudaAbs( data2[ idx3 ] ), p );
             tnlCudaPow( tnlCudaAbs( data2[ idx2 ] ), p ) +
             tnlCudaPow( tnlCudaAbs( data2[ idx3 ] ), p );
   };

   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
@@ -711,7 +717,7 @@ class tnlParallelReductionLpNorm
                                                 const RealType* data2,
                                                 const RealType* data3 ) const
   {
      return data1[ idx1 ] + pow( tnlCudaAbs( data2[ idx2 ] ), p );
      return data1[ idx1 ] + tnlCudaPow( tnlCudaAbs( data2[ idx2 ] ), p );
   };

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
@@ -1409,15 +1415,15 @@ class tnlParallelReductionDiffLpNorm
                                               const RealType* data1,
                                               const RealType* data2 ) const
   {
      return pow( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ), p ) +
             pow( tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] ), p );
      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ), p ) +
             tnlCudaPow( tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] ), p );
   }

   __device__ ResultType initialValueOnDevice( const IndexType idx1,
                                               const RealType* data1,
                                               const RealType* data2 ) const
   {
      return pow( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ), p );
      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ), p );
   };

   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
@@ -1428,8 +1434,8 @@ class tnlParallelReductionDiffLpNorm
                                                 const RealType* data3 ) const
   {
      return data1[ idx1 ] +
             pow( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ), p ) +
             pow( tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] ), p );
             tnlCudaPow( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ), p ) +
             tnlCudaPow( tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] ), p );
   };

   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
@@ -1438,7 +1444,7 @@ class tnlParallelReductionDiffLpNorm
                                                 const RealType* data2,
                                                 const RealType* data3 ) const
   {
      return data1[ idx1 ] + pow( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ), p );
      return data1[ idx1 ] + tnlCudaPow( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ), p );
   };

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
+4 −4
Original line number Diff line number Diff line
@@ -21,19 +21,19 @@
#include <math.h>
#include <stdlib.h>

template< class T > T Min( const T& a, const T& b )
template< typename Type1, typename Type2 > Type1 Min( const Type1& a, const Type2& b )
{
   return a < b ? a : b;
};

template< class T > T Max( const T& a, const T& b )
template< typename Type1, typename Type2 > Type1 Max( const Type1& a, const Type2& b )
{
   return a > b ? a : b;
};

template< class T > void Swap( T& a, T& b )
template< typename Type > void Swap( Type& a, Type& b )
{
   T tmp( a );
   Type tmp( a );
   a = b;
   b = tmp;
};
+8 −0
Original line number Diff line number Diff line
@@ -21,6 +21,14 @@
#include <core/tnlHost.h>
#include <core/tnlCuda.h>

/****
 * In this code we do not use constants and references as we would like to,
 * because OpenMP would otherwise refuse to compile the code with:
 *
 *  error: ‘some-variable’ is predetermined ‘shared’ for ‘firstprivate’
 *
 */

#ifdef HAVE_CUDA

template< typename Real, typename Index >
Loading