Commit 490cf20e authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Autodetection of the CUDA architecture added.

Parallel reduction in CUDA fixed.
Boundary conditions in linear system assembler fixed.
Min. iterations for linear solvers fixed.
parent edfd59d9
Loading
Loading
Loading
Loading
+42 −2
Original line number Diff line number Diff line
@@ -42,10 +42,50 @@ if( WITH_CUDA STREQUAL "yes" )
        set(CUSPARSE_LIBRARY /usr/local/cuda/lib64/libcusparse.so) # TODO: fix this              
        set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA )
        AddCompilerFlag( "-DHAVE_NOT_CXX11" ) # -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128 " )
        set( CUDA_ADD_EXECUTABLE_OPTIONS -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 )
        set( CUDA_ADD_LIBRARY_OPTIONS -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -shared )
        set( ALL_CUDA_ARCHS -gencode arch=compute_20,code=sm_20
                            -gencode arch=compute_30,code=sm_30
                            -gencode arch=compute_32,code=sm_32 
                            -gencode arch=compute_37,code=sm_37 
                            -gencode arch=compute_37,code=sm_37 
                            -gencode arch=compute_50,code=sm_50 
                            -gencode arch=compute_52,code=sm_52 )
        if( WITH_CUDA_ARCH STREQUAL "all" )
           set( CUDA_ARCH ${ALL_CUDA_ARCHS} )   
        else()
            if( WITH_CUDA_ARCH STREQUAL "auto")
                ####
                # Select GPU architecture
                #
                set( CUDA_ARCH_EXECUTABLE ${EXECUTABLE_OUTPUT_PATH}/cuda-arch-options)
                set( CUDA_ARCH_SOURCE ${PROJECT_SOURCE_DIR}/tools/src/cuda-arch-options.cu)
                message( "Compiling device-arch-options ..." )
                file( MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} )
                execute_process( COMMAND nvcc ${CUDA_ARCH_SOURCE} -o ${CUDA_ARCH_EXECUTABLE}
                                 RESULT_VARIABLE CUDA_ARCH_RESULT
                                 OUTPUT_VARIABLE CUDA_ARCH_OUTPUT
                                 ERROR_VARIABLE CUDA_ARCH_OUTPUT )
                execute_process( COMMAND ${CUDA_ARCH_EXECUTABLE}
                                 OUTPUT_VARIABLE CUDA_ARCH )
                if( NOT CUDA_ARCH_RESULT )
                    # strip linebreaks and convert to list delimited with ';'
                    string( REGEX REPLACE "[\n ]" ";" CUDA_ARCH ${CUDA_ARCH} )
                    # cache the result
                    set( CUDA_ARCH ${CUDA_ARCH} CACHE LIST "GPU architecture options" )
                else()
                    message( "Failed to detect GPU architecture:\n${CUDA_ARCH_OUTPUT}" )
                    message( "Using (almost) all GPU architectures as fallback." )
                    set( CUDA_ARCH ${ALL_CUDA_ARCHS} )
                endif()
                message( "GPU architecture options:  ${CUDA_ARCH}" )
            else()
                set( CUDA_ARCH -gencode arch=compute_${WITH_CUDA_ARCH},code=sm_${WITH_CUDA_ARCH} )
            endif()
        endif()
        set( CUDA_ADD_EXECUTABLE_OPTIONS ${CUDA_ARCH} )
        set( CUDA_ADD_LIBRARY_OPTIONS ${CUDA_ARCH} -shared )
        set( CUDA_LINKER_OPTIONS "-arch sm_20 -shared " )


        ####
        # Check for cuBLAS
        #
+6 −1
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ PREFIX=${HOME}/local
WITH_CUDA="yes"
WITH_TESTS="yes"

WITH_CUDA_ARCH="auto"
WITH_CUBLAS="no"
WITH_TEMPLATE_INSTANTIATION="yes"
INSTANTIATE_LONG_INT="yes"
@@ -27,6 +28,7 @@ do
        --with-tests=*                 ) WITH_TESTS="${option#*=}" ;;
        --with-cuda=*                  ) WITH_CUDA="${option#*=}" ;;
        --with-cublas=*                ) WITH_CUBLAS="${option#*=}" ;;
        --with-cuda-arch=*             ) WITH_CUDA_ARCH="${option#*=}";;
        --with-templates-instantiation ) WITH_TEMPLATE_INSTANTIATION="${option#*=}" ;;
        --instantiate-long-int=*       ) INSTANTIATE_LONG_INT="${option#*=}" ;;
        --instantiate-int=*            ) INSTANTIATE_INT="${option#*=}" ;;
@@ -37,7 +39,8 @@ do
                                         INSTANTIATE_INT="yes"
                                         INSTANTIATE_LONG_DOUBLE="no"
                                         INSTANTIATE_DOUBLE="yes"
                                         INSTANTIATE_FLOAT="no" ;;
                                         INSTANTIATE_FLOAT="no"
                                         WITH_CUDA_ARCH="auto" ;;
        --with-cmake=*                 ) CMAKE="${option#*=}" ;;
        --build-jobs=*                 ) BUILD_JOBS="${option#*=}" ;;
        --cmake-only=*                 ) CMAKE_ONLY="${option#*=}" ;;
@@ -58,6 +61,7 @@ then
    echo "   --build=Debug/Release                 Build type."
    echo "   --with-tests=yes/no                   Enable unit tests. 'yes' by default (libcppunit-dev is required)."
    echo "   --with-cuda=yes/no                    Enable CUDA. 'yes' by default (CUDA Toolkit is required)."
    echo "   --with-cuda-arch=all/auto/30/35/...   Choose CUDA architecture."   
    echo "   --with-templates-instantiation=yes/no Some TNL templates are precompiled during the build. 'yes' by default."
    echo "   --with-cmake=CMAKE                    Path to cmake. 'cmake' by default."
    echo "   --build-jobs=NUM                      Number of processes to be used for the build. It is set to a number of CPU cores by default."
@@ -73,6 +77,7 @@ ${CMAKE} ${ROOT_DIR} \
         -DCMAKE_BUILD_TYPE=${BUILD} \
         -DCMAKE_INSTALL_PREFIX=${PREFIX} \
         -DWITH_CUDA=${WITH_CUDA} \
         -DWITH_CUDA_ARCH=${WITH_CUDA_ARCH}
         -DWITH_CUBLAS=${WITH_CUBLAS} \
         -DWITH_TESTS=${WITH_TESTS} \
         -DPETSC_DIR=${PETSC_DIR} \
+2 −2
Original line number Diff line number Diff line
@@ -47,7 +47,7 @@ template< typename Operation >
__device__ void reduceAligned( const Operation& operation,
                               typename Operation :: IndexType tid,
                               typename Operation :: IndexType  s,
                               typename Operation :: ResultType* sdata )
                               volatile typename Operation :: ResultType* sdata )
{
   if( tid < s )
   {
@@ -67,7 +67,7 @@ __device__ void reduceNonAligned( const Operation& operation,
                                  typename Operation :: IndexType tid,
                                  typename Operation :: IndexType s,
                                  typename Operation :: IndexType n,
                                  typename Operation :: ResultType* sdata )
                                  volatile typename Operation :: ResultType* sdata )
{
   if( tid < s )
   {
+98 −21
Original line number Diff line number Diff line
@@ -50,6 +50,31 @@ __device__ inline double tnlCudaMin( const double& a,
   return fmin( a, b );
}

template< class T > __device__ T tnlCudaMin( volatile const T& a,
                                             volatile const T& b )
{
   return a < b ? a : b;
}

__device__ inline int tnlCudaMin( volatile const int& a,
                                  volatile const int& b )
{
   return min( a, b );
}

__device__ inline  float tnlCudaMin( volatile const float& a,
                                     volatile const float& b )
{
   return fminf( a, b );
}

__device__ inline  double tnlCudaMin( volatile const double& a,
                                      volatile const double& b )
{
   return fmin( a, b );
}


/***
 * This function returns maximum of two numbers stored on the device.
 */
@@ -77,6 +102,30 @@ __device__ inline double tnlCudaMax( const double& a,
   return fmax( a, b );
}

template< class T > __device__ T tnlCudaMax( volatile const T& a,
                                             volatile const T& b )
{
   return a > b ? a : b;
}

__device__  inline int tnlCudaMax( volatile const int& a,
                                   volatile const int& b )
{
   return max( a, b );
}

__device__  inline float tnlCudaMax( volatile const float& a,
                                     volatile const float& b )
{
   return fmaxf( a, b );
}

__device__  inline double tnlCudaMax( volatile const double& a,
                                      volatile const double& b )
{
   return fmax( a, b );
}

/***
 * This function returns absolute value of given number on the device.
 */
@@ -105,6 +154,32 @@ __device__ inline long double tnlCudaAbs( const long double& a )
   return fabs( ( double ) a );
}

__device__  inline int tnlCudaAbs( volatile const int& a )
{
   return abs( a );
}

__device__  inline long int tnlCudaAbs( volatile const long int& a )
{
   return abs( a );
}

__device__  inline float tnlCudaAbs( volatile const float& a )
{
   return fabs( a );
}

__device__  inline double tnlCudaAbs( volatile const double& a )
{
   return fabs( a );
}

__device__  inline long double tnlCudaAbs( volatile const long double& a )
{
   return fabs( ( double ) a );
}


template< typename Type1, typename Type2 >
__device__ Type1 tnlCudaPow( const Type1& x, const Type2& power )
{
@@ -173,7 +248,7 @@ class tnlParallelReductionSum

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return data[ idx1 ] + data[ idx2 ];
   };
@@ -261,7 +336,7 @@ class tnlParallelReductionMin

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
   };
@@ -330,7 +405,7 @@ class tnlParallelReductionMax

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
   };
@@ -399,7 +474,7 @@ class tnlParallelReductionAbsSum

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return data[ idx1 ] + data[ idx2 ];
   };
@@ -468,9 +543,10 @@ class tnlParallelReductionAbsMin

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
      volatile ResultType aux = tnlCudaAbs( data[ idx2 ] );
      return tnlCudaMin( data[ idx1 ],  aux );
   };
#endif
};
@@ -537,9 +613,10 @@ class tnlParallelReductionAbsMax

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
      volatile ResultType aux = tnlCudaAbs( data[ idx2 ] );
      return tnlCudaMax( data[ idx1 ], aux );
   };
#endif
};
@@ -606,7 +683,7 @@ class tnlParallelReductionLogicalAnd

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return data[ idx1 ] && data[ idx2 ];
   };
@@ -676,7 +753,7 @@ class tnlParallelReductionLogicalOr

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return data[ idx1 ] || data[ idx2 ];
   };
@@ -752,7 +829,7 @@ class tnlParallelReductionLpNorm

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return data[ idx1 ] + data[ idx2 ];
   };
@@ -827,7 +904,7 @@ class tnlParallelReductionEqualities

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return data[ idx1 ] && data[ idx2 ];
   };
@@ -898,7 +975,7 @@ class tnlParallelReductionInequalities

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return data[ idx1 ] && data[ idx2 ];
   };
@@ -969,7 +1046,7 @@ class tnlParallelReductionScalarProduct

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return data[ idx1 ] + data[ idx2 ];
   };
@@ -1039,7 +1116,7 @@ class tnlParallelReductionDiffSum

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return data[ idx1 ] + data[ idx2 ];
   };
@@ -1110,7 +1187,7 @@ class tnlParallelReductionDiffMin

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
   };
@@ -1182,7 +1259,7 @@ class tnlParallelReductionDiffMax

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
   };
@@ -1254,7 +1331,7 @@ class tnlParallelReductionDiffAbsSum

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return data[ idx1 ] + data[ idx2 ];
   };
@@ -1327,7 +1404,7 @@ class tnlParallelReductionDiffAbsMin

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      //return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
@@ -1401,7 +1478,7 @@ class tnlParallelReductionDiffAbsMax

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      //return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
@@ -1479,7 +1556,7 @@ class tnlParallelReductionDiffLpNorm

   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const ResultType* data ) const
                                                  volatile const ResultType* data ) const
   {
      return data[ idx1 ] + data[ idx2 ];
   };
+1 −3
Original line number Diff line number Diff line
@@ -648,7 +648,7 @@ void tnlCSRMatrix< Real, Device, Index >::spmvCudaVectorized( const InVector& in
                                                              const IndexType warpEnd,
                                                              const IndexType inWarpIdx ) const
{
   Real* aux = getSharedMemory< Real >();
   volatile Real* aux = getSharedMemory< Real >();
   for( IndexType row = warpStart; row < warpEnd; row++ )
   {
      aux[ threadIdx.x ] = 0.0;
@@ -672,8 +672,6 @@ void tnlCSRMatrix< Real, Device, Index >::spmvCudaVectorized( const InVector& in
         if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ];
      if( warpSize >= 2 )
         if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ];
      __syncthreads(); // TODO: I am not sure why - aux must be volatile

      if( inWarpIdx == 0 )
         outVector[ row ] = aux[ threadIdx.x ];
   }
Loading