Loading CMakeLists.txt +42 −2 Original line number Diff line number Diff line Loading @@ -42,10 +42,50 @@ if( WITH_CUDA STREQUAL "yes" ) set(CUSPARSE_LIBRARY /usr/local/cuda/lib64/libcusparse.so) # TODO: fix this set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA ) AddCompilerFlag( "-DHAVE_NOT_CXX11" ) # -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128 " ) set( CUDA_ADD_EXECUTABLE_OPTIONS -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 ) set( CUDA_ADD_LIBRARY_OPTIONS -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -shared ) set( ALL_CUDA_ARCHS -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_32,code=sm_32 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 ) if( WITH_CUDA_ARCH STREQUAL "all" ) set( CUDA_ARCH ${ALL_CUDA_ARCHS} ) else() if( WITH_CUDA_ARCH STREQUAL "auto") #### # Select GPU architecture # set( CUDA_ARCH_EXECUTABLE ${EXECUTABLE_OUTPUT_PATH}/cuda-arch-options) set( CUDA_ARCH_SOURCE ${PROJECT_SOURCE_DIR}/tools/src/cuda-arch-options.cu) message( "Compiling device-arch-options ..." ) file( MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} ) execute_process( COMMAND nvcc ${CUDA_ARCH_SOURCE} -o ${CUDA_ARCH_EXECUTABLE} RESULT_VARIABLE CUDA_ARCH_RESULT OUTPUT_VARIABLE CUDA_ARCH_OUTPUT ERROR_VARIABLE CUDA_ARCH_OUTPUT ) execute_process( COMMAND ${CUDA_ARCH_EXECUTABLE} OUTPUT_VARIABLE CUDA_ARCH ) if( NOT CUDA_ARCH_RESULT ) # strip linebreaks and convert to list delimited with ';' string( REGEX REPLACE "[\n ]" ";" CUDA_ARCH ${CUDA_ARCH} ) # cache the result set( CUDA_ARCH ${CUDA_ARCH} CACHE LIST "GPU architecture options" ) else() message( "Failed to detect GPU architecture:\n${CUDA_ARCH_OUTPUT}" ) message( "Using (almost) all GPU architectures as fallback." ) set( CUDA_ARCH ${ALL_CUDA_ARCHS} ) endif() message( "GPU architecture options: ${CUDA_ARCH}" ) else() set( CUDA_ARCH -gencode arch=compute_${WITH_CUDA_ARCH},code=sm_${WITH_CUDA_ARCH} ) endif() endif() set( CUDA_ADD_EXECUTABLE_OPTIONS ${CUDA_ARCH} ) set( CUDA_ADD_LIBRARY_OPTIONS ${CUDA_ARCH} -shared ) set( CUDA_LINKER_OPTIONS "-arch sm_20 -shared " ) #### # Check for cuBLAS # Loading build +6 −1 Original line number Diff line number Diff line Loading @@ -5,6 +5,7 @@ PREFIX=${HOME}/local WITH_CUDA="yes" WITH_TESTS="yes" WITH_CUDA_ARCH="auto" WITH_CUBLAS="no" WITH_TEMPLATE_INSTANTIATION="yes" INSTANTIATE_LONG_INT="yes" Loading @@ -27,6 +28,7 @@ do --with-tests=* ) WITH_TESTS="${option#*=}" ;; --with-cuda=* ) WITH_CUDA="${option#*=}" ;; --with-cublas=* ) WITH_CUBLAS="${option#*=}" ;; --with-cuda-arch=* ) WITH_CUDA_ARCH="${option#*=}";; --with-templates-instantiation ) WITH_TEMPLATE_INSTANTIATION="${option#*=}" ;; --instantiate-long-int=* ) INSTANTIATE_LONG_INT="${option#*=}" ;; --instantiate-int=* ) INSTANTIATE_INT="${option#*=}" ;; Loading @@ -37,7 +39,8 @@ do INSTANTIATE_INT="yes" INSTANTIATE_LONG_DOUBLE="no" INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="no" ;; INSTANTIATE_FLOAT="no" WITH_CUDA_ARCH="auto" ;; --with-cmake=* ) CMAKE="${option#*=}" ;; --build-jobs=* ) BUILD_JOBS="${option#*=}" ;; --cmake-only=* ) CMAKE_ONLY="${option#*=}" ;; Loading @@ -58,6 +61,7 @@ then echo " --build=Debug/Release Build type." echo " --with-tests=yes/no Enable unit tests. 'yes' by default (libcppunit-dev is required)." echo " --with-cuda=yes/no Enable CUDA. 'yes' by default (CUDA Toolkit is required)." echo " --with-cuda-arch=all/auto/30/35/... Choose CUDA architecture." echo " --with-templates-instantiation=yes/no Some TNL templates are precompiled during the build. 'yes' by default." echo " --with-cmake=CMAKE Path to cmake. 'cmake' by default." echo " --build-jobs=NUM Number of processes to be used for the build. It is set to a number of CPU cores by default." Loading @@ -73,6 +77,7 @@ ${CMAKE} ${ROOT_DIR} \ -DCMAKE_BUILD_TYPE=${BUILD} \ -DCMAKE_INSTALL_PREFIX=${PREFIX} \ -DWITH_CUDA=${WITH_CUDA} \ -DWITH_CUDA_ARCH=${WITH_CUDA_ARCH} -DWITH_CUBLAS=${WITH_CUBLAS} \ -DWITH_TESTS=${WITH_TESTS} \ -DPETSC_DIR=${PETSC_DIR} \ Loading src/core/cuda/cuda-reduction_impl.h +2 −2 Original line number Diff line number Diff line Loading @@ -47,7 +47,7 @@ template< typename Operation > __device__ void reduceAligned( const Operation& operation, typename Operation :: IndexType tid, typename Operation :: IndexType s, typename Operation :: ResultType* sdata ) volatile typename Operation :: ResultType* sdata ) { if( tid < s ) { Loading @@ -67,7 +67,7 @@ __device__ void reduceNonAligned( const Operation& operation, typename Operation :: IndexType tid, typename Operation :: IndexType s, typename Operation :: IndexType n, typename Operation :: ResultType* sdata ) volatile typename Operation :: ResultType* sdata ) { if( tid < s ) { Loading src/core/cuda/reduction-operations.h +98 −21 Original line number Diff line number Diff line Loading @@ -50,6 +50,31 @@ __device__ inline double tnlCudaMin( const double& a, return fmin( a, b ); } template< class T > __device__ T tnlCudaMin( volatile const T& a, volatile const T& b ) { return a < b ? a : b; } __device__ inline int tnlCudaMin( volatile const int& a, volatile const int& b ) { return min( a, b ); } __device__ inline float tnlCudaMin( volatile const float& a, volatile const float& b ) { return fminf( a, b ); } __device__ inline double tnlCudaMin( volatile const double& a, volatile const double& b ) { return fmin( a, b ); } /*** * This function returns maximum of two numbers stored on the device. */ Loading Loading @@ -77,6 +102,30 @@ __device__ inline double tnlCudaMax( const double& a, return fmax( a, b ); } template< class T > __device__ T tnlCudaMax( volatile const T& a, volatile const T& b ) { return a > b ? a : b; } __device__ inline int tnlCudaMax( volatile const int& a, volatile const int& b ) { return max( a, b ); } __device__ inline float tnlCudaMax( volatile const float& a, volatile const float& b ) { return fmaxf( a, b ); } __device__ inline double tnlCudaMax( volatile const double& a, volatile const double& b ) { return fmax( a, b ); } /*** * This function returns absolute value of given number on the device. */ Loading Loading @@ -105,6 +154,32 @@ __device__ inline long double tnlCudaAbs( const long double& a ) return fabs( ( double ) a ); } __device__ inline int tnlCudaAbs( volatile const int& a ) { return abs( a ); } __device__ inline long int tnlCudaAbs( volatile const long int& a ) { return abs( a ); } __device__ inline float tnlCudaAbs( volatile const float& a ) { return fabs( a ); } __device__ inline double tnlCudaAbs( volatile const double& a ) { return fabs( a ); } __device__ inline long double tnlCudaAbs( volatile const long double& a ) { return fabs( ( double ) a ); } template< typename Type1, typename Type2 > __device__ Type1 tnlCudaPow( const Type1& x, const Type2& power ) { Loading Loading @@ -173,7 +248,7 @@ class tnlParallelReductionSum __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading Loading @@ -261,7 +336,7 @@ class tnlParallelReductionMin __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return tnlCudaMin( data[ idx1 ], data[ idx2 ] ); }; Loading Loading @@ -330,7 +405,7 @@ class tnlParallelReductionMax __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return tnlCudaMax( data[ idx1 ], data[ idx2 ] ); }; Loading Loading @@ -399,7 +474,7 @@ class tnlParallelReductionAbsSum __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading Loading @@ -468,9 +543,10 @@ class tnlParallelReductionAbsMin __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) ); volatile ResultType aux = tnlCudaAbs( data[ idx2 ] ); return tnlCudaMin( data[ idx1 ], aux ); }; #endif }; Loading Loading @@ -537,9 +613,10 @@ class tnlParallelReductionAbsMax __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) ); volatile ResultType aux = tnlCudaAbs( data[ idx2 ] ); return tnlCudaMax( data[ idx1 ], aux ); }; #endif }; Loading Loading @@ -606,7 +683,7 @@ class tnlParallelReductionLogicalAnd __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] && data[ idx2 ]; }; Loading Loading @@ -676,7 +753,7 @@ class tnlParallelReductionLogicalOr __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] || data[ idx2 ]; }; Loading Loading @@ -752,7 +829,7 @@ class tnlParallelReductionLpNorm __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading Loading @@ -827,7 +904,7 @@ class tnlParallelReductionEqualities __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] && data[ idx2 ]; }; Loading Loading @@ -898,7 +975,7 @@ class tnlParallelReductionInequalities __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] && data[ idx2 ]; }; Loading Loading @@ -969,7 +1046,7 @@ class tnlParallelReductionScalarProduct __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading Loading @@ -1039,7 +1116,7 @@ class tnlParallelReductionDiffSum __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading Loading @@ -1110,7 +1187,7 @@ class tnlParallelReductionDiffMin __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return tnlCudaMin( data[ idx1 ], data[ idx2 ] ); }; Loading Loading @@ -1182,7 +1259,7 @@ class tnlParallelReductionDiffMax __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return tnlCudaMax( data[ idx1 ], data[ idx2 ] ); }; Loading Loading @@ -1254,7 +1331,7 @@ class tnlParallelReductionDiffAbsSum __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading Loading @@ -1327,7 +1404,7 @@ class tnlParallelReductionDiffAbsMin __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { //return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) ); return tnlCudaMin( data[ idx1 ], data[ idx2 ] ); Loading Loading @@ -1401,7 +1478,7 @@ class tnlParallelReductionDiffAbsMax __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { //return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) ); return tnlCudaMax( data[ idx1 ], data[ idx2 ] ); Loading Loading @@ -1479,7 +1556,7 @@ class tnlParallelReductionDiffLpNorm __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading src/matrices/tnlCSRMatrix_impl.h +1 −3 Original line number Diff line number Diff line Loading @@ -648,7 +648,7 @@ void tnlCSRMatrix< Real, Device, Index >::spmvCudaVectorized( const InVector& in const IndexType warpEnd, const IndexType inWarpIdx ) const { Real* aux = getSharedMemory< Real >(); volatile Real* aux = getSharedMemory< Real >(); for( IndexType row = warpStart; row < warpEnd; row++ ) { aux[ threadIdx.x ] = 0.0; Loading @@ -672,8 +672,6 @@ void tnlCSRMatrix< Real, Device, Index >::spmvCudaVectorized( const InVector& in if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ]; if( warpSize >= 2 ) if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ]; __syncthreads(); // TODO: I am not sure why - aux must be volatile if( inWarpIdx == 0 ) outVector[ row ] = aux[ threadIdx.x ]; } Loading Loading
CMakeLists.txt +42 −2 Original line number Diff line number Diff line Loading @@ -42,10 +42,50 @@ if( WITH_CUDA STREQUAL "yes" ) set(CUSPARSE_LIBRARY /usr/local/cuda/lib64/libcusparse.so) # TODO: fix this set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA ) AddCompilerFlag( "-DHAVE_NOT_CXX11" ) # -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128 " ) set( CUDA_ADD_EXECUTABLE_OPTIONS -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 ) set( CUDA_ADD_LIBRARY_OPTIONS -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -shared ) set( ALL_CUDA_ARCHS -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_32,code=sm_32 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 ) if( WITH_CUDA_ARCH STREQUAL "all" ) set( CUDA_ARCH ${ALL_CUDA_ARCHS} ) else() if( WITH_CUDA_ARCH STREQUAL "auto") #### # Select GPU architecture # set( CUDA_ARCH_EXECUTABLE ${EXECUTABLE_OUTPUT_PATH}/cuda-arch-options) set( CUDA_ARCH_SOURCE ${PROJECT_SOURCE_DIR}/tools/src/cuda-arch-options.cu) message( "Compiling device-arch-options ..." ) file( MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} ) execute_process( COMMAND nvcc ${CUDA_ARCH_SOURCE} -o ${CUDA_ARCH_EXECUTABLE} RESULT_VARIABLE CUDA_ARCH_RESULT OUTPUT_VARIABLE CUDA_ARCH_OUTPUT ERROR_VARIABLE CUDA_ARCH_OUTPUT ) execute_process( COMMAND ${CUDA_ARCH_EXECUTABLE} OUTPUT_VARIABLE CUDA_ARCH ) if( NOT CUDA_ARCH_RESULT ) # strip linebreaks and convert to list delimited with ';' string( REGEX REPLACE "[\n ]" ";" CUDA_ARCH ${CUDA_ARCH} ) # cache the result set( CUDA_ARCH ${CUDA_ARCH} CACHE LIST "GPU architecture options" ) else() message( "Failed to detect GPU architecture:\n${CUDA_ARCH_OUTPUT}" ) message( "Using (almost) all GPU architectures as fallback." ) set( CUDA_ARCH ${ALL_CUDA_ARCHS} ) endif() message( "GPU architecture options: ${CUDA_ARCH}" ) else() set( CUDA_ARCH -gencode arch=compute_${WITH_CUDA_ARCH},code=sm_${WITH_CUDA_ARCH} ) endif() endif() set( CUDA_ADD_EXECUTABLE_OPTIONS ${CUDA_ARCH} ) set( CUDA_ADD_LIBRARY_OPTIONS ${CUDA_ARCH} -shared ) set( CUDA_LINKER_OPTIONS "-arch sm_20 -shared " ) #### # Check for cuBLAS # Loading
build +6 −1 Original line number Diff line number Diff line Loading @@ -5,6 +5,7 @@ PREFIX=${HOME}/local WITH_CUDA="yes" WITH_TESTS="yes" WITH_CUDA_ARCH="auto" WITH_CUBLAS="no" WITH_TEMPLATE_INSTANTIATION="yes" INSTANTIATE_LONG_INT="yes" Loading @@ -27,6 +28,7 @@ do --with-tests=* ) WITH_TESTS="${option#*=}" ;; --with-cuda=* ) WITH_CUDA="${option#*=}" ;; --with-cublas=* ) WITH_CUBLAS="${option#*=}" ;; --with-cuda-arch=* ) WITH_CUDA_ARCH="${option#*=}";; --with-templates-instantiation ) WITH_TEMPLATE_INSTANTIATION="${option#*=}" ;; --instantiate-long-int=* ) INSTANTIATE_LONG_INT="${option#*=}" ;; --instantiate-int=* ) INSTANTIATE_INT="${option#*=}" ;; Loading @@ -37,7 +39,8 @@ do INSTANTIATE_INT="yes" INSTANTIATE_LONG_DOUBLE="no" INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="no" ;; INSTANTIATE_FLOAT="no" WITH_CUDA_ARCH="auto" ;; --with-cmake=* ) CMAKE="${option#*=}" ;; --build-jobs=* ) BUILD_JOBS="${option#*=}" ;; --cmake-only=* ) CMAKE_ONLY="${option#*=}" ;; Loading @@ -58,6 +61,7 @@ then echo " --build=Debug/Release Build type." echo " --with-tests=yes/no Enable unit tests. 'yes' by default (libcppunit-dev is required)." echo " --with-cuda=yes/no Enable CUDA. 'yes' by default (CUDA Toolkit is required)." echo " --with-cuda-arch=all/auto/30/35/... Choose CUDA architecture." echo " --with-templates-instantiation=yes/no Some TNL templates are precompiled during the build. 'yes' by default." echo " --with-cmake=CMAKE Path to cmake. 'cmake' by default." echo " --build-jobs=NUM Number of processes to be used for the build. It is set to a number of CPU cores by default." Loading @@ -73,6 +77,7 @@ ${CMAKE} ${ROOT_DIR} \ -DCMAKE_BUILD_TYPE=${BUILD} \ -DCMAKE_INSTALL_PREFIX=${PREFIX} \ -DWITH_CUDA=${WITH_CUDA} \ -DWITH_CUDA_ARCH=${WITH_CUDA_ARCH} -DWITH_CUBLAS=${WITH_CUBLAS} \ -DWITH_TESTS=${WITH_TESTS} \ -DPETSC_DIR=${PETSC_DIR} \ Loading
src/core/cuda/cuda-reduction_impl.h +2 −2 Original line number Diff line number Diff line Loading @@ -47,7 +47,7 @@ template< typename Operation > __device__ void reduceAligned( const Operation& operation, typename Operation :: IndexType tid, typename Operation :: IndexType s, typename Operation :: ResultType* sdata ) volatile typename Operation :: ResultType* sdata ) { if( tid < s ) { Loading @@ -67,7 +67,7 @@ __device__ void reduceNonAligned( const Operation& operation, typename Operation :: IndexType tid, typename Operation :: IndexType s, typename Operation :: IndexType n, typename Operation :: ResultType* sdata ) volatile typename Operation :: ResultType* sdata ) { if( tid < s ) { Loading
src/core/cuda/reduction-operations.h +98 −21 Original line number Diff line number Diff line Loading @@ -50,6 +50,31 @@ __device__ inline double tnlCudaMin( const double& a, return fmin( a, b ); } template< class T > __device__ T tnlCudaMin( volatile const T& a, volatile const T& b ) { return a < b ? a : b; } __device__ inline int tnlCudaMin( volatile const int& a, volatile const int& b ) { return min( a, b ); } __device__ inline float tnlCudaMin( volatile const float& a, volatile const float& b ) { return fminf( a, b ); } __device__ inline double tnlCudaMin( volatile const double& a, volatile const double& b ) { return fmin( a, b ); } /*** * This function returns maximum of two numbers stored on the device. */ Loading Loading @@ -77,6 +102,30 @@ __device__ inline double tnlCudaMax( const double& a, return fmax( a, b ); } template< class T > __device__ T tnlCudaMax( volatile const T& a, volatile const T& b ) { return a > b ? a : b; } __device__ inline int tnlCudaMax( volatile const int& a, volatile const int& b ) { return max( a, b ); } __device__ inline float tnlCudaMax( volatile const float& a, volatile const float& b ) { return fmaxf( a, b ); } __device__ inline double tnlCudaMax( volatile const double& a, volatile const double& b ) { return fmax( a, b ); } /*** * This function returns absolute value of given number on the device. */ Loading Loading @@ -105,6 +154,32 @@ __device__ inline long double tnlCudaAbs( const long double& a ) return fabs( ( double ) a ); } __device__ inline int tnlCudaAbs( volatile const int& a ) { return abs( a ); } __device__ inline long int tnlCudaAbs( volatile const long int& a ) { return abs( a ); } __device__ inline float tnlCudaAbs( volatile const float& a ) { return fabs( a ); } __device__ inline double tnlCudaAbs( volatile const double& a ) { return fabs( a ); } __device__ inline long double tnlCudaAbs( volatile const long double& a ) { return fabs( ( double ) a ); } template< typename Type1, typename Type2 > __device__ Type1 tnlCudaPow( const Type1& x, const Type2& power ) { Loading Loading @@ -173,7 +248,7 @@ class tnlParallelReductionSum __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading Loading @@ -261,7 +336,7 @@ class tnlParallelReductionMin __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return tnlCudaMin( data[ idx1 ], data[ idx2 ] ); }; Loading Loading @@ -330,7 +405,7 @@ class tnlParallelReductionMax __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return tnlCudaMax( data[ idx1 ], data[ idx2 ] ); }; Loading Loading @@ -399,7 +474,7 @@ class tnlParallelReductionAbsSum __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading Loading @@ -468,9 +543,10 @@ class tnlParallelReductionAbsMin __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) ); volatile ResultType aux = tnlCudaAbs( data[ idx2 ] ); return tnlCudaMin( data[ idx1 ], aux ); }; #endif }; Loading Loading @@ -537,9 +613,10 @@ class tnlParallelReductionAbsMax __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) ); volatile ResultType aux = tnlCudaAbs( data[ idx2 ] ); return tnlCudaMax( data[ idx1 ], aux ); }; #endif }; Loading Loading @@ -606,7 +683,7 @@ class tnlParallelReductionLogicalAnd __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] && data[ idx2 ]; }; Loading Loading @@ -676,7 +753,7 @@ class tnlParallelReductionLogicalOr __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] || data[ idx2 ]; }; Loading Loading @@ -752,7 +829,7 @@ class tnlParallelReductionLpNorm __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading Loading @@ -827,7 +904,7 @@ class tnlParallelReductionEqualities __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] && data[ idx2 ]; }; Loading Loading @@ -898,7 +975,7 @@ class tnlParallelReductionInequalities __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] && data[ idx2 ]; }; Loading Loading @@ -969,7 +1046,7 @@ class tnlParallelReductionScalarProduct __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading Loading @@ -1039,7 +1116,7 @@ class tnlParallelReductionDiffSum __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading Loading @@ -1110,7 +1187,7 @@ class tnlParallelReductionDiffMin __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return tnlCudaMin( data[ idx1 ], data[ idx2 ] ); }; Loading Loading @@ -1182,7 +1259,7 @@ class tnlParallelReductionDiffMax __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return tnlCudaMax( data[ idx1 ], data[ idx2 ] ); }; Loading Loading @@ -1254,7 +1331,7 @@ class tnlParallelReductionDiffAbsSum __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading Loading @@ -1327,7 +1404,7 @@ class tnlParallelReductionDiffAbsMin __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { //return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) ); return tnlCudaMin( data[ idx1 ], data[ idx2 ] ); Loading Loading @@ -1401,7 +1478,7 @@ class tnlParallelReductionDiffAbsMax __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { //return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) ); return tnlCudaMax( data[ idx1 ], data[ idx2 ] ); Loading Loading @@ -1479,7 +1556,7 @@ class tnlParallelReductionDiffLpNorm __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const ResultType* data ) const volatile const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading
src/matrices/tnlCSRMatrix_impl.h +1 −3 Original line number Diff line number Diff line Loading @@ -648,7 +648,7 @@ void tnlCSRMatrix< Real, Device, Index >::spmvCudaVectorized( const InVector& in const IndexType warpEnd, const IndexType inWarpIdx ) const { Real* aux = getSharedMemory< Real >(); volatile Real* aux = getSharedMemory< Real >(); for( IndexType row = warpStart; row < warpEnd; row++ ) { aux[ threadIdx.x ] = 0.0; Loading @@ -672,8 +672,6 @@ void tnlCSRMatrix< Real, Device, Index >::spmvCudaVectorized( const InVector& in if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ]; if( warpSize >= 2 ) if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ]; __syncthreads(); // TODO: I am not sure why - aux must be volatile if( inWarpIdx == 0 ) outVector[ row ] = aux[ threadIdx.x ]; } Loading