Autodetection of the CUDA architecture added. (490cf20e) · Commits · TNL / tnl-dev

CMakeLists.txt

+42 −2

Original line number	Diff line number	Diff line
		@@ -42,10 +42,50 @@ if( WITH_CUDA STREQUAL "yes" )
		set(CUSPARSE_LIBRARY /usr/local/cuda/lib64/libcusparse.so) # TODO: fix this
		set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA )
		AddCompilerFlag( "-DHAVE_NOT_CXX11" ) # -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128 " )
		set( CUDA_ADD_EXECUTABLE_OPTIONS -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 )
		set( CUDA_ADD_LIBRARY_OPTIONS -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -shared )
		set( ALL_CUDA_ARCHS -gencode arch=compute_20,code=sm_20
		-gencode arch=compute_30,code=sm_30
		-gencode arch=compute_32,code=sm_32
		-gencode arch=compute_37,code=sm_37
		-gencode arch=compute_37,code=sm_37
		-gencode arch=compute_50,code=sm_50
		-gencode arch=compute_52,code=sm_52 )
		if( WITH_CUDA_ARCH STREQUAL "all" )
		set( CUDA_ARCH ${ALL_CUDA_ARCHS} )
		else()
		if( WITH_CUDA_ARCH STREQUAL "auto")
		####
		# Select GPU architecture
		#
		set( CUDA_ARCH_EXECUTABLE ${EXECUTABLE_OUTPUT_PATH}/cuda-arch-options)
		set( CUDA_ARCH_SOURCE ${PROJECT_SOURCE_DIR}/tools/src/cuda-arch-options.cu)
		message( "Compiling device-arch-options ..." )
		file( MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} )
		execute_process( COMMAND nvcc ${CUDA_ARCH_SOURCE} -o ${CUDA_ARCH_EXECUTABLE}
		RESULT_VARIABLE CUDA_ARCH_RESULT
		OUTPUT_VARIABLE CUDA_ARCH_OUTPUT
		ERROR_VARIABLE CUDA_ARCH_OUTPUT )
		execute_process( COMMAND ${CUDA_ARCH_EXECUTABLE}
		OUTPUT_VARIABLE CUDA_ARCH )
		if( NOT CUDA_ARCH_RESULT )
		# strip linebreaks and convert to list delimited with ';'
		string( REGEX REPLACE "[\n ]" ";" CUDA_ARCH ${CUDA_ARCH} )
		# cache the result
		set( CUDA_ARCH ${CUDA_ARCH} CACHE LIST "GPU architecture options" )
		else()
		message( "Failed to detect GPU architecture:\n${CUDA_ARCH_OUTPUT}" )
		message( "Using (almost) all GPU architectures as fallback." )
		set( CUDA_ARCH ${ALL_CUDA_ARCHS} )
		endif()
		message( "GPU architecture options: ${CUDA_ARCH}" )
		else()
		set( CUDA_ARCH -gencode arch=compute_${WITH_CUDA_ARCH},code=sm_${WITH_CUDA_ARCH} )
		endif()
		endif()
		set( CUDA_ADD_EXECUTABLE_OPTIONS ${CUDA_ARCH} )
		set( CUDA_ADD_LIBRARY_OPTIONS ${CUDA_ARCH} -shared )
		set( CUDA_LINKER_OPTIONS "-arch sm_20 -shared " )


		####
		# Check for cuBLAS
		#

build

+6 −1

Original line number	Diff line number	Diff line
		@@ -5,6 +5,7 @@ PREFIX=${HOME}/local
		WITH_CUDA="yes"
		WITH_TESTS="yes"

		WITH_CUDA_ARCH="auto"
		WITH_CUBLAS="no"
		WITH_TEMPLATE_INSTANTIATION="yes"
		INSTANTIATE_LONG_INT="yes"
		@@ -27,6 +28,7 @@ do
		--with-tests=* ) WITH_TESTS="${option#*=}" ;;
		--with-cuda=* ) WITH_CUDA="${option#*=}" ;;
		--with-cublas=* ) WITH_CUBLAS="${option#*=}" ;;
		--with-cuda-arch=* ) WITH_CUDA_ARCH="${option#*=}";;
		--with-templates-instantiation ) WITH_TEMPLATE_INSTANTIATION="${option#*=}" ;;
		--instantiate-long-int=* ) INSTANTIATE_LONG_INT="${option#*=}" ;;
		--instantiate-int=* ) INSTANTIATE_INT="${option#*=}" ;;
		@@ -37,7 +39,8 @@ do
		INSTANTIATE_INT="yes"
		INSTANTIATE_LONG_DOUBLE="no"
		INSTANTIATE_DOUBLE="yes"
		INSTANTIATE_FLOAT="no" ;;
		INSTANTIATE_FLOAT="no"
		WITH_CUDA_ARCH="auto" ;;
		--with-cmake=* ) CMAKE="${option#*=}" ;;
		--build-jobs=* ) BUILD_JOBS="${option#*=}" ;;
		--cmake-only=* ) CMAKE_ONLY="${option#*=}" ;;
		@@ -58,6 +61,7 @@ then
		echo " --build=Debug/Release Build type."
		echo " --with-tests=yes/no Enable unit tests. 'yes' by default (libcppunit-dev is required)."
		echo " --with-cuda=yes/no Enable CUDA. 'yes' by default (CUDA Toolkit is required)."
		echo " --with-cuda-arch=all/auto/30/35/... Choose CUDA architecture."
		echo " --with-templates-instantiation=yes/no Some TNL templates are precompiled during the build. 'yes' by default."
		echo " --with-cmake=CMAKE Path to cmake. 'cmake' by default."
		echo " --build-jobs=NUM Number of processes to be used for the build. It is set to a number of CPU cores by default."
		@@ -73,6 +77,7 @@ ${CMAKE} ${ROOT_DIR} \
		-DCMAKE_BUILD_TYPE=${BUILD} \
		-DCMAKE_INSTALL_PREFIX=${PREFIX} \
		-DWITH_CUDA=${WITH_CUDA} \
		-DWITH_CUDA_ARCH=${WITH_CUDA_ARCH}
		-DWITH_CUBLAS=${WITH_CUBLAS} \
		-DWITH_TESTS=${WITH_TESTS} \
		-DPETSC_DIR=${PETSC_DIR} \

src/core/cuda/cuda-reduction_impl.h

+2 −2

Original line number	Diff line number	Diff line
		@@ -47,7 +47,7 @@ template< typename Operation >
		__device__ void reduceAligned( const Operation& operation,
		typename Operation :: IndexType tid,
		typename Operation :: IndexType s,
		typename Operation :: ResultType* sdata )
		volatile typename Operation :: ResultType* sdata )
		{
		if( tid < s )
		{
		@@ -67,7 +67,7 @@ __device__ void reduceNonAligned( const Operation& operation,
		typename Operation :: IndexType tid,
		typename Operation :: IndexType s,
		typename Operation :: IndexType n,
		typename Operation :: ResultType* sdata )
		volatile typename Operation :: ResultType* sdata )
		{
		if( tid < s )
		{

src/core/cuda/reduction-operations.h

+98 −21

Original line number	Diff line number	Diff line
		@@ -50,6 +50,31 @@ __device__ inline double tnlCudaMin( const double& a,
		return fmin( a, b );
		}

		template< class T > __device__ T tnlCudaMin( volatile const T& a,
		volatile const T& b )
		{
		return a < b ? a : b;
		}

		__device__ inline int tnlCudaMin( volatile const int& a,
		volatile const int& b )
		{
		return min( a, b );
		}

		__device__ inline float tnlCudaMin( volatile const float& a,
		volatile const float& b )
		{
		return fminf( a, b );
		}

		__device__ inline double tnlCudaMin( volatile const double& a,
		volatile const double& b )
		{
		return fmin( a, b );
		}


		/***
		* This function returns maximum of two numbers stored on the device.
		*/
		@@ -77,6 +102,30 @@ __device__ inline double tnlCudaMax( const double& a,
		return fmax( a, b );
		}

		template< class T > __device__ T tnlCudaMax( volatile const T& a,
		volatile const T& b )
		{
		return a > b ? a : b;
		}

		__device__ inline int tnlCudaMax( volatile const int& a,
		volatile const int& b )
		{
		return max( a, b );
		}

		__device__ inline float tnlCudaMax( volatile const float& a,
		volatile const float& b )
		{
		return fmaxf( a, b );
		}

		__device__ inline double tnlCudaMax( volatile const double& a,
		volatile const double& b )
		{
		return fmax( a, b );
		}

		/***
		* This function returns absolute value of given number on the device.
		*/
		@@ -105,6 +154,32 @@ __device__ inline long double tnlCudaAbs( const long double& a )
		return fabs( ( double ) a );
		}

		__device__ inline int tnlCudaAbs( volatile const int& a )
		{
		return abs( a );
		}

		__device__ inline long int tnlCudaAbs( volatile const long int& a )
		{
		return abs( a );
		}

		__device__ inline float tnlCudaAbs( volatile const float& a )
		{
		return fabs( a );
		}

		__device__ inline double tnlCudaAbs( volatile const double& a )
		{
		return fabs( a );
		}

		__device__ inline long double tnlCudaAbs( volatile const long double& a )
		{
		return fabs( ( double ) a );
		}


		template< typename Type1, typename Type2 >
		__device__ Type1 tnlCudaPow( const Type1& x, const Type2& power )
		{
		@@ -173,7 +248,7 @@ class tnlParallelReductionSum

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return data[ idx1 ] + data[ idx2 ];
		};
		@@ -261,7 +336,7 @@ class tnlParallelReductionMin

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
		};
		@@ -330,7 +405,7 @@ class tnlParallelReductionMax

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
		};
		@@ -399,7 +474,7 @@ class tnlParallelReductionAbsSum

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return data[ idx1 ] + data[ idx2 ];
		};
		@@ -468,9 +543,10 @@ class tnlParallelReductionAbsMin

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
		volatile ResultType aux = tnlCudaAbs( data[ idx2 ] );
		return tnlCudaMin( data[ idx1 ], aux );
		};
		#endif
		};
		@@ -537,9 +613,10 @@ class tnlParallelReductionAbsMax

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
		volatile ResultType aux = tnlCudaAbs( data[ idx2 ] );
		return tnlCudaMax( data[ idx1 ], aux );
		};
		#endif
		};
		@@ -606,7 +683,7 @@ class tnlParallelReductionLogicalAnd

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return data[ idx1 ] && data[ idx2 ];
		};
		@@ -676,7 +753,7 @@ class tnlParallelReductionLogicalOr

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return data[ idx1 ] \|\| data[ idx2 ];
		};
		@@ -752,7 +829,7 @@ class tnlParallelReductionLpNorm

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return data[ idx1 ] + data[ idx2 ];
		};
		@@ -827,7 +904,7 @@ class tnlParallelReductionEqualities

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return data[ idx1 ] && data[ idx2 ];
		};
		@@ -898,7 +975,7 @@ class tnlParallelReductionInequalities

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return data[ idx1 ] && data[ idx2 ];
		};
		@@ -969,7 +1046,7 @@ class tnlParallelReductionScalarProduct

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return data[ idx1 ] + data[ idx2 ];
		};
		@@ -1039,7 +1116,7 @@ class tnlParallelReductionDiffSum

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return data[ idx1 ] + data[ idx2 ];
		};
		@@ -1110,7 +1187,7 @@ class tnlParallelReductionDiffMin

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
		};
		@@ -1182,7 +1259,7 @@ class tnlParallelReductionDiffMax

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
		};
		@@ -1254,7 +1331,7 @@ class tnlParallelReductionDiffAbsSum

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return data[ idx1 ] + data[ idx2 ];
		};
		@@ -1327,7 +1404,7 @@ class tnlParallelReductionDiffAbsMin

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		//return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
		return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
		@@ -1401,7 +1478,7 @@ class tnlParallelReductionDiffAbsMax

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		//return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
		return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
		@@ -1479,7 +1556,7 @@ class tnlParallelReductionDiffLpNorm

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const ResultType* data ) const
		volatile const ResultType* data ) const
		{
		return data[ idx1 ] + data[ idx2 ];
		};

src/matrices/tnlCSRMatrix_impl.h

+1 −3

Original line number	Diff line number	Diff line
		@@ -648,7 +648,7 @@ void tnlCSRMatrix< Real, Device, Index >::spmvCudaVectorized( const InVector& in
		const IndexType warpEnd,
		const IndexType inWarpIdx ) const
		{
		Real* aux = getSharedMemory< Real >();
		volatile Real* aux = getSharedMemory< Real >();
		for( IndexType row = warpStart; row < warpEnd; row++ )
		{
		aux[ threadIdx.x ] = 0.0;
		@@ -672,8 +672,6 @@ void tnlCSRMatrix< Real, Device, Index >::spmvCudaVectorized( const InVector& in
		if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ];
		if( warpSize >= 2 )
		if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ];
		__syncthreads(); // TODO: I am not sure why - aux must be volatile

		if( inWarpIdx == 0 )
		outVector[ row ] = aux[ threadIdx.x ];
		}