Loading CMakeLists.txt +5 −6 Original line number Diff line number Diff line Loading @@ -33,8 +33,7 @@ if( WITH_TEMPLATE_EXPLICIT_INSTANTIATION STREQUAL "yes" ) endif() if( WITH_CUDA STREQUAL "yes" ) #AddCompilerFlag( "-DHAVE_NOT_CXX11 -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128 --relocatable-device-code=yes --device-c" ) AddCompilerFlag( "-DHAVE_NOT_CXX11 -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128" ) AddCompilerFlag( "-DHAVE_NOT_CXX11 -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128 -shared" ) else() AddCompilerFlag( "-std=gnu++0x" ) endif() Loading @@ -50,13 +49,13 @@ if( NOT WITH_CUDA STREQUAL "no" ) set(BUILD_SHARED_LIBS ON) set(CUDA_SEPARABLE_COMPILATION ON) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DHAVE_CUDA;--shared;--compiler-options -fPIC) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DHAVE_CUDA;--compiler-options '-fPIC','-shared') if( CUDA_ARCHITECTURE STREQUAL "2.0" ) #set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=compute_20;-code=sm_20;-DCUDA_ARCH=20) #set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch sm_20;-DCUDA_ARCH=20) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DCUDA_ARCH=20) endif() if( CUDA_ARCHITECTURE STREQUAL "2.1" ) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_21;-DCUDA_ARCH=21) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=compute_20;-code=sm_21;-DCUDA_ARCH=21) endif() if( CUDA_ARCHITECTURE STREQUAL "2.2" ) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_22;-DCUDA_ARCH=22) Loading @@ -65,7 +64,7 @@ if( NOT WITH_CUDA STREQUAL "no" ) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_23;-DCUDA_ARCH=23) endif() if( CUDA_ARCHITECTURE STREQUAL "3.0" ) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_30;-DCUDA_ARCH=30) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=compute_30;-code=sm_30;-DCUDA_ARCH=30) endif() #### Loading install +12 −10 Original line number Diff line number Diff line Loading @@ -10,6 +10,8 @@ VERBOSE="VERBOSE=1" CMAKE="cmake" CPUS=`grep -c processor /proc/cpuinfo` CPUS="1" echo "Building $TARGET using $CPUS processors." Loading @@ -34,14 +36,14 @@ make -j${CPUS} ${VERBOSE} make -j${CPUS} test make -j${CPUS} install cd ../Release ${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local \ -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} \ -DWITH_CUDA=${WITH_CUDA} \ -DWITH_CUSPARSE=${WITH_CUSPARSE} \ -DPETSC_DIR=${PETSC_DIR} \ -DWITH_TEMPLATE_EXPLICIT_INSTANTIATION=${TEMPLATE_EXPLICIT_INSTANTIATION} make -j${CPUS} ${VERBOSE} make -j${CPUS} test make -j${CPUS} install #cd ../Release #${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local \ # -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} \ # -DWITH_CUDA=${WITH_CUDA} \ # -DWITH_CUSPARSE=${WITH_CUSPARSE} \ # -DPETSC_DIR=${PETSC_DIR} \ # -DWITH_TEMPLATE_EXPLICIT_INSTANTIATION=${TEMPLATE_EXPLICIT_INSTANTIATION} #make -j${CPUS} ${VERBOSE} #make -j${CPUS} test #make -j${CPUS} install src/CMakeLists.txt +3 −2 Original line number Diff line number Diff line Loading @@ -30,7 +30,7 @@ set( tnl_CUDA__SOURCES ${tnl_generators_CUDA__SOURCES} if( BUILD_CUDA ) CUDA_ADD_LIBRARY( tnl${debugExt}-${tnlVersion} SHARED ${tnl_CUDA__SOURCES} OPTIONS -arch sm_20 ) OPTIONS -arch sm_20 -shared --compiler-options '-fPIC','-shared' ) else( BUILD_CUDA ) ADD_LIBRARY( tnl${debugExt}-${tnlVersion} SHARED ${tnl_SOURCES} ) Loading @@ -47,7 +47,8 @@ IF( BUILD_MPI ) if( BUILD_CUDA ) CUDA_ADD_LIBRARY( tnl-mpi${debugExt}-${tnlVersion} SHARED ${tnl_CUDA__SOURCES} OPTIONS -arch sm_20 ) OPTIONS -arch sm_20 -shared --compiler-options '-fPIC','-shared' ) #-arch sm_20 -shared --linker-options '-fPIC','-shared' else( BUILD_CUDA ) ADD_LIBRARY( tnl-mpi${debugExt}-${tnlVersion} SHARED ${tnl_SOURCES} ) Loading src/implementation/core/cuda/cuda-reduction-abs-max_impl.cu +1 −0 Original line number Diff line number Diff line Loading @@ -59,6 +59,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, in const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2, typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result ); template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > > ( const tnlParallelReductionAbsMax< char, long int >& operation, const typename tnlParallelReductionAbsMax< char, long int > :: IndexType size, Loading tests/benchmarks/CMakeLists.txt +2 −1 Original line number Diff line number Diff line Loading @@ -10,7 +10,8 @@ SET( tnlSpmvBenchmark_headers sparse-matrix-benchmark.h IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( tnl-sparse-matrix-benchmark${debugExt} sparse-matrix-benchmark.cu ) CUDA_ADD_EXECUTABLE( tnl-sparse-matrix-benchmark${debugExt} sparse-matrix-benchmark.cu OPTIONS -arch sm_20 -shared ) SET_TARGET_PROPERTIES( tnl-sparse-matrix-benchmark${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" ) ELSE() ADD_EXECUTABLE( tnl-sparse-matrix-benchmark${debugExt} sparse-matrix-benchmark.cpp ) Loading Loading
CMakeLists.txt +5 −6 Original line number Diff line number Diff line Loading @@ -33,8 +33,7 @@ if( WITH_TEMPLATE_EXPLICIT_INSTANTIATION STREQUAL "yes" ) endif() if( WITH_CUDA STREQUAL "yes" ) #AddCompilerFlag( "-DHAVE_NOT_CXX11 -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128 --relocatable-device-code=yes --device-c" ) AddCompilerFlag( "-DHAVE_NOT_CXX11 -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128" ) AddCompilerFlag( "-DHAVE_NOT_CXX11 -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128 -shared" ) else() AddCompilerFlag( "-std=gnu++0x" ) endif() Loading @@ -50,13 +49,13 @@ if( NOT WITH_CUDA STREQUAL "no" ) set(BUILD_SHARED_LIBS ON) set(CUDA_SEPARABLE_COMPILATION ON) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DHAVE_CUDA;--shared;--compiler-options -fPIC) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DHAVE_CUDA;--compiler-options '-fPIC','-shared') if( CUDA_ARCHITECTURE STREQUAL "2.0" ) #set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=compute_20;-code=sm_20;-DCUDA_ARCH=20) #set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch sm_20;-DCUDA_ARCH=20) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DCUDA_ARCH=20) endif() if( CUDA_ARCHITECTURE STREQUAL "2.1" ) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_21;-DCUDA_ARCH=21) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=compute_20;-code=sm_21;-DCUDA_ARCH=21) endif() if( CUDA_ARCHITECTURE STREQUAL "2.2" ) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_22;-DCUDA_ARCH=22) Loading @@ -65,7 +64,7 @@ if( NOT WITH_CUDA STREQUAL "no" ) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_23;-DCUDA_ARCH=23) endif() if( CUDA_ARCHITECTURE STREQUAL "3.0" ) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_30;-DCUDA_ARCH=30) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=compute_30;-code=sm_30;-DCUDA_ARCH=30) endif() #### Loading
install +12 −10 Original line number Diff line number Diff line Loading @@ -10,6 +10,8 @@ VERBOSE="VERBOSE=1" CMAKE="cmake" CPUS=`grep -c processor /proc/cpuinfo` CPUS="1" echo "Building $TARGET using $CPUS processors." Loading @@ -34,14 +36,14 @@ make -j${CPUS} ${VERBOSE} make -j${CPUS} test make -j${CPUS} install cd ../Release ${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local \ -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} \ -DWITH_CUDA=${WITH_CUDA} \ -DWITH_CUSPARSE=${WITH_CUSPARSE} \ -DPETSC_DIR=${PETSC_DIR} \ -DWITH_TEMPLATE_EXPLICIT_INSTANTIATION=${TEMPLATE_EXPLICIT_INSTANTIATION} make -j${CPUS} ${VERBOSE} make -j${CPUS} test make -j${CPUS} install #cd ../Release #${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local \ # -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} \ # -DWITH_CUDA=${WITH_CUDA} \ # -DWITH_CUSPARSE=${WITH_CUSPARSE} \ # -DPETSC_DIR=${PETSC_DIR} \ # -DWITH_TEMPLATE_EXPLICIT_INSTANTIATION=${TEMPLATE_EXPLICIT_INSTANTIATION} #make -j${CPUS} ${VERBOSE} #make -j${CPUS} test #make -j${CPUS} install
src/CMakeLists.txt +3 −2 Original line number Diff line number Diff line Loading @@ -30,7 +30,7 @@ set( tnl_CUDA__SOURCES ${tnl_generators_CUDA__SOURCES} if( BUILD_CUDA ) CUDA_ADD_LIBRARY( tnl${debugExt}-${tnlVersion} SHARED ${tnl_CUDA__SOURCES} OPTIONS -arch sm_20 ) OPTIONS -arch sm_20 -shared --compiler-options '-fPIC','-shared' ) else( BUILD_CUDA ) ADD_LIBRARY( tnl${debugExt}-${tnlVersion} SHARED ${tnl_SOURCES} ) Loading @@ -47,7 +47,8 @@ IF( BUILD_MPI ) if( BUILD_CUDA ) CUDA_ADD_LIBRARY( tnl-mpi${debugExt}-${tnlVersion} SHARED ${tnl_CUDA__SOURCES} OPTIONS -arch sm_20 ) OPTIONS -arch sm_20 -shared --compiler-options '-fPIC','-shared' ) #-arch sm_20 -shared --linker-options '-fPIC','-shared' else( BUILD_CUDA ) ADD_LIBRARY( tnl-mpi${debugExt}-${tnlVersion} SHARED ${tnl_SOURCES} ) Loading
src/implementation/core/cuda/cuda-reduction-abs-max_impl.cu +1 −0 Original line number Diff line number Diff line Loading @@ -59,6 +59,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, in const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2, typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result ); template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > > ( const tnlParallelReductionAbsMax< char, long int >& operation, const typename tnlParallelReductionAbsMax< char, long int > :: IndexType size, Loading
tests/benchmarks/CMakeLists.txt +2 −1 Original line number Diff line number Diff line Loading @@ -10,7 +10,8 @@ SET( tnlSpmvBenchmark_headers sparse-matrix-benchmark.h IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( tnl-sparse-matrix-benchmark${debugExt} sparse-matrix-benchmark.cu ) CUDA_ADD_EXECUTABLE( tnl-sparse-matrix-benchmark${debugExt} sparse-matrix-benchmark.cu OPTIONS -arch sm_20 -shared ) SET_TARGET_PROPERTIES( tnl-sparse-matrix-benchmark${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" ) ELSE() ADD_EXECUTABLE( tnl-sparse-matrix-benchmark${debugExt} sparse-matrix-benchmark.cpp ) Loading