Loading build +26 −25 Original line number Diff line number Diff line Loading @@ -29,17 +29,18 @@ do --with-cuda=* ) WITH_CUDA="${option#*=}" ;; --with-cublas=* ) WITH_CUBLAS="${option#*=}" ;; --with-cuda-arch=* ) WITH_CUDA_ARCH="${option#*=}";; --with-templates-instantiation ) WITH_TEMPLATE_INSTANTIATION="${option#*=}" ;; --with-templates-instantiation=* ) WITH_TEMPLATE_INSTANTIATION="${option#*=}" ;; --instantiate-long-int=* ) INSTANTIATE_LONG_INT="${option#*=}" ;; --instantiate-int=* ) INSTANTIATE_INT="${option#*=}" ;; --instantiate-long-double=* ) INSTANTIATE_LONG_DOUBLE="${option#*=}" ;; --instantiate-double=* ) INSTANTIATE_DOUBLE="${option#*=}" ;; --instantiate-float=* ) INSTANTIATE_FLOAT="${option#*=}" ;; --full-build ) INSTANTIATE_LONG_INT="yes" --fast-build ) INSTANTIATE_LONG_INT="no" INSTANTIATE_INT="yes" INSTANTIATE_LONG_DOUBLE="yes" INSTANTIATE_LONG_DOUBLE="no" INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="yes";; INSTANTIATE_FLOAT="no" WITH_CUDA_ARCH="auto" ;; --with-cmake=* ) CMAKE="${option#*=}" ;; --build-jobs=* ) BUILD_JOBS="${option#*=}" ;; --cmake-only=* ) CMAKE_ONLY="${option#*=}" ;; Loading src/core/arrays/tnlArrayOperationsCuda_impl.h +5 −2 Original line number Diff line number Diff line Loading @@ -256,6 +256,9 @@ bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory( const Element1* dest const Element2* source, const Index size ) { /*** * Here, destination is on host and source is on CUDA device. */ tnlAssert( destination, ); tnlAssert( source, ); tnlAssert( size >= 0, cerr << "size = " << size ); Loading @@ -280,7 +283,7 @@ bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory( const Element1* dest delete[] host_buffer; return false; } if( ! tnlArrayOperations< tnlHost >::compareMemory( host_buffer, destination, transfer ) ) if( ! tnlArrayOperations< tnlHost >::compareMemory( &destination[ compared ], host_buffer, transfer ) ) { delete[] host_buffer; return false; Loading src/core/cuda/CMakeLists.txt +3 −1 Original line number Diff line number Diff line Loading @@ -20,6 +20,7 @@ IF( BUILD_CUDA ) ${CURRENT_DIR}/cuda-reduction-abs-max_impl.cu ${CURRENT_DIR}/cuda-reduction-and_impl.cu ${CURRENT_DIR}/cuda-reduction-or_impl.cu ${CURRENT_DIR}/cuda-reduction-l2-norm_impl.cu ${CURRENT_DIR}/cuda-reduction-lp-norm_impl.cu ${CURRENT_DIR}/cuda-reduction-equalities_impl.cu ${CURRENT_DIR}/cuda-reduction-inequalities_impl.cu Loading @@ -30,6 +31,7 @@ IF( BUILD_CUDA ) ${CURRENT_DIR}/cuda-reduction-diff-abs-sum_impl.cu ${CURRENT_DIR}/cuda-reduction-diff-abs-min_impl.cu ${CURRENT_DIR}/cuda-reduction-diff-abs-max_impl.cu ${CURRENT_DIR}/cuda-reduction-diff-l2-norm_impl.cu ${CURRENT_DIR}/cuda-reduction-diff-lp-norm_impl.cu ${CURRENT_DIR}/cuda-prefix-sum_impl.cu PARENT_SCOPE ) Loading src/core/cuda/cuda-prefix-sum_impl.cu +8 −8 Original line number Diff line number Diff line Loading @@ -23,7 +23,7 @@ template bool cudaPrefixSum( const int size, const int blockSize, const int *deviceInput, int* deviceOutput, const tnlParallelReductionSum< int, int >& operation, tnlParallelReductionSum< int, int >& operation, const enumPrefixSumType prefixSumType ); Loading @@ -32,7 +32,7 @@ template bool cudaPrefixSum( const int size, const int blockSize, const float *deviceInput, float* deviceOutput, const tnlParallelReductionSum< float, int >& operation, tnlParallelReductionSum< float, int >& operation, const enumPrefixSumType prefixSumType ); #endif Loading @@ -40,7 +40,7 @@ template bool cudaPrefixSum( const int size, const int blockSize, const double *deviceInput, double* deviceOutput, const tnlParallelReductionSum< double, int >& operation, tnlParallelReductionSum< double, int >& operation, const enumPrefixSumType prefixSumType ); #ifdef INSTANTIATE_LONG_DOUBLE Loading @@ -48,7 +48,7 @@ template bool cudaPrefixSum( const int size, const int blockSize, const long double *deviceInput, long double* deviceOutput, const tnlParallelReductionSum< long double, int >& operation, tnlParallelReductionSum< long double, int >& operation, const enumPrefixSumType prefixSumType ); #endif Loading @@ -57,7 +57,7 @@ template bool cudaPrefixSum( const long int size, const long int blockSize, const int *deviceInput, int* deviceOutput, const tnlParallelReductionSum< int, long int >& operation, tnlParallelReductionSum< int, long int >& operation, const enumPrefixSumType prefixSumType ); Loading @@ -66,7 +66,7 @@ template bool cudaPrefixSum( const long int size, const long int blockSize, const float *deviceInput, float* deviceOutput, const tnlParallelReductionSum< float, long int >& operation, tnlParallelReductionSum< float, long int >& operation, const enumPrefixSumType prefixSumType ); #endif Loading @@ -74,7 +74,7 @@ template bool cudaPrefixSum( const long int size, const long int blockSize, const double *deviceInput, double* deviceOutput, const tnlParallelReductionSum< double, long int >& operation, tnlParallelReductionSum< double, long int >& operation, const enumPrefixSumType prefixSumType ); #ifdef INSTANTIATE_LONG_DOUBLE Loading @@ -82,7 +82,7 @@ template bool cudaPrefixSum( const long int size, const long int blockSize, const long double *deviceInput, long double* deviceOutput, const tnlParallelReductionSum< long double, long int >& operation, tnlParallelReductionSum< long double, long int >& operation, const enumPrefixSumType prefixSumType ); #endif #endif Loading src/core/cuda/cuda-prefix-sum_impl.h +13 −13 Original line number Diff line number Diff line Loading @@ -30,7 +30,7 @@ template< typename DataType, typename Operation, typename Index > __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumType, const Operation operation, Operation operation, const Index size, const Index elementsInBlock, const DataType* input, Loading Loading @@ -159,7 +159,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT template< typename DataType, typename Operation, typename Index > __global__ void cudaSecondPhaseBlockPrefixSum( const Operation operation, __global__ void cudaSecondPhaseBlockPrefixSum( Operation operation, const Index size, const Index elementsInBlock, const Index gridShift, Loading @@ -186,7 +186,7 @@ template< typename DataType, typename Operation, typename Index > bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType, const Operation& operation, Operation& operation, const Index size, const Index blockSize, const Index elementsInBlock, Loading Loading @@ -276,7 +276,7 @@ template< typename DataType, typename Operation, typename Index > bool cudaGridPrefixSum( enumPrefixSumType prefixSumType, const Operation& operation, Operation& operation, const Index size, const Index blockSize, const Index elementsInBlock, Loading Loading @@ -313,7 +313,7 @@ bool cudaPrefixSum( const Index size, const Index blockSize, const DataType *deviceInput, DataType* deviceOutput, const Operation& operation, Operation& operation, const enumPrefixSumType prefixSumType ) { /**** Loading Loading @@ -361,7 +361,7 @@ extern template bool cudaPrefixSum( const int size, const int blockSize, const int *deviceInput, int* deviceOutput, const tnlParallelReductionSum< int, int >& operation, tnlParallelReductionSum< int, int >& operation, const enumPrefixSumType prefixSumType ); Loading @@ -369,14 +369,14 @@ extern template bool cudaPrefixSum( const int size, const int blockSize, const float *deviceInput, float* deviceOutput, const tnlParallelReductionSum< float, int >& operation, tnlParallelReductionSum< float, int >& operation, const enumPrefixSumType prefixSumType ); extern template bool cudaPrefixSum( const int size, const int blockSize, const double *deviceInput, double* deviceOutput, const tnlParallelReductionSum< double, int >& operation, tnlParallelReductionSum< double, int >& operation, const enumPrefixSumType prefixSumType ); #ifdef INSTANTIATE_LONG_DOUBLE Loading @@ -384,7 +384,7 @@ extern template bool cudaPrefixSum( const int size, const int blockSize, const long double *deviceInput, long double* deviceOutput, const tnlParallelReductionSum< long double, int >& operation, tnlParallelReductionSum< long double, int >& operation, const enumPrefixSumType prefixSumType ); #endif Loading @@ -393,7 +393,7 @@ extern template bool cudaPrefixSum( const long int size, const long int blockSize, const int *deviceInput, int* deviceOutput, const tnlParallelReductionSum< int, long int >& operation, tnlParallelReductionSum< int, long int >& operation, const enumPrefixSumType prefixSumType ); Loading @@ -401,14 +401,14 @@ extern template bool cudaPrefixSum( const long int size, const long int blockSize, const float *deviceInput, float* deviceOutput, const tnlParallelReductionSum< float, long int >& operation, tnlParallelReductionSum< float, long int >& operation, const enumPrefixSumType prefixSumType ); extern template bool cudaPrefixSum( const long int size, const long int blockSize, const double *deviceInput, double* deviceOutput, const tnlParallelReductionSum< double, long int >& operation, tnlParallelReductionSum< double, long int >& operation, const enumPrefixSumType prefixSumType ); #ifdef INSTANTIATE_LONG_DOUBLE Loading @@ -416,7 +416,7 @@ extern template bool cudaPrefixSum( const long int size, const long int blockSize, const long double *deviceInput, long double* deviceOutput, const tnlParallelReductionSum< long double, long int >& operation, tnlParallelReductionSum< long double, long int >& operation, const enumPrefixSumType prefixSumType ); #endif #endif Loading Loading
build +26 −25 Original line number Diff line number Diff line Loading @@ -29,17 +29,18 @@ do --with-cuda=* ) WITH_CUDA="${option#*=}" ;; --with-cublas=* ) WITH_CUBLAS="${option#*=}" ;; --with-cuda-arch=* ) WITH_CUDA_ARCH="${option#*=}";; --with-templates-instantiation ) WITH_TEMPLATE_INSTANTIATION="${option#*=}" ;; --with-templates-instantiation=* ) WITH_TEMPLATE_INSTANTIATION="${option#*=}" ;; --instantiate-long-int=* ) INSTANTIATE_LONG_INT="${option#*=}" ;; --instantiate-int=* ) INSTANTIATE_INT="${option#*=}" ;; --instantiate-long-double=* ) INSTANTIATE_LONG_DOUBLE="${option#*=}" ;; --instantiate-double=* ) INSTANTIATE_DOUBLE="${option#*=}" ;; --instantiate-float=* ) INSTANTIATE_FLOAT="${option#*=}" ;; --full-build ) INSTANTIATE_LONG_INT="yes" --fast-build ) INSTANTIATE_LONG_INT="no" INSTANTIATE_INT="yes" INSTANTIATE_LONG_DOUBLE="yes" INSTANTIATE_LONG_DOUBLE="no" INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="yes";; INSTANTIATE_FLOAT="no" WITH_CUDA_ARCH="auto" ;; --with-cmake=* ) CMAKE="${option#*=}" ;; --build-jobs=* ) BUILD_JOBS="${option#*=}" ;; --cmake-only=* ) CMAKE_ONLY="${option#*=}" ;; Loading
src/core/arrays/tnlArrayOperationsCuda_impl.h +5 −2 Original line number Diff line number Diff line Loading @@ -256,6 +256,9 @@ bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory( const Element1* dest const Element2* source, const Index size ) { /*** * Here, destination is on host and source is on CUDA device. */ tnlAssert( destination, ); tnlAssert( source, ); tnlAssert( size >= 0, cerr << "size = " << size ); Loading @@ -280,7 +283,7 @@ bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory( const Element1* dest delete[] host_buffer; return false; } if( ! tnlArrayOperations< tnlHost >::compareMemory( host_buffer, destination, transfer ) ) if( ! tnlArrayOperations< tnlHost >::compareMemory( &destination[ compared ], host_buffer, transfer ) ) { delete[] host_buffer; return false; Loading
src/core/cuda/CMakeLists.txt +3 −1 Original line number Diff line number Diff line Loading @@ -20,6 +20,7 @@ IF( BUILD_CUDA ) ${CURRENT_DIR}/cuda-reduction-abs-max_impl.cu ${CURRENT_DIR}/cuda-reduction-and_impl.cu ${CURRENT_DIR}/cuda-reduction-or_impl.cu ${CURRENT_DIR}/cuda-reduction-l2-norm_impl.cu ${CURRENT_DIR}/cuda-reduction-lp-norm_impl.cu ${CURRENT_DIR}/cuda-reduction-equalities_impl.cu ${CURRENT_DIR}/cuda-reduction-inequalities_impl.cu Loading @@ -30,6 +31,7 @@ IF( BUILD_CUDA ) ${CURRENT_DIR}/cuda-reduction-diff-abs-sum_impl.cu ${CURRENT_DIR}/cuda-reduction-diff-abs-min_impl.cu ${CURRENT_DIR}/cuda-reduction-diff-abs-max_impl.cu ${CURRENT_DIR}/cuda-reduction-diff-l2-norm_impl.cu ${CURRENT_DIR}/cuda-reduction-diff-lp-norm_impl.cu ${CURRENT_DIR}/cuda-prefix-sum_impl.cu PARENT_SCOPE ) Loading
src/core/cuda/cuda-prefix-sum_impl.cu +8 −8 Original line number Diff line number Diff line Loading @@ -23,7 +23,7 @@ template bool cudaPrefixSum( const int size, const int blockSize, const int *deviceInput, int* deviceOutput, const tnlParallelReductionSum< int, int >& operation, tnlParallelReductionSum< int, int >& operation, const enumPrefixSumType prefixSumType ); Loading @@ -32,7 +32,7 @@ template bool cudaPrefixSum( const int size, const int blockSize, const float *deviceInput, float* deviceOutput, const tnlParallelReductionSum< float, int >& operation, tnlParallelReductionSum< float, int >& operation, const enumPrefixSumType prefixSumType ); #endif Loading @@ -40,7 +40,7 @@ template bool cudaPrefixSum( const int size, const int blockSize, const double *deviceInput, double* deviceOutput, const tnlParallelReductionSum< double, int >& operation, tnlParallelReductionSum< double, int >& operation, const enumPrefixSumType prefixSumType ); #ifdef INSTANTIATE_LONG_DOUBLE Loading @@ -48,7 +48,7 @@ template bool cudaPrefixSum( const int size, const int blockSize, const long double *deviceInput, long double* deviceOutput, const tnlParallelReductionSum< long double, int >& operation, tnlParallelReductionSum< long double, int >& operation, const enumPrefixSumType prefixSumType ); #endif Loading @@ -57,7 +57,7 @@ template bool cudaPrefixSum( const long int size, const long int blockSize, const int *deviceInput, int* deviceOutput, const tnlParallelReductionSum< int, long int >& operation, tnlParallelReductionSum< int, long int >& operation, const enumPrefixSumType prefixSumType ); Loading @@ -66,7 +66,7 @@ template bool cudaPrefixSum( const long int size, const long int blockSize, const float *deviceInput, float* deviceOutput, const tnlParallelReductionSum< float, long int >& operation, tnlParallelReductionSum< float, long int >& operation, const enumPrefixSumType prefixSumType ); #endif Loading @@ -74,7 +74,7 @@ template bool cudaPrefixSum( const long int size, const long int blockSize, const double *deviceInput, double* deviceOutput, const tnlParallelReductionSum< double, long int >& operation, tnlParallelReductionSum< double, long int >& operation, const enumPrefixSumType prefixSumType ); #ifdef INSTANTIATE_LONG_DOUBLE Loading @@ -82,7 +82,7 @@ template bool cudaPrefixSum( const long int size, const long int blockSize, const long double *deviceInput, long double* deviceOutput, const tnlParallelReductionSum< long double, long int >& operation, tnlParallelReductionSum< long double, long int >& operation, const enumPrefixSumType prefixSumType ); #endif #endif Loading
src/core/cuda/cuda-prefix-sum_impl.h +13 −13 Original line number Diff line number Diff line Loading @@ -30,7 +30,7 @@ template< typename DataType, typename Operation, typename Index > __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumType, const Operation operation, Operation operation, const Index size, const Index elementsInBlock, const DataType* input, Loading Loading @@ -159,7 +159,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT template< typename DataType, typename Operation, typename Index > __global__ void cudaSecondPhaseBlockPrefixSum( const Operation operation, __global__ void cudaSecondPhaseBlockPrefixSum( Operation operation, const Index size, const Index elementsInBlock, const Index gridShift, Loading @@ -186,7 +186,7 @@ template< typename DataType, typename Operation, typename Index > bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType, const Operation& operation, Operation& operation, const Index size, const Index blockSize, const Index elementsInBlock, Loading Loading @@ -276,7 +276,7 @@ template< typename DataType, typename Operation, typename Index > bool cudaGridPrefixSum( enumPrefixSumType prefixSumType, const Operation& operation, Operation& operation, const Index size, const Index blockSize, const Index elementsInBlock, Loading Loading @@ -313,7 +313,7 @@ bool cudaPrefixSum( const Index size, const Index blockSize, const DataType *deviceInput, DataType* deviceOutput, const Operation& operation, Operation& operation, const enumPrefixSumType prefixSumType ) { /**** Loading Loading @@ -361,7 +361,7 @@ extern template bool cudaPrefixSum( const int size, const int blockSize, const int *deviceInput, int* deviceOutput, const tnlParallelReductionSum< int, int >& operation, tnlParallelReductionSum< int, int >& operation, const enumPrefixSumType prefixSumType ); Loading @@ -369,14 +369,14 @@ extern template bool cudaPrefixSum( const int size, const int blockSize, const float *deviceInput, float* deviceOutput, const tnlParallelReductionSum< float, int >& operation, tnlParallelReductionSum< float, int >& operation, const enumPrefixSumType prefixSumType ); extern template bool cudaPrefixSum( const int size, const int blockSize, const double *deviceInput, double* deviceOutput, const tnlParallelReductionSum< double, int >& operation, tnlParallelReductionSum< double, int >& operation, const enumPrefixSumType prefixSumType ); #ifdef INSTANTIATE_LONG_DOUBLE Loading @@ -384,7 +384,7 @@ extern template bool cudaPrefixSum( const int size, const int blockSize, const long double *deviceInput, long double* deviceOutput, const tnlParallelReductionSum< long double, int >& operation, tnlParallelReductionSum< long double, int >& operation, const enumPrefixSumType prefixSumType ); #endif Loading @@ -393,7 +393,7 @@ extern template bool cudaPrefixSum( const long int size, const long int blockSize, const int *deviceInput, int* deviceOutput, const tnlParallelReductionSum< int, long int >& operation, tnlParallelReductionSum< int, long int >& operation, const enumPrefixSumType prefixSumType ); Loading @@ -401,14 +401,14 @@ extern template bool cudaPrefixSum( const long int size, const long int blockSize, const float *deviceInput, float* deviceOutput, const tnlParallelReductionSum< float, long int >& operation, tnlParallelReductionSum< float, long int >& operation, const enumPrefixSumType prefixSumType ); extern template bool cudaPrefixSum( const long int size, const long int blockSize, const double *deviceInput, double* deviceOutput, const tnlParallelReductionSum< double, long int >& operation, tnlParallelReductionSum< double, long int >& operation, const enumPrefixSumType prefixSumType ); #ifdef INSTANTIATE_LONG_DOUBLE Loading @@ -416,7 +416,7 @@ extern template bool cudaPrefixSum( const long int size, const long int blockSize, const long double *deviceInput, long double* deviceOutput, const tnlParallelReductionSum< long double, long int >& operation, tnlParallelReductionSum< long double, long int >& operation, const enumPrefixSumType prefixSumType ); #endif #endif Loading