From 8b4397ceff15498a75d2457f2904f9fd8c788d1d Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Sat, 23 Mar 2013 15:57:42 +0100
Subject: [PATCH] Refactoring the reduction on the CUDA device.

---
 buildAll                                    |   4 +-
 src/config/CMakeLists.txt                   |   2 +-
 src/core/CMakeLists.txt                     |   3 +-
 src/core/cuda/CMakeLists.txt                |   5 +-
 src/core/cuda/reduction-operations.h        | 313 +++++++++++-
 src/implementation/core/CMakeLists.txt      |   2 +-
 .../core/cuda-long-vector-kernels.h         | 482 ++++++------
 src/implementation/core/cuda/CMakeLists.txt |   2 +-
 src/implementation/core/memory-operations.h |  18 +-
 src/legacy/core/CMakeLists.txt              |  13 +-
 .../core/cuda/tnlCudaReductionTester.h      | 129 ++++-
 11 files changed, 597 insertions(+), 376 deletions(-)

diff --git a/buildAll b/buildAll
index 3318eab029..d553862bd5 100755
--- a/buildAll
+++ b/buildAll
@@ -25,10 +25,10 @@ cd Debug
 ${CMAKE} .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR}
 make -j${CPUS} #VERBOSE=1
 make -j${CPUS} test
-#make -j${CPUS} install
+make -j${CPUS} install
 
 cd ../Release
 ${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR}
 make -j${CPUS} #VERBOSE=1
 make -j${CPUS} test
-#make -j${CPUS} install
+make -j${CPUS} install
diff --git a/src/config/CMakeLists.txt b/src/config/CMakeLists.txt
index f745e98a4c..a1fdbf9582 100755
--- a/src/config/CMakeLists.txt
+++ b/src/config/CMakeLists.txt
@@ -1,5 +1,5 @@
 SET( EXTRA_DIST tnlConfigDescriptionParser.y
-                tnlConfigDescriptionScanner.l
+                tnlConfigDescriptionScanner.l )
 SET( headers tnlConfigDescription.h
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index badb8ca45a..04f786f263 100755
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -23,8 +23,7 @@ set (headers tnlArray.h
              tnlReal.h
              tnlTimerCPU.h
              tnlTimerRT.h
-             tnlTuple.h
-             tnlCudaSupport.h
+             tnlTuple.h
              mfilename.h
              mfuncs.h
              mpi-supp.h
diff --git a/src/core/cuda/CMakeLists.txt b/src/core/cuda/CMakeLists.txt
index af2301ff76..3290b863cf 100755
--- a/src/core/cuda/CMakeLists.txt
+++ b/src/core/cuda/CMakeLists.txt
@@ -1,2 +1,5 @@
 set( headers device-check.h
-   )
+             reduction.h
+             reduction-operations.h )
+
+INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/core/cuda )
\ No newline at end of file
diff --git a/src/core/cuda/reduction-operations.h b/src/core/cuda/reduction-operations.h
index de7488f586..9a6d8a0c4a 100644
--- a/src/core/cuda/reduction-operations.h
+++ b/src/core/cuda/reduction-operations.h
@@ -22,11 +22,7 @@
 #include <cuda.h>
 #include <core/mfuncs.h>
 
-enum tnlTupleOperation { tnlParallelReductionMax,
-                         tnlParallelReductionAbsMin,
-                         tnlParallelReductionAbsMax,
-                         tnlParallelReductionAbsSum,
-                         tnlParallelReductionLpNorm,
+enum tnlTupleOperation { tnlParallelReductionLpNorm,
                          tnlParallelReductionSdot };
 
@@ -109,20 +105,159 @@ class tnlParallelReductionSum
    typedef Real RealType;
    typedef Index IndexType;
 
+   typedef Real ResultType;
+
+   ResultType initialValueOnHost( const IndexType idx,
+                                  const RealType* data1,
+                                  const RealType* data2 ) const
+   {
+      return data1[ idx ];
+   };
+
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
+   {
+      return current + data1[ idx ];
+   };
+
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return data1[ idx1 ] + data1[ idx2 ];
+   };
+
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return data1[ idx1 ];
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const IndexType idx3,
+                                                 const RealType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   {
+      return data1[ idx1 ] + data2[ idx2 ] + data2[ idx3 ];
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const RealType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   {
+      return data1[ idx1 ] + data2[ idx2 ];
+   };
+
+   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
+                                                  const IndexType idx2,
+                                                  const ResultType* data ) const
+   {
+      return data[ idx1 ] + data[ idx2 ];
+   };
+};
+
+template< typename Real, typename Index >
+class tnlParallelReductionMin
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+
+   RealType initialValueOnHost( const IndexType idx,
+                                const RealType* data1,
+                                const RealType* data2 ) const
+   {
+      return data1[ idx ];
+   };
 
-   RealType reduceOnHost( const RealType& data1,
-                          const RealType& data2 ) const
+   RealType reduceOnHost( const IndexType idx,
+                          const RealType& current,
+                          const RealType* data1,
+                          const RealType* data2 ) const
    {
-      return data1 + data2;
+      return Min( current, data1[ idx ] );
+   };
+
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const IndexType idx2,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return tnlCudaMin( data1[ idx1 ], data1[ idx2 ] );
    }
 
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return data1[ idx1 ];
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const IndexType idx3,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return tnlCudaMin( data1[ idx1 ], tnlCudaMin( data2[ idx2 ], data2[ idx3 ] ) );
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] );
+   };
+
+   __device__ RealType commonReductionOnDevice( const IndexType idx1,
+                                                const IndexType idx2,
+                                                const RealType* data ) const
+   {
+      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
+   };
+};
+
+template< typename Real, typename Index >
+class tnlParallelReductionMax
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+
+   RealType initialValueOnHost( const IndexType idx,
+                                const RealType* data1,
+                                const RealType* data2 ) const
+   {
+      return data1[ idx ];
+   };
+
+   RealType reduceOnHost( const IndexType idx,
+                          const RealType& current,
+                          const RealType* data1,
+                          const RealType* data2 ) const
+   {
+      return Max( current, data1[ idx ] );
+   };
+
    __device__ RealType initialValueOnDevice( const IndexType idx1,
                                              const IndexType idx2,
                                              const RealType* data1,
                                              const RealType* data2 ) const
    {
-      return data1[ idx1 ] + data1[ idx2 ];
+      return tnlCudaMax( data1[ idx1 ], data1[ idx2 ] );
    }
 
    __device__ RealType initialValueOnDevice( const IndexType idx1,
@@ -139,7 +274,7 @@ class tnlParallelReductionSum
                                              const RealType* data2,
                                              const RealType* data3 ) const
    {
-      return data1[ idx1 ] + data2[ idx2 ] + data2[ idx3 ];
+      return tnlCudaMax( data1[ idx1 ], tnlCudaMax( data2[ idx2 ], data2[ idx3 ] ) );
    };
 
    __device__ RealType firstReductionOnDevice( const IndexType idx1,
@@ -148,7 +283,72 @@ class tnlParallelReductionSum
                                                const RealType* data2,
                                                const RealType* data3 ) const
    {
-      return data1[ idx1 ] + data2[ idx2 ];
+      return tnlCudaMax( data1[ idx1 ], data2[ idx2 ] );
+   };
+
+   __device__ RealType commonReductionOnDevice( const IndexType idx1,
+                                                const IndexType idx2,
+                                                const RealType* data ) const
+   {
+      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
+   };
+};
+
+template< typename Real, typename Index >
+class tnlParallelReductionAbsSum
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+
+   RealType initialValueOnHost( const IndexType idx,
+                                const RealType* data1,
+                                const RealType* data2 ) const
+   {
+      return tnlAbs( data1[ idx ] );
+   };
+
+   RealType reduceOnHost( const IndexType idx,
+                          const RealType& current,
+                          const RealType* data1,
+                          const RealType* data2 ) const
+   {
+      return current + tnlAbs( data1[ idx ] );
+   };
+
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const IndexType idx2,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return tnlCudaAbs( data1[ idx1 ] ) + tnlCudaAbs( data1[ idx2 ] );
+   };
+
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return tnlCudaAbs( data1[ idx1 ] );
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const IndexType idx3,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] ) + tnlCudaAbs( data2[ idx3 ] );
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] );
    };
 
    __device__ RealType commonReductionOnDevice( const IndexType idx1,
@@ -160,33 +360,41 @@ class tnlParallelReductionSum
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionMin
+class tnlParallelReductionAbsMin
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
 
+   RealType initialValueOnHost( const IndexType idx,
+                                const RealType* data1,
+                                const RealType* data2 ) const
+   {
+      return tnlAbs( data1[ idx ] );
+   };
 
-   RealType reduceOnHost( const RealType& data1,
-                          const RealType& data2 ) const
+   RealType reduceOnHost( const IndexType idx,
+                          const RealType& current,
+                          const RealType* data1,
+                          const RealType* data2 ) const
    {
-      return Min( data1, data2 );
-   }
+      return Min( current, tnlAbs( data1[ idx ] ) );
+   };
 
    __device__ RealType initialValueOnDevice( const IndexType idx1,
                                              const IndexType idx2,
                                              const RealType* data1,
                                              const RealType* data2 ) const
    {
-      return tnlCudaMin( data1[ idx1 ], data1[ idx2 ] );
+      return tnlCudaMin( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
    }
 
    __device__ RealType initialValueOnDevice( const IndexType idx1,
                                              const RealType* data1,
                                              const RealType* data2 ) const
    {
-      return data1[ idx1 ];
+      return tnlCudaAbs( data1[ idx1 ] );
    };
 
    __device__ RealType firstReductionOnDevice( const IndexType idx1,
@@ -196,7 +404,7 @@ class tnlParallelReductionMin
                                                const RealType* data2,
                                                const RealType* data3 ) const
    {
-      return tnlCudaMin( data1[ idx1 ], tnlCudaMin( data2[ idx2 ], data2[ idx3 ] ) );
+      return tnlCudaMin( data1[ idx1 ], tnlCudaMin( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) );
    };
 
    __device__ RealType firstReductionOnDevice( const IndexType idx1,
@@ -205,18 +413,81 @@ class tnlParallelReductionMin
                                                const RealType* data2,
                                                const RealType* data3 ) const
    {
-      return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] );
+      return tnlCudaMin( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
    };
 
    __device__ RealType commonReductionOnDevice( const IndexType idx1,
                                                 const IndexType idx2,
                                                 const RealType* data ) const
    {
-      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
+      return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
    };
 };
 
+template< typename Real, typename Index >
+class tnlParallelReductionAbsMax
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+
+   RealType initialValueOnHost( const IndexType idx,
+                                const RealType* data1,
+                                const RealType* data2 ) const
+   {
+      return tnlAbs( data1[ idx ] );
+   };
+
+   RealType reduceOnHost( const IndexType idx,
+                          const RealType& current,
+                          const RealType* data1,
+                          const RealType* data2 ) const
+   {
+      return Max( current, tnlAbs( data1[ idx ] ) );
+   };
+
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const IndexType idx2,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return tnlCudaMax( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
+   }
+
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return tnlCudaAbs( data1[ idx1 ] );
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const IndexType idx3,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return tnlCudaMax( data1[ idx1 ], tnlCudaMax( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) );
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return tnlCudaMax( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
+   };
+
+   __device__ RealType commonReductionOnDevice( const IndexType idx1,
+                                                const IndexType idx2,
+                                                const RealType* data ) const
+   {
+      return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
+   };
+};
 
 #include <implementation/core/cuda/reduction-operations_impl.h>
diff --git a/src/implementation/core/CMakeLists.txt b/src/implementation/core/CMakeLists.txt
index 10e9934615..d137f42ade 100755
--- a/src/implementation/core/CMakeLists.txt
+++ b/src/implementation/core/CMakeLists.txt
@@ -2,7 +2,7 @@ ADD_SUBDIRECTORY( cuda )
 
 SET( headers cuda-long-vector-kernels.h
              vector-operations.h
-             memory-functions.h
+             memory-operations.h
              tnlArray_impl.h
              tnlHost_impl.h
              tnlLogger_impl.h
diff --git a/src/implementation/core/cuda-long-vector-kernels.h b/src/implementation/core/cuda-long-vector-kernels.h
index c9f49847ff..41c2cc8039 100644
--- a/src/implementation/core/cuda-long-vector-kernels.h
+++ b/src/implementation/core/cuda-long-vector-kernels.h
@@ -52,13 +52,7 @@ __device__ void reduceAligned( const Operation& operation,
    if( tid < s )
    {
       sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata );
-      /*if( operation == tnlParallelReductionMin )
-         sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
-      if( operation == tnlParallelReductionMax )
-         sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
-      if( operation == tnlParallelReductionSum )
-         sdata[ tid ] += sdata[ tid + s ];
-      if( operation == tnlParallelReductionAbsMin )
+      /*if( operation == tnlParallelReductionAbsMin )
         sdata[ tid ] = tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
       if( operation == tnlParallelReductionAbsMax )
         sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
@@ -84,13 +78,7 @@ __device__ void reduceNonAligned( const Operation& operation,
    if( tid < s )
    {
      sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata );
-      /*if( operation == tnlParallelReductionMin )
-         sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
-      if( operation == tnlParallelReductionMax )
-         sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
-      if( operation == tnlParallelReductionSum )
-         sdata[ tid ] += sdata[ tid + s ];
-      if( operation == tnlParallelReductionAbsMin )
+      /*if( operation == tnlParallelReductionAbsMin )
        sdata[ tid ] = tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
      if( operation == tnlParallelReductionAbsMax )
        sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
@@ -106,13 +94,7 @@ __device__ void reduceNonAligned( const Operation& operation,
    if( 2 * s < n && tid == n - 1 )
    {
      sdata[ 0 ] = operation. commonReductionOnDevice( 0, tid, sdata );
-      /*if( operation == tnlParallelReductionMin )
-         sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
-      if( operation == tnlParallelReductionMax )
-         sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
-      if( operation == tnlParallelReductionSum )
-         sdata[ 0 ] += sdata[ tid ];
-      if( operation == tnlParallelReductionAbsMin )
+      /*if( operation == tnlParallelReductionAbsMin )
        sdata[ 0 ] = tnlCudaMin( tnlCudaAbs( sdata[ 0] ), tnlCudaAbs( sdata[ tid + s ] ) );
      if( operation == tnlParallelReductionAbsMax )
        sdata[ 0 ] = tnlCudaMax( tnlCudaAbs( sdata[ 0 ] ), tnlCudaAbs( sdata[ tid + s ] ) );
@@ -357,7 +339,6 @@ __global__ void tnlCUDAReductionKernel( const Operation operation,
       reduceNonAligned( operation, tid, s, n, sdata );
       n = s;
    }
-
 }
 
 /***
@@ -367,275 +348,160 @@ __global__ void tnlCUDAReductionKernel( const Operation operation,
       deviceOutput[ blockIdx. x ] = sdata[ 0 ];
 }
 
-#endif
-
-/***
- * The template calling the final CUDA kernel for the single vector reduction.
- * The template parameters are:
- * @param T is the type of data we want to reduce
- * @param operation is the operation reducing the data.
- *        It can be tnlParallelReductionSum, tnlParallelReductionMin or tnlParallelReductionMax.
- * The function parameters:
- * @param size tells number of elements in the data array.
- * @param deviceInput1 is the pointer to an array storing the data we want
- *        to reduce. This array must stay on the device!.
- * @param deviceInput2 is the pointer to an array storing the coupling data for example
- *        the second vector for the SDOT operation. This array must stay on the device!.
- * @param result will contain the result of the reduction if everything was ok
- *        and the return code is true.
- * @param parameter can be used for example for the passing the parameter p of Lp norm.
- * @param deviceAux is auxiliary array used to store temporary data during the reduction.
- *        If one calls this function more then once one might provide this array to avoid repetetive
- *        allocation of this array on the device inside of this function.
- *        The size of this array should be size / 128 * sizeof( T ).
- */
 template< typename Operation >
-bool tnlCUDALongVectorReduction( const Operation& operation,
-                                 const typename Operation :: IndexType size,
-                                 const typename Operation :: RealType* deviceInput1,
-                                 const typename Operation :: RealType* deviceInput2,
-                                 typename Operation :: RealType& result,
-                                 typename Operation :: RealType* deviceAux = 0 )
+typename Operation :: IndexType reduceOnCudaDevice( const Operation& operation,
+                                                    const typename Operation :: IndexType size,
+                                                    const typename Operation :: RealType* input1,
+                                                    const typename Operation :: RealType* input2,
+                                                    typename Operation :: RealType*& output)
 {
-#ifdef HAVE_CUDA
-
    typedef typename Operation :: IndexType IndexType;
    typedef typename Operation :: RealType RealType;
 
-   /***
-    * Set parameters:
-    * @param desBlockSize is desired block size with which we get the best performance (on CUDA rach 1.0 to 1.3)
-    * @param desGridSize is desired grid size
-    */
+
    const int desBlockSize = 512;
    const int desGridSize = 2048;
-
-   RealType* dbg_array1; // debuging array
+   dim3 blockSize( 0 ), gridSize( 0 );
 
    /***
-    * Allocating auxiliary device memory to store temporary reduced arrays.
-    * For example in the first iteration we reduce the number of elements
-    * from size to size / 2. We store this new data in deviceAux array.
-    * If one calls the CUDA reduction more then once then one can provide
-    * auxiliary array by passing it via the parameter deviceAux.
+    * Compute the CUDA block size aligned to the power of two.
    */
-   RealType* myDeviceAux( 0 );
-   if( ! deviceAux )
-   {
-      if( ! allocateMemoryCuda( myDeviceAux, :: Max( 1, size / desBlockSize ) ) )
-      {
-         checkCudaDevice;
+   blockSize. x = :: Min( size, desBlockSize );
+   IndexType alignedBlockSize = 1;
+   while( alignedBlockSize < blockSize. x ) alignedBlockSize <<= 1;
+   blockSize. x = alignedBlockSize;
+
+   gridSize. x = Min( ( IndexType ) ( size / blockSize. x + 1 ) / 2, desGridSize );
+
+   if( ! output &&
+       ! allocateMemoryCuda( output, :: Max( 1, size / desBlockSize ) ) )
         return false;
-      }
-      deviceAux = myDeviceAux;
-   }
+
+   IndexType shmem = blockSize. x * sizeof( RealType );
    /***
-    * Setup parameters of the kernel:
-    * @param sizeReduced is the size of reduced data after each step of parallel reduction
-    * @param reductionInput tells what data we shell reduce. We start with the input if this fuction
-    * and after the 1st reduction step we switch this pointer to deviceAux.
+    * Depending on the blockSize we generate appropriate template instance.
-
     */
-#ifdef UNDEF
-   if( reductionSteps > 0 &&
-       ( operation == tnlParallelReductionSdot ||
-         operation == tnlParallelReductionLpNorm ) )
+   switch( blockSize. x )
    {
-      /***
-       * For operations like SDOT or LpNorm we need to switch to tnlParallelReductionSum after the
-       * first reduction step.
-       */
-      switch( blockSize. x )
-      {
-         case 512:
-            tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 512 >
-            <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-            break;
-         case 256:
-            tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 256 >
-            <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-            break;
-         case 128:
-            tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 128 >
-            <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-            break;
-         case 64:
-            tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 64 >
-            <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-            break;
-         case 32:
-            tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 32 >
-            <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-            break;
-         case 16:
-            tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 16 >
-            <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-            break;
-         case 8:
-            tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 8 >
-            <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-            break;
-         case 4:
-            tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 4 >
-            <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-            break;
-         case 2:
-            tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 2 >
-            <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-            break;
-         case 1:
-            tnlAssert( false, cerr << "blockSize should not be 1." << endl );
-            break;
-         default:
-            tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-            break;
-      }
-   }
-   else
+      case 512:
+         tnlCUDAReductionKernel< Operation, 512 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 256:
+         tnlCUDAReductionKernel< Operation, 256 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 128:
+         tnlCUDAReductionKernel< Operation, 128 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 64:
+         tnlCUDAReductionKernel< Operation, 64 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 32:
+         tnlCUDAReductionKernel< Operation, 32 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 16:
+         tnlCUDAReductionKernel< Operation, 16 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 8:
+         tnlCUDAReductionKernel< Operation, 8 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 4:
+         tnlCUDAReductionKernel< Operation, 4 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 2:
+         tnlCUDAReductionKernel< Operation, 2 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 1:
+         tnlAssert( false, cerr << "blockSize should not be 1." << endl );
+         break;
+      default:
+         tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+         break;
+   }
+   return gridSize. x;
+}
 #endif
-      switch( blockSize. x )
-      {
-         case 512:
-            tnlCUDAReductionKernel< Operation, 512 >
-            <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-            break;
-         case 256:
-            tnlCUDAReductionKernel< Operation, 256 >
-            <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-            break;
-         case 128:
-            tnlCUDAReductionKernel< Operation, 128 >
-            <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-            break;
-         case 64:
-            tnlCUDAReductionKernel< Operation, 64 >
-            <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-            break;
-         case 32:
-            tnlCUDAReductionKernel< Operation, 32 >
-            <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-            break;
-         case 16:
-            tnlCUDAReductionKernel< Operation, 16 >
-            <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-            break;
-         case 8:
-            tnlCUDAReductionKernel< Operation, 8 >
-            <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-            break;
-         case 4:
-            tnlCUDAReductionKernel< Operation, 4 >
-            <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-            break;
-         case 2:
-            tnlCUDAReductionKernel< Operation, 2 >
-            <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-            break;
-         case 1:
-            tnlAssert( false, cerr << "blockSize should not be 1." << endl );
-            break;
-         default:
-            tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-            break;
-      }
-      sizeReduced = gridSize. x;
-      reductionInput1 = deviceAux;
-      reductionSteps ++;
-   }
-
-   /***
-    * We transfer reduced data from device to host.
-    * If sizeReduced equals size the previous loop was not processed and we read
-    * data directly from the input.
+template< typename Operation >
+bool tnlCUDALongVectorReduction( const Operation& operation,
+                                 const typename Operation :: IndexType size,
+                                 const typename Operation :: RealType* deviceInput1,
+                                 const typename Operation :: RealType* deviceInput2,
+                                 typename Operation :: RealType& result )
+{
+#ifdef HAVE_CUDA
+
+   typedef typename Operation :: IndexType IndexType;
+   typedef typename Operation :: RealType RealType;
+
+   /****
+    * First check if the input array(s) is/are large enough for the reduction on GPU.
+    * Otherwise copy it/them to host and reduce on CPU.
     */
-   RealType result_array[ maxGPUReductionDataSize ];
-   RealType result_array2[ maxGPUReductionDataSize ];
-   if( sizeReduced == size )
+   RealType hostArray1[ maxGPUReductionDataSize ];
+   RealType hostArray2[ maxGPUReductionDataSize ];
+   if( size <= maxGPUReductionDataSize )
    {
-      if( ! copyMemoryCudaToHost( result_array, deviceInput1, sizeReduced ) )
+      if( ! copyMemoryCudaToHost( hostArray1, deviceInput1, size ) )
         return false;
-#ifdef UNDEF
-      switch( operation )
-      {
-         case tnlParallelReductionLpNorm:
-            result = pow( tnlAbs( result_array[ 0 ] ), parameter );
-            for( Index i = 1; i < sizeReduced; i ++ )
-               result += pow( tnlAbs( result_array[ i ] ), parameter );
-            result = pow( result, 1.0/ parameter );
-            return true;
-         case tnlParallelReductionSdot:
-            if( ! copyMemoryCudaToHost( result_array2, deviceInput2, sizeReduced ) )
-               return false;
-            else
-            {
-               result = 0;
-               for( Index i = 0; i < sizeReduced; i ++ )
-                  result += result_array[ i ] * result_array2[ i ] ;
-               return true;
-            }
-      }
-#endif
-   }
-   else
+      if( deviceInput2 && ! copyMemoryCudaToHost( hostArray2, deviceInput2, size ) )
-      if( ! copyMemoryCudaToHost( result_array, deviceAux, sizeReduced ) )
         return false;
-   result = result_array[ 0 ];
-   for( IndexType i = 1; i < sizeReduced; i ++ )
-      result = operation. reduceOnHost( result, result_array[ i ] );
-   /*switch( operation )
-   {
-      case tnlParallelReductionMax:
-         result = result_array[ 0 ];
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result = Max( result, result_array[ i ] );
-         break;
-      case tnlParallelReductionMin:
-         result = result_array[ 0 ];
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result = Min( result, result_array[ i ] );
-         break;
-      case tnlParallelReductionSum:
-      case tnlParallelReductionLpNorm:
-      case tnlParallelReductionSdot:
-         result = result_array[ 0 ];
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result += result_array[ i ];
-         break;
-      case tnlParallelReductionAbsMax:
-         result = tnlAbs( result_array[ 0 ] );
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result = Max( result, tnlAbs( result_array[ i ] ) );
-         break;
-      case tnlParallelReductionAbsMin:
-         result = tnlAbs( result_array[ 0 ] );
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result = Min( result, tnlAbs( result_array[ i ] ) );
-         break;
-   }*/
+      result = operation. initialValueOnHost( 0, hostArray1, hostArray2 );
+      for( IndexType i = 1; i < size; i ++ )
+         result = operation. reduceOnHost( i, result, hostArray1, hostArray2 );
+      return true;
+   }
+
+   /****
+    * Reduce the data on the CUDA device.
+    */
+   RealType* deviceAux1( 0 ), *deviceAux2( 0 );
+   IndexType reducedSize = reduceOnCudaDevice( operation,
+                                               size,
+                                               deviceInput1,
+                                               deviceInput2,
+                                               deviceAux1 );
+
+   while( reducedSize > maxGPUReductionDataSize )
    {
-      freeMemoryCuda( myDeviceAux );
-      if( ! checkCudaDevice )
-         return false;
+      reducedSize = reduceOnCudaDevice( operation,
+                                        reducedSize,
+                                        deviceAux1,
+                                        ( RealType* ) 0,
+                                        deviceAux2 );
+      Swap( deviceAux1, deviceAux2 );
    }
+
+   /***
+    * Transfer the reduced data from device to host.
+    */
+   RealType resultArray[ maxGPUReductionDataSize ];
+   if( ! copyMemoryCudaToHost( resultArray, deviceAux1, reducedSize ) )
+      return false;
+
+   /***
+    * Reduce the data on the host system.
+    */
+   result = operation. initialValueOnHost( 0, resultArray, ( RealType* ) 0 );
+   for( IndexType i = 1; i < reducedSize; i ++ )
+      result = operation. reduceOnHost( i, result, resultArray, ( RealType*) 0 );
+
+   /****
+    * Free the memory allocated on the device.
+    */
+   if( deviceAux1 && ! freeMemoryCuda( deviceAux1 ) )
+      return false;
+   if( deviceAux2 && ! freeMemoryCuda( deviceAux2 ) )
+      return false;
+
+   return true;
 #else
    cerr << "I am sorry but CUDA support is missing on this system " << __FILE__
        << " line " << __LINE__ << "." << endl;
@@ -643,52 +509,34 @@ bool tnlCUDALongVectorReduction( const Operation& operation,
 #endif
 };
 
-#ifdef HAVE_CUDA
-/***
- * This kernel just compares two vectors element by element. It writes
- * the result of the comparison into array result. This array must be
- * then reduced.
- */
-template< typename Real, typename Index >
-__global__ void compareTwoVectorsElementwise( const Index size,
-                                              const Real* vector1,
-                                              const Real* vector2,
-                                              bool* result )
-{
-   Index gid = blockDim. x * blockIdx. x + threadIdx. x;
-   if( gid < size )
-   {
-      if( vector1[ gid ] == vector2[ gid ] )
-         result[ gid ] = true;
-      else
-         result[ gid ] = false;
-   }
-}
-#endif
-
-/***
- * The template for comparison of two long vectors on the CUDA device.
- * The template parameters are:
- * @param T is the type of data we want to reduce
- * @param operation is the operation reducing the data.
- *        It can be tnlParallelReductionSum, tnlParallelReductionMin or tnlParallelReductionMax.
- * The function parameters:
- * @param size tells number of elements in the data array.
- * @param deviceInput1 is the pointer to an array storing the data we want
- *        to reduce. This array must stay on the device!.
- * @param deviceInput2 is the pointer to an array storing the coupling data for example
- *        the second vector for the SDOT operation. This array most stay on the device!.
- * @param result will contain the result of the reduction if everything was ok
- *        and the return code is true.
- * @param deviceAux is auxiliary array used to store temporary data during the reduction.
- *        If one calls this function more then once one might provide this array to avoid repetetive
- *        allocation of this array on the device inside of this function.
- *        The size of this array should be size / 128 * sizeof( T ).
- *
- * This function first calls kernel which compares each couples of elements from both vectors.
- * Result is written into a bool array. The minimum value then says if both vectors equal.
- *
- */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 // TODO: result of comparison should not be returned!!!
 template< typename Type, typename Index >
 bool tnlCUDALongVectorComparison( const Index size,
@@ -714,10 +562,10 @@ bool tnlCUDALongVectorComparison( const Index size,
    blockSize. x = 256;
    gridSize. x = size / blockSize. x + 1;
 
-   compareTwoVectorsElementwise<<< gridSize, blockSize >>>( size,
-                                                            deviceInput1,
-                                                            deviceInput2,
-                                                            deviceBoolAux );
+   //compareTwoVectorsElementwise<<< gridSize, blockSize >>>( size,
+   //                                                         deviceInput1,
+   //                                                         deviceInput2,
+   //                                                         deviceBoolAux );
   if( ! checkCudaDevice )
      return false;
   bool result;
diff --git a/src/implementation/core/cuda/CMakeLists.txt b/src/implementation/core/cuda/CMakeLists.txt
index 5e9be45d30..6d119a124f 100755
--- a/src/implementation/core/cuda/CMakeLists.txt
+++ b/src/implementation/core/cuda/CMakeLists.txt
@@ -13,5 +13,5 @@ set( tnl_implementation_core_cuda_SOURCES
      ${CURRENT_DIR}/device-check.cpp
      PARENT_SCOPE )
 
-INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/implementation )
+INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/implementation/core/cuda )
diff --git a/src/implementation/core/memory-operations.h b/src/implementation/core/memory-operations.h
index 6a645b7560..759ca067c4 100644
--- a/src/implementation/core/memory-operations.h
+++ b/src/implementation/core/memory-operations.h
@@ -129,15 +129,16 @@ bool copyMemoryHostToCuda( Element* destination,
                           const Index size )
 {
 #ifdef HAVE_CUDA
-   if( cudaMemcpy( destination,
-                   source,
-                   size * sizeof( Element ),
-                   cudaMemcpyHostToDevice ) != cudaSuccess )
+   cudaMemcpy( destination,
+               source,
+               size * sizeof( Element ),
+               cudaMemcpyHostToDevice );
+   if( ! checkCudaDevice )
    {
      cerr << "Transfer of data from host to CUDA device failed." << endl;
      return false;
    }
-   return checkCudaDevice;
+   return true;
 #else
    cerr << "CUDA support is missing in this system." << endl;
    return false;
@@ -155,7 +156,12 @@ bool copyMemoryCudaToHost( Element* destination,
                  source,
                  size * sizeof( Element ),
                  cudaMemcpyDeviceToHost );
-   return checkCudaDevice;
+   if( ! checkCudaDevice )
+   {
+      cerr << "Transfer of data from CUDA device to host failed." << endl;
+      return false;
+   }
+   return true;
 #else
    cerr << "CUDA support is missing in this system." << endl;
    return false;
diff --git a/src/legacy/core/CMakeLists.txt b/src/legacy/core/CMakeLists.txt
index 3a8d36c743..247d104fad 100755
--- a/src/legacy/core/CMakeLists.txt
+++ b/src/legacy/core/CMakeLists.txt
@@ -1,13 +1,4 @@
-SET( headers tnlBICGSolver.h
-             tnlBICGStabSolver.h
-             tnlCGSolver.h
-             tnlGMRESSolverOld.h
-             tnlILUPreconditioner.h
-             tnlMatrixSolver.h
-             tnlPETSCPreconditioner.h
-             tnlPETSCSolver.h
-             tnlPreconditioner.h
-             tnlSORSolver.h
+SET( headers tnlCudaSupport.h
    )
 
-INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/legacy/solvers )
+INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/legacy/core )
diff --git a/tests/unit-tests/core/cuda/tnlCudaReductionTester.h b/tests/unit-tests/core/cuda/tnlCudaReductionTester.h
index 8a0519885c..ee8dbd83a5 100644
--- a/tests/unit-tests/core/cuda/tnlCudaReductionTester.h
+++ b/tests/unit-tests/core/cuda/tnlCudaReductionTester.h
@@ -41,10 +41,16 @@ class tnlCudaReductionTester : public CppUnit :: TestCase
      CppUnit :: TestResult result;
 
      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
-                               "constantSequenceTest",
-                               &tnlCudaReductionTester :: constantSequenceTest< float > )
+                               "shortConstantSequenceTest",
+                               &tnlCudaReductionTester :: shortConstantSequenceTest< float > )
                              );
+     suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
+                               "longConstantSequenceTest",
+                               &tnlCudaReductionTester :: longConstantSequenceTest< float > )
+                             );
+
      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
                                "linearSequenceTest",
                                &tnlCudaReductionTester :: linearSequenceTest< float > )
@@ -55,28 +61,106 @@ class tnlCudaReductionTester : public CppUnit :: TestCase
    }
 
    template< typename RealType >
-   void constantSequenceTest()
+   void setConstantSequence( const int size,
+                             const RealType& value,
+                             RealType*& hostData,
+                             RealType*& deviceData )
    {
-      const int size( 1024 );
-      RealType *hostData, *deviceData;
-      allocateMemoryHost( hostData, size );
-      allocateMemoryCuda( deviceData, size );
-      CPPUNIT_ASSERT( checkCudaDevice );
       for( int i = 0; i < size; i ++ )
-         hostData[ i ] = 1;
+         hostData[ i ] = value;
      copyMemoryHostToCuda( deviceData, hostData, size );
      CPPUNIT_ASSERT( checkCudaDevice );
+   }
+
+   template< typename RealType >
+   void shortConstantSequenceTest()
+   {
+      const int shortSequence( 128 );
+      const int longSequence( 8192 );
+      RealType *hostData, *deviceData;
+      allocateMemoryHost( hostData, shortSequence );
+      allocateMemoryCuda( deviceData, shortSequence );
+      CPPUNIT_ASSERT( checkCudaDevice );
+
+      setConstantSequence( shortSequence, ( RealType ) -1, hostData, deviceData );
+      RealType result;
+
+      tnlParallelReductionSum< RealType, int > sumOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( sumOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -shortSequence );
+
+      tnlParallelReductionMin< RealType, int > minOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( minOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -1 );
+
+      tnlParallelReductionMax< RealType, int > maxOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( maxOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -1 );
+
+      tnlParallelReductionAbsSum< RealType, int > absSumOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absSumOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == shortSequence );
+
+      tnlParallelReductionAbsMin< RealType, int > absMinOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absMinOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 1 );
+
+      tnlParallelReductionAbsMax< RealType, int > absMaxOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absMaxOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 1 );
+
+      freeMemoryHost( hostData );
+      freeMemoryCuda( deviceData );
+      CPPUNIT_ASSERT( checkCudaDevice );
+   };
+
+   template< typename RealType >
+   void longConstantSequenceTest()
+   {
+      const int longSequence( 8192 );
+      RealType *hostData, *deviceData;
+      allocateMemoryHost( hostData, longSequence );
+      allocateMemoryCuda( deviceData, longSequence );
+      CPPUNIT_ASSERT( checkCudaDevice );
+
+      setConstantSequence( longSequence, ( RealType ) -1, hostData, deviceData );
       RealType result;
+
       tnlParallelReductionSum< RealType, int > sumOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( sumOperation, size, deviceData, ( RealType* ) 0, result ) ) );
-      CPPUNIT_ASSERT( result == size );
+         ( tnlCUDALongVectorReduction( sumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -longSequence );
 
       tnlParallelReductionMin< RealType, int > minOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( minOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+         ( tnlCUDALongVectorReduction( minOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -1 );
+
+      tnlParallelReductionMax< RealType, int > maxOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( maxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -1 );
 
+      tnlParallelReductionAbsSum< RealType, int > absSumOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absSumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == longSequence );
+
+      tnlParallelReductionAbsMin< RealType, int > absMinOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absMinOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == 1 );
 
+      tnlParallelReductionAbsMax< RealType, int > absMaxOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absMaxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 1 );
 
       freeMemoryHost( hostData );
       freeMemoryCuda( deviceData );
@@ -95,7 +179,7 @@ class tnlCudaReductionTester : public CppUnit :: TestCase
       RealType sum( 0.0 );
       for( int i = 0; i < size; i ++ )
       {
-         hostData[ i ] = i + 1;
+         hostData[ i ] = -i - 1;
          sum += hostData[ i ];
       }
       copyMemoryHostToCuda( deviceData, hostData, size );
@@ -108,8 +192,27 @@ class tnlCudaReductionTester : public CppUnit :: TestCase
       tnlParallelReductionMin< RealType, int > minOperation;
       CPPUNIT_ASSERT(
          ( tnlCUDALongVectorReduction( minOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -size );
+
+      tnlParallelReductionMax< RealType, int > maxOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( maxOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -1 );
+
+      tnlParallelReductionAbsSum< RealType, int > absSumOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absSumOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == tnlAbs( sum ) );
+
+      tnlParallelReductionAbsMin< RealType, int > absMinOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absMinOperation, size, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == 1 );
 
+      tnlParallelReductionAbsMax< RealType, int > absMaxOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absMaxOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == size );
 
       freeMemoryHost( hostData );
       freeMemoryCuda( deviceData );
-- 
GitLab
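
A minimal usage sketch of the operation-object interface introduced by this patch. The helper function sumOnDevice below is hypothetical and for illustration only; the include paths are assumed from the file locations touched above, and a build with HAVE_CUDA defined is assumed. Everything else it calls (allocateMemoryCuda, copyMemoryHostToCuda, freeMemoryCuda, tnlParallelReductionSum, tnlCUDALongVectorReduction) is taken directly from the code in this commit.

   // Hypothetical helper, not part of the patch: sums a device-resident
   // vector through the refactored reduction interface.
   #include <core/cuda/reduction-operations.h>                // assumed path
   #include <implementation/core/cuda-long-vector-kernels.h>  // assumed path
   #include <implementation/core/memory-operations.h>         // assumed path

   bool sumOnDevice( const float* hostData, const int size, float& result )
   {
      // Allocate device memory and upload the data; both helpers are the
      // ones this patch touches in memory-operations.h and return false
      // on failure.
      float* deviceData( 0 );
      if( ! allocateMemoryCuda( deviceData, size ) )
         return false;
      if( ! copyMemoryHostToCuda( deviceData, hostData, size ) )
      {
         freeMemoryCuda( deviceData );
         return false;
      }

      // The operation object bundles the host and device reduction steps:
      // the *OnDevice methods are compiled into the kernels, while
      // initialValueOnHost/reduceOnHost finish the last partial results on
      // the host after tnlCUDALongVectorReduction's multi-pass device phase.
      tnlParallelReductionSum< float, int > sumOperation;
      bool ok = tnlCUDALongVectorReduction( sumOperation,
                                            size,
                                            deviceData,
                                            ( float* ) 0,
                                            result );
      freeMemoryCuda( deviceData );
      return ok;
   }

The same pattern applies to the other operation classes added above (tnlParallelReductionMin, tnlParallelReductionMax, tnlParallelReductionAbsSum, tnlParallelReductionAbsMin, tnlParallelReductionAbsMax): each provides initialValueOnHost, reduceOnHost, two initialValueOnDevice overloads, two firstReductionOnDevice overloads, and commonReductionOnDevice, which is the contract the kernels in cuda-long-vector-kernels.h and both reduction drivers compile against.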