Commit 27622109 authored by Jakub Klinkovský
Browse files

Merge branch 'master' into mhfem

* master:
  Tuning Ellpack formats.
  Tuning Ellpack format.
  Fixing bug in tnlVector::operator !=.
  Optimizing CUDA L2 norm.
  Adding benchmark for CUDA lp norm.
parents 5e526c7f 994ee56d
Loading
Loading
Loading
Loading
+8 −8
Original line number Diff line number Diff line
@@ -23,7 +23,7 @@ template bool cudaPrefixSum( const int size,
                             const int blockSize,
                             const int *deviceInput,
                             int* deviceOutput,
                             const tnlParallelReductionSum< int, int >& operation,
                             tnlParallelReductionSum< int, int >& operation,
                             const enumPrefixSumType prefixSumType );


@@ -32,7 +32,7 @@ template bool cudaPrefixSum( const int size,
                             const int blockSize,
                             const float *deviceInput,
                             float* deviceOutput,
                             const tnlParallelReductionSum< float, int >& operation,
                             tnlParallelReductionSum< float, int >& operation,
                             const enumPrefixSumType prefixSumType );
#endif

@@ -40,7 +40,7 @@ template bool cudaPrefixSum( const int size,
                             const int blockSize,
                             const double *deviceInput,
                             double* deviceOutput,
                             const tnlParallelReductionSum< double, int >& operation,
                             tnlParallelReductionSum< double, int >& operation,
                             const enumPrefixSumType prefixSumType );

#ifdef INSTANTIATE_LONG_DOUBLE
@@ -48,7 +48,7 @@ template bool cudaPrefixSum( const int size,
                             const int blockSize,
                             const long double *deviceInput,
                             long double* deviceOutput,
                             const tnlParallelReductionSum< long double, int >& operation,
                             tnlParallelReductionSum< long double, int >& operation,
                             const enumPrefixSumType prefixSumType );
#endif

@@ -57,7 +57,7 @@ template bool cudaPrefixSum( const long int size,
                             const long int blockSize,
                             const int *deviceInput,
                             int* deviceOutput,
                             const tnlParallelReductionSum< int, long int >& operation,
                             tnlParallelReductionSum< int, long int >& operation,
                             const enumPrefixSumType prefixSumType );


@@ -66,7 +66,7 @@ template bool cudaPrefixSum( const long int size,
                             const long int blockSize,
                             const float *deviceInput,
                             float* deviceOutput,
                             const tnlParallelReductionSum< float, long int >& operation,
                             tnlParallelReductionSum< float, long int >& operation,
                             const enumPrefixSumType prefixSumType );
#endif

@@ -74,7 +74,7 @@ template bool cudaPrefixSum( const long int size,
                             const long int blockSize,
                             const double *deviceInput,
                             double* deviceOutput,
                             const tnlParallelReductionSum< double, long int >& operation,
                             tnlParallelReductionSum< double, long int >& operation,
                             const enumPrefixSumType prefixSumType );

#ifdef INSTANTIATE_LONG_DOUBLE
@@ -82,7 +82,7 @@ template bool cudaPrefixSum( const long int size,
                             const long int blockSize,
                             const long double *deviceInput,
                             long double* deviceOutput,
                             const tnlParallelReductionSum< long double, long int >& operation,
                             tnlParallelReductionSum< long double, long int >& operation,
                             const enumPrefixSumType prefixSumType );
#endif
#endif 
+13 −13
Original line number Diff line number Diff line
@@ -30,7 +30,7 @@ template< typename DataType,
          typename Operation,
          typename Index >
__global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumType,
                                              const Operation operation,
                                              Operation operation,
                                              const Index size,
                                              const Index elementsInBlock,
                                              const DataType* input,
@@ -159,7 +159,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
template< typename DataType,
          typename Operation,
          typename Index >
__global__ void cudaSecondPhaseBlockPrefixSum( const Operation operation,
__global__ void cudaSecondPhaseBlockPrefixSum( Operation operation,
                                               const Index size,
                                               const Index elementsInBlock,
                                               const Index gridShift,
@@ -186,7 +186,7 @@ template< typename DataType,
          typename Operation,
          typename Index >
bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType,
                             const Operation& operation,
                             Operation& operation,
                             const Index size,
                             const Index blockSize,
                             const Index elementsInBlock,
@@ -276,7 +276,7 @@ template< typename DataType,
          typename Operation,
          typename Index >
bool cudaGridPrefixSum( enumPrefixSumType prefixSumType,
                        const Operation& operation,
                        Operation& operation,
                        const Index size,
                        const Index blockSize,
                        const Index elementsInBlock,
@@ -313,7 +313,7 @@ bool cudaPrefixSum( const Index size,
                    const Index blockSize,
                    const DataType *deviceInput,
                    DataType* deviceOutput,
                    const Operation& operation,
                    Operation& operation,
                    const enumPrefixSumType prefixSumType )
{
   /****
@@ -361,7 +361,7 @@ extern template bool cudaPrefixSum( const int size,
                                    const int blockSize,
                                    const int *deviceInput,
                                    int* deviceOutput,
                                    const tnlParallelReductionSum< int, int >& operation,
                                    tnlParallelReductionSum< int, int >& operation,
                                    const enumPrefixSumType prefixSumType );


@@ -369,14 +369,14 @@ extern template bool cudaPrefixSum( const int size,
                                    const int blockSize,
                                    const float *deviceInput,
                                    float* deviceOutput,
                                    const tnlParallelReductionSum< float, int >& operation,
                                    tnlParallelReductionSum< float, int >& operation,
                                    const enumPrefixSumType prefixSumType );

extern template bool cudaPrefixSum( const int size,
                                    const int blockSize,
                                    const double *deviceInput,
                                    double* deviceOutput,
                                    const tnlParallelReductionSum< double, int >& operation,
                                    tnlParallelReductionSum< double, int >& operation,
                                    const enumPrefixSumType prefixSumType );

#ifdef INSTANTIATE_LONG_DOUBLE
@@ -384,7 +384,7 @@ extern template bool cudaPrefixSum( const int size,
                                    const int blockSize,
                                    const long double *deviceInput,
                                    long double* deviceOutput,
                                    const tnlParallelReductionSum< long double, int >& operation,
                                    tnlParallelReductionSum< long double, int >& operation,
                                    const enumPrefixSumType prefixSumType );
#endif

@@ -393,7 +393,7 @@ extern template bool cudaPrefixSum( const long int size,
                                    const long int blockSize,
                                    const int *deviceInput,
                                    int* deviceOutput,
                                    const tnlParallelReductionSum< int, long int >& operation,
                                    tnlParallelReductionSum< int, long int >& operation,
                                    const enumPrefixSumType prefixSumType );


@@ -401,14 +401,14 @@ extern template bool cudaPrefixSum( const long int size,
                                    const long int blockSize,
                                    const float *deviceInput,
                                    float* deviceOutput,
                                    const tnlParallelReductionSum< float, long int >& operation,
                                    tnlParallelReductionSum< float, long int >& operation,
                                    const enumPrefixSumType prefixSumType );

extern template bool cudaPrefixSum( const long int size,
                                    const long int blockSize,
                                    const double *deviceInput,
                                    double* deviceOutput,
                                    const tnlParallelReductionSum< double, long int >& operation,
                                    tnlParallelReductionSum< double, long int >& operation,
                                    const enumPrefixSumType prefixSumType );

#ifdef INSTANTIATE_LONG_DOUBLE
@@ -416,7 +416,7 @@ extern template bool cudaPrefixSum( const long int size,
                                    const long int blockSize,
                                    const long double *deviceInput,
                                    long double* deviceOutput,
                                    const tnlParallelReductionSum< long double, long int >& operation,
                                    tnlParallelReductionSum< long double, long int >& operation,
                                    const enumPrefixSumType prefixSumType );
#endif
#endif
+10 −10
Original line number Diff line number Diff line
@@ -25,28 +25,28 @@
 */

template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, int > >
                                   ( const tnlParallelReductionAbsMax< char, int >& operation,
                                   ( tnlParallelReductionAbsMax< char, int >& operation,
                                     const typename tnlParallelReductionAbsMax< char, int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMax< char, int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMax< char, int > :: RealType* deviceInput2,
                                     typename tnlParallelReductionAbsMax< char, int > :: ResultType& result );

template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, int > >
                                   ( const tnlParallelReductionAbsMax< int, int >& operation,
                                   ( tnlParallelReductionAbsMax< int, int >& operation,
                                     const typename tnlParallelReductionAbsMax< int, int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMax< int, int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMax< int, int > :: RealType* deviceInput2,
                                     typename tnlParallelReductionAbsMax< int, int > :: ResultType& result );

template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, int > >
                                   ( const tnlParallelReductionAbsMax< float, int >& operation,
                                   ( tnlParallelReductionAbsMax< float, int >& operation,
                                     const typename tnlParallelReductionAbsMax< float, int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMax< float, int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMax< float, int > :: RealType* deviceInput2,
                                     typename tnlParallelReductionAbsMax< float, int> :: ResultType& result );

template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > >
                                   ( const tnlParallelReductionAbsMax< double, int>& operation,
                                   ( tnlParallelReductionAbsMax< double, int>& operation,
                                     const typename tnlParallelReductionAbsMax< double, int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput2,
@@ -54,7 +54,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > >

#ifdef INSTANTIATE_LONG_DOUBLE
template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > >
                                   ( const tnlParallelReductionAbsMax< long double, int>& operation,
                                   ( tnlParallelReductionAbsMax< long double, int>& operation,
                                     const typename tnlParallelReductionAbsMax< long double, int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2,
@@ -62,7 +62,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, in
#endif                                     

template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > >
                                   ( const tnlParallelReductionAbsMax< char, long int >& operation,
                                   ( tnlParallelReductionAbsMax< char, long int >& operation,
                                     const typename tnlParallelReductionAbsMax< char, long int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMax< char, long int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMax< char, long int > :: RealType* deviceInput2,
@@ -70,21 +70,21 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int

#ifdef INSTANTIATE_LONG_INT
template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, long int > >
                                   ( const tnlParallelReductionAbsMax< int, long int >& operation,
                                   ( tnlParallelReductionAbsMax< int, long int >& operation,
                                     const typename tnlParallelReductionAbsMax< int, long int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMax< int, long int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMax< int, long int > :: RealType* deviceInput2,
                                     typename tnlParallelReductionAbsMax< int, long int > :: ResultType& result );

template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, long int > >
                                   ( const tnlParallelReductionAbsMax< float, long int >& operation,
                                   ( tnlParallelReductionAbsMax< float, long int >& operation,
                                     const typename tnlParallelReductionAbsMax< float, long int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMax< float, long int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMax< float, long int > :: RealType* deviceInput2,
                                     typename tnlParallelReductionAbsMax< float, long int> :: ResultType& result );

template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long int > >
                                   ( const tnlParallelReductionAbsMax< double, long int>& operation,
                                   ( tnlParallelReductionAbsMax< double, long int>& operation,
                                     const typename tnlParallelReductionAbsMax< double, long int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput2,
@@ -92,7 +92,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long in

#ifdef INSTANTIATE_LONG_DOUBLE
template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
                                   ( const tnlParallelReductionAbsMax< long double, long int>& operation,
                                   ( tnlParallelReductionAbsMax< long double, long int>& operation,
                                     const typename tnlParallelReductionAbsMax< long double, long int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput2,
+10 −10
Original line number Diff line number Diff line
@@ -25,28 +25,28 @@
 */

template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, int > >
                                   ( const tnlParallelReductionAbsMin< char, int >& operation,
                                   ( tnlParallelReductionAbsMin< char, int >& operation,
                                     const typename tnlParallelReductionAbsMin< char, int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMin< char, int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMin< char, int > :: RealType* deviceInput2,
                                     typename tnlParallelReductionAbsMin< char, int > :: ResultType& result );

template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, int > >
                                   ( const tnlParallelReductionAbsMin< int, int >& operation,
                                   ( tnlParallelReductionAbsMin< int, int >& operation,
                                     const typename tnlParallelReductionAbsMin< int, int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMin< int, int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMin< int, int > :: RealType* deviceInput2,
                                     typename tnlParallelReductionAbsMin< int, int > :: ResultType& result );

template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, int > >
                                   ( const tnlParallelReductionAbsMin< float, int >& operation,
                                   ( tnlParallelReductionAbsMin< float, int >& operation,
                                     const typename tnlParallelReductionAbsMin< float, int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMin< float, int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMin< float, int > :: RealType* deviceInput2,
                                     typename tnlParallelReductionAbsMin< float, int> :: ResultType& result );

template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > >
                                   ( const tnlParallelReductionAbsMin< double, int>& operation,
                                   ( tnlParallelReductionAbsMin< double, int>& operation,
                                     const typename tnlParallelReductionAbsMin< double, int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput2,
@@ -54,7 +54,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > >

#ifdef INSTANTIATE_LONG_DOUBLE
template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > >
                                   ( const tnlParallelReductionAbsMin< long double, int>& operation,
                                   ( tnlParallelReductionAbsMin< long double, int>& operation,
                                     const typename tnlParallelReductionAbsMin< long double, int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput2,
@@ -62,7 +62,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, in
#endif

template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > >
                                   ( const tnlParallelReductionAbsMin< char, long int >& operation,
                                   ( tnlParallelReductionAbsMin< char, long int >& operation,
                                     const typename tnlParallelReductionAbsMin< char, long int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput2,
@@ -70,21 +70,21 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int

#ifdef INSTANTIATE_LONG_INT
template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, long int > >
                                   ( const tnlParallelReductionAbsMin< int, long int >& operation,
                                   ( tnlParallelReductionAbsMin< int, long int >& operation,
                                     const typename tnlParallelReductionAbsMin< int, long int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMin< int, long int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMin< int, long int > :: RealType* deviceInput2,
                                     typename tnlParallelReductionAbsMin< int, long int > :: ResultType& result );

template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, long int > >
                                   ( const tnlParallelReductionAbsMin< float, long int >& operation,
                                   ( tnlParallelReductionAbsMin< float, long int >& operation,
                                     const typename tnlParallelReductionAbsMin< float, long int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMin< float, long int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMin< float, long int > :: RealType* deviceInput2,
                                     typename tnlParallelReductionAbsMin< float, long int> :: ResultType& result );

template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long int > >
                                   ( const tnlParallelReductionAbsMin< double, long int>& operation,
                                   ( tnlParallelReductionAbsMin< double, long int>& operation,
                                     const typename tnlParallelReductionAbsMin< double, long int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput2,
@@ -92,7 +92,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long in

#ifdef INSTANTIATE_LONG_DOUBLE
template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
                                   ( const tnlParallelReductionAbsMin< long double, long int>& operation,
                                   ( tnlParallelReductionAbsMin< long double, long int>& operation,
                                     const typename tnlParallelReductionAbsMin< long double, long int > :: IndexType size,
                                     const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput2,
+10 −10

File changed.

Preview size limit exceeded, changes collapsed.

Loading