Commit 87a42926 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Optimizing parallel reduction in CUDA.

parent 3aa06ead
Loading
Loading
Loading
Loading
+4 −3
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@ ADD_SUBDIRECTORY( cuda )
ADD_SUBDIRECTORY( vectors )

set (headers tnlAssert.h               
             tnlConstants.h
             tnlCurve.h
      	     tnlCuda.h
             tnlDataElement.h
+2 −2
Original line number Diff line number Diff line
@@ -53,7 +53,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   if( prefixSumType == exclusivePrefixSum )
   {
      if( idx == 0 )
         sharedData[ 0 ] = operation.identity();
         sharedData[ 0 ] = operation.initialValue();
      while( idx < elementsInBlock && blockOffset + idx < size )
      {
         sharedData[ tnlCuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ];
@@ -129,7 +129,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   while( idx < elementsInBlock && blockOffset + idx < size )
   {
      const Index chunkIdx = idx / chunkSize;
      DataType chunkShift( operation.identity() );
      DataType chunkShift( operation.initialValue() );
      if( chunkIdx > 0 )
         chunkShift = auxData[ chunkIdx - 1 ];
      operation.performInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
+261 −254

File changed.

Preview size limit exceeded, changes collapsed.

+448 −4

File changed.

Preview size limit exceeded, changes collapsed.

+11 −0
Original line number Diff line number Diff line
@@ -96,6 +96,17 @@ inline int roundToMultiple( int number, int multiple )
   return multiple*( number/ multiple + ( number % multiple != 0 ) );
}

/****
 * Tests whether x is a power of two (1, 2, 4, 8, ...).
 *
 * Returns true iff x is positive and has exactly one bit set.
 * Zero and negative values yield false.
 *
 * NOTE: the bit test must be parenthesized. In C++ the operator == binds
 * tighter than the bitwise &, so the unparenthesized form
 * x & ( x - 1 ) == 0 is parsed as x & ( ( x - 1 ) == 0 ), which is
 * non-zero only for x == 1.
 */
__cuda_callable__
inline bool isPow2( int x )
{
   // x & ( x - 1 ) clears the lowest set bit; the result is zero exactly
   // when at most one bit was set. The x > 0 guard rules out zero and
   // negative inputs (and avoids evaluating x - 1 for the minimal value).
   return x > 0 && ( ( x & ( x - 1 ) ) == 0 );
}

/****
 * Tests whether x is a power of two (1, 2, 4, 8, ...); long int overload.
 *
 * Returns true iff x is positive and has exactly one bit set.
 * Zero and negative values yield false.
 *
 * NOTE: the bit test must be parenthesized. In C++ the operator == binds
 * tighter than the bitwise &, so the unparenthesized form
 * x & ( x - 1 ) == 0 is parsed as x & ( ( x - 1 ) == 0 ), which is
 * non-zero only for x == 1.
 */
__cuda_callable__
inline bool isPow2( long int x )
{
   // x & ( x - 1 ) clears the lowest set bit; the result is zero exactly
   // when at most one bit was set. The x > 0 guard rules out zero and
   // negative inputs (and avoids evaluating x - 1 for the minimal value).
   return x > 0 && ( ( x & ( x - 1 ) ) == 0 );
}

/*template< typename T >
void swap( T& a, T& b)
Loading