Commit ac2ee07e authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

CUDA prefix-sum: moved gridShift from the first phase to the second phase

parent 1fe62640
Loading
Loading
Loading
Loading
+11 −23
Original line number Original line Diff line number Diff line
@@ -34,8 +34,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
                              const Index elementsInBlock,
                              const Index elementsInBlock,
                              const Real* input,
                              const Real* input,
                              Real* output,
                              Real* output,
                              Real* auxArray,
                              Real* auxArray )
                              const Real gridShift )
{
{
   Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >();
   Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >();
   Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ];
   Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ];
@@ -67,8 +66,6 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
         idx += blockDim.x;
         idx += blockDim.x;
      }
      }
   }
   }
   if( blockIdx.x == 0 && threadIdx.x == 0 )
      sharedData[ 0 ] = reduction( sharedData[ 0 ], gridShift );


   /***
   /***
    * Perform the sequential prefix-sum.
    * Perform the sequential prefix-sum.
@@ -150,10 +147,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
   {
   {
      if( prefixSumType == PrefixSumType::Exclusive )
      if( prefixSumType == PrefixSumType::Exclusive )
      {
      {
         Real aux = zero;
         auxArray[ blockIdx.x ] = reduction( sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ],
         aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] );
                                             sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] );
         aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] );
         auxArray[ blockIdx.x ] = aux;
      }
      }
      else
      else
         auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ];
         auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ];
@@ -172,17 +167,15 @@ cudaSecondPhaseBlockPrefixSum( Reduction reduction,
                               Real* data )
                               Real* data )
{
{
   if( blockIdx.x > 0 )
   if( blockIdx.x > 0 )
   {
      gridShift = reduction( gridShift, auxArray[ blockIdx.x - 1 ] );
      const Real shift = auxArray[ blockIdx.x - 1 ];
   const Index readOffset = blockIdx.x * elementsInBlock;
   const Index readOffset = blockIdx.x * elementsInBlock;
   Index readIdx = threadIdx.x;
   Index readIdx = threadIdx.x;
   while( readIdx < elementsInBlock && readOffset + readIdx < size )
   while( readIdx < elementsInBlock && readOffset + readIdx < size )
   {
   {
         data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift );
      data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], gridShift );
      readIdx += blockDim.x;
      readIdx += blockDim.x;
   }
   }
}
}
}


template< PrefixSumType prefixSumType,
template< PrefixSumType prefixSumType,
          typename Real,
          typename Real,
@@ -229,8 +222,7 @@ struct CudaPrefixSumKernelLauncher
           elementsInBlock,
           elementsInBlock,
           input,
           input,
           output,
           output,
           auxArray1.getData(),
           auxArray1.getData() );
           gridShift );
      cudaStreamSynchronize(0);
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
      TNL_CHECK_CUDA_DEVICE;


@@ -264,12 +256,8 @@ struct CudaPrefixSumKernelLauncher
      cudaStreamSynchronize(0);
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
      TNL_CHECK_CUDA_DEVICE;


      cudaMemcpy( &gridShift,
      gridShift = auxArray2.getElement( auxArraySize - 1 );
                  &auxArray2[ auxArraySize - 1 ],
                  sizeof( Real ),
                  cudaMemcpyDeviceToHost );
      //std::cerr << "gridShift = " << gridShift << std::endl;
      //std::cerr << "gridShift = " << gridShift << std::endl;
      TNL_CHECK_CUDA_DEVICE;
   }
   }


   /****
   /****