Commit ac305460 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Fixed passing of Arrays by ArrayView.

parent 008601ad
Loading
Loading
Loading
Loading
+5 −5
Original line number Diff line number Diff line
@@ -353,8 +353,8 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,


template < typename Index >
__global__ void GetNeighbours( const TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
        TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY )
__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY )
{
  int i = blockIdx.x * 1024 + threadIdx.x;
  
@@ -389,7 +389,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
        TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
        const Containers::StaticVector< 2, Index > vecLowerOverlaps, 
        const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock )
{
@@ -598,7 +598,7 @@ tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
updateBlocks( InterfaceMapType interfaceMap,
        MeshFunctionType aux,
        MeshFunctionType helpFunc,
        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
        ArrayContainerView BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
{
#pragma omp parallel for schedule( dynamic )
  for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
@@ -769,7 +769,7 @@ template< typename Real,
        typename Index >
void 
tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )
getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY )
{
  int* BlockIterPom; 
  BlockIterPom = new int [numBlockX * numBlockY];
+4 −4
Original line number Diff line number Diff line
@@ -480,8 +480,8 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3


template < typename Index >
__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,
__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
        int numBlockX, int numBlockY, int numBlockZ )
{
  int i = blockIdx.x * 1024 + threadIdx.x;
@@ -520,7 +520,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
        Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps )
{
  int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
@@ -1056,7 +1056,7 @@ template< typename Real,
        typename Index >
void 
tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
{
  int* BlockIterPom; 
  BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ];
+22 −19
Original line number Diff line number Diff line
@@ -62,6 +62,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
    typedef Functions::MeshFunction< MeshType > MeshFunctionType;
    typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
    typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
    using ArrayContainerView = typename ArrayContainer::ViewType;
    typedef Containers::StaticVector< 2, Index > StaticVector;
    
    using MeshPointer = Pointers::SharedPointer<  MeshType >;
@@ -87,15 +88,18 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
            const RealType velocity = 1.0 );
        
// For OpenMP; will be removed.
    void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
    void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY  );
        
    template< int sizeSArray >
    void updateBlocks( const InterfaceMapType& interfaceMap,
            MeshFunctionType& aux,
            MeshFunctionType& helpFunc,
            ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
    void updateBlocks( InterfaceMapType interfaceMap,
            MeshFunctionType aux,
            MeshFunctionType helpFunc,
            ArrayContainerView BlockIterHost, int numThreadsPerBlock );
    
  protected:
    
    void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY  );
   __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[],
           const RealType originalValue, const RealType v );
};

template< typename Real,
@@ -111,6 +115,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
    typedef Functions::MeshFunction< MeshType > MeshFunctionType;
    typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType;
    typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
    using ArrayContainerView = typename ArrayContainer::ViewType;
    typedef Containers::StaticVector< 3, Index > StaticVector;
    using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
    using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
@@ -134,15 +139,15 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
            const RealType velocity = 1.0 );
    
    // For OpenMP; will be removed.
    void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
    void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
    
    template< int sizeSArray >
    void updateBlocks( const InterfaceMapType& interfaceMap,
            const MeshFunctionType& aux,
    void updateBlocks( const InterfaceMapType interfaceMap,
            const MeshFunctionType aux,
            MeshFunctionType& helpFunc,
            ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
            ArrayContainer BlockIterHost, int numThreadsPerBlock );
    
    void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
  protected:
    
    __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[],
           const RealType originalValue, const RealType v );
@@ -180,17 +185,14 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
        TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
        const Containers::StaticVector< 2, Index > vecLowerOverlaps, 
        const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock =0);

template < typename Index >
__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
        TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks );
__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY );

template < typename Index >
__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );


// 3D
@@ -205,10 +207,11 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice );
        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
        Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps );

template < typename Index >
__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
        int numBlockX, int numBlockY, int numBlockZ );
#endif
+44 −241
Original line number Diff line number Diff line
@@ -80,40 +80,9 @@ solve( const MeshPointer& mesh,
  IndexType iteration( 0 );
  InterfaceMapType interfaceMap = *interfaceMapPtr;
  MeshFunctionType aux = *auxPtr;
  aux.template synchronize< Communicator >();
  
  
#ifdef HAVE_MPI
  int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup );
  //printf( "Hello world from rank: %d ", i );
  //Communicators::MpiCommunicator::Request r = Communicators::MpiCommunicator::ISend( auxPtr, 0, 0, Communicators::MpiCommunicator::AllGroup );
  if( i == 1 ) {
    /*for( int k = 0; k < 16*16; k++ )
      aux[ k ] = 10;*/
    printf( "1: mesh x: %d\n", mesh->getDimensions().x() );
    printf( "1: mesh y: %d\n", mesh->getDimensions().y() );
    //aux.save("aux_proc1.tnl");
  }
  if( i == 0 ) {
    printf( "0: mesh x: %d\n", mesh->getDimensions().x() );
    printf( "0: mesh y: %d\n", mesh->getDimensions().y() );
    //aux.save("aux_proc0.tnl");
    /*for( int k = 0; k < mesh->getDimensions().x()*mesh->getDimensions().y(); k++ )
      aux[ k ] = 10;
    for( int k = 0; k < mesh->getDimensions().x(); k++ ){
      for( int l = 0; l < mesh->getDimensions().y(); l++ )
        printf("%f.2\t",aux[ k * 16 + l ] );
    printf("\n");
    }*/
  }
    
  /*bool a = Communicators::MpiCommunicator::IsInitialized();
  if( a )
    printf("Je Init\n");
  else
    printf("Neni Init\n");*/
#endif
  aux.template synchronize< Communicator >(); //synchronize initialized overlaps
  
  std::cout << "Calculating the values ..." << std::endl; 
  while( iteration < this->maxIterations )
  {
    // calculatedBefore indicates whether we calculated in the last pass of the while loop
@@ -290,41 +259,8 @@ solve( const MeshPointer& mesh,
        // Need for calling functions from kernel
        BaseType ptr;
        
        int BlockIterD = 1;
        /*auxPtr = helpFunc;
         
         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
         interfaceMapPtr.template getData< Device >(),
         auxPtr.template getData< Device>(),
         helpFunc.template modifyData< Device>(),
         BlockIterDevice,
         oddEvenBlock.getView() );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         auxPtr = helpFunc;
         
         oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
         
         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
         interfaceMapPtr.template getData< Device >(),
         auxPtr.template getData< Device>(),
         helpFunc.template modifyData< Device>(),
         BlockIterDevice,
         oddEvenBlock.getView() );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         auxPtr = helpFunc;
         
         oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
         
         CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         
         BlockIterD = dBlock.getElement( 0 );*/
        // True if we should calculate again.
        int calculateCudaBlocksAgain = 1;
        
        // Array that identifies which blocks should be calculated.
        // All blocks should calculate in first passage ( setValue(1) )
@@ -343,15 +279,8 @@ solve( const MeshPointer& mesh,
        MeshFunctionPointer helpFunc( mesh );
        helpFunc.template modifyData() = auxPtr.template getData(); 
        
        //int pocBloku = 0;
        Devices::Cuda::synchronizeDevice();
        CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
                interfaceMapPtr.template getData< Device >(),
                auxPtr.template modifyData< Device>(),
                helpFunc.template modifyData< Device>(),
                BlockIterDevice.getView() );
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE;
        // Number of iterations of the while( calculateCudaBlocksAgain ) loop.
        int numIter = 0;
               
        //int oddEvenBlock = 0;
        while( calculateCudaBlocksAgain )
@@ -390,44 +319,16 @@ solve( const MeshPointer& mesh,
          Devices::Cuda::synchronizeDevice();
          CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(),
                  auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(),
                  blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps );
                  blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps );
          cudaDeviceSynchronize();
          TNL_CHECK_CUDA_DEVICE;
          
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE;
        
        
        aux = *auxPtr;
        interfaceMap = *interfaceMapPtr;
#endif
      }

      
/**----------------------MPI-TO-DO---------------------------------------------**/
        
#ifdef HAVE_MPI
        //int i = MPI::GetRank( MPI::AllGroup );
        //TNL::Meshes::DistributedMeshes::DistributedMesh< MeshType > Mesh;
        int neighCount = 0; // should this thread calculate again?
        int calculpom[4] = {0,0,0,0};
        
          if( i == 0 ){
            BlockIterPom1 = BlockIterDevice;
            for( int i =0; i< numBlocksX; i++ ){
              for( int j = 0; j < numBlocksY; j++ )
              {
                std::cout << BlockIterPom1[j*numBlocksX + i];
              }
              std::cout << std::endl;
            }
            std::cout << std::endl;
          }
#endif
          // Switching helpFunc and auxPtr.
          auxPtr.swap( helpFunc );
          
          // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now.
          Devices::Cuda::synchronizeDevice(); 
          GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator, blockCalculationIndicatorHelp, numBlocksX, numBlocksY );
          GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY );
          cudaDeviceSynchronize();
          TNL_CHECK_CUDA_DEVICE;
          blockCalculationIndicator = blockCalculationIndicatorHelp;
@@ -445,46 +346,24 @@ solve( const MeshPointer& mesh,
/**-----------------------------------------------------------------------------------------------------------*/
          numIter ++;
        }
        if( numIter%2  == 1 ){
          auxPtr = helpFunc;
        }
        /*cudaFree( BlockIterDevice );
         cudaFree( dBlock );
         delete BlockIter;*/
        
        if( neigh[1] != -1 )
        if( numIter%2 == 1 ) // Need to check parity for MPI overlaps to synchronize ( otherwise doesnt work )
        {
          req[neighCount] = MPI::ISend( &calculated, 1, neigh[1], 0, MPI::AllGroup ); 
          neighCount++;
          
          
          req[neighCount] = MPI::IRecv( &calculpom[1], 1, neigh[1], 0, MPI::AllGroup );
          neighCount++;
          helpFunc.swap( auxPtr );
          Devices::Cuda::synchronizeDevice();
          cudaDeviceSynchronize();
          TNL_CHECK_CUDA_DEVICE;
        }
        
        if( neigh[2] != -1 )
        {
          req[neighCount] = MPI::ISend( &calculated, 1, neigh[2], 0, MPI::AllGroup );
          neighCount++;
          
          req[neighCount] = MPI::IRecv( &calculpom[2], 1, neigh[2], 0, MPI::AllGroup  );
          neighCount++;
        aux = *auxPtr;
        interfaceMap = *interfaceMapPtr;
#endif
      }

        if( neigh[5] != -1 )
        {
          req[neighCount] = MPI::ISend( &calculated, 1, neigh[5], 0, MPI::AllGroup );
          neighCount++;
      
          req[neighCount] = MPI::IRecv( &calculpom[3], 1, neigh[5], 0, MPI::AllGroup );
          neighCount++;
        }
/**----------------------MPI-TO-DO---------------------------------------------**/        
#ifdef HAVE_MPI
      if( CommunicatorType::isDistributed() ){
        getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh );
       
        MPI::WaitAll(req,neighCount);
#if ForDebug
        printf( "%d: Sending Calculated = %d.\n", i, calculated );
#endif        
        MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR,  MPI::AllGroup );
        aux.template synchronize< Communicator >();
      }
#endif
@@ -518,9 +397,16 @@ setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
#endif
}

template < typename Index >
__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY )



template< typename Real, typename Device, typename Index, 
          typename Communicator, typename Anisotropy >
bool 
FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, 
        MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
        const AnisotropyPointer& anisotropy )
{
  bool calculated = false;
  const MeshType& mesh = aux.getMesh();
@@ -548,97 +434,15 @@ __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, I
  return calculated;
}

/**
 * Block-wise sum reduction of an integer array.
 *
 * Every CUDA block sums the slice of BlockIterDevice assigned to it and
 * thread 0 of the block stores the partial sum into dBlock[ blockIdx.x ].
 *
 * Launch layout: one-dimensional grid and one-dimensional blocks with
 * blockDim.x <= 1024 (the size of the static shared buffer below). The block
 * size does NOT have to be a power of two, so the kernel may also be launched
 * as a single block whose size equals the number of partial sums.
 *
 * Shared memory: 1024 ints, statically allocated.
 *
 * \param BlockIterDevice  input values; entries with index >= nBlocks are never read
 * \param dBlock           output, one partial sum per CUDA block (index blockIdx.x);
 *                         it is written only after the block has finished reading
 *                         its inputs
 * \param nBlocks          number of valid entries in BlockIterDevice
 */
template < typename Index >
__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
        TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks )
{
  const int i = threadIdx.x;
  const int blId = blockIdx.x;
  const int blockSize = blockDim.x;

  __shared__ int sArray[ 1024 ];

  // Each thread loads at most one element; threads past the end of the input
  // contribute zero. Indexing by blockDim.x (instead of a hard-coded 1024)
  // gives the same result for 1024-thread blocks and for single-block
  // launches, and stays correct for any other block size.
  const int gid = blId * blockSize + i;
  sArray[ i ] = ( gid < nBlocks ) ? BlockIterDevice[ gid ] : 0;
  __syncthreads();

  // Tree reduction over the first `active` entries of sArray.
  // In every step the upper part [ half, active ) is folded onto the lower
  // part [ 0, active - half ), so the entries read and the entries written
  // never overlap within one step. The loop bounds depend only on blockDim.x,
  // hence every thread of the block reaches every __syncthreads() - no
  // barrier is placed inside a divergent branch.
  int active = blockSize;
  while( active > 1 )
  {
    const int half = ( active + 1 ) / 2;  // rounds up for odd sizes
    if( i + half < active )
      sArray[ i ] += sArray[ i + half ];
    __syncthreads();
    active = half;
  }

  if( i == 0 )
    dBlock[ blId ] = sArray[ 0 ];
}



template < int sizeSArray, typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
        CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) );
        TNL_CHECK_CUDA_DEVICE;

        CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
        TNL_CHECK_CUDA_DEVICE;
#ifdef HAVE_MPI
template< typename Real, typename Device, typename Index, 
          typename Communicator, typename Anisotropy >
void 
FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
{
  Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
  
@@ -687,4 +491,3 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
              calculateFromNeighbours[2] || calculateFromNeighbours[3];
}
#endif
        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock )
+86 −35

File changed.

Preview size limit exceeded, changes collapsed.