Commit 970e6448 authored by Matouš Fencl's avatar Matouš Fencl Committed by Tomáš Oberhuber
Browse files

FIM method implemented. Neighbours are being found on CPU. 3D parallel method...

FIM method implemented. Neighbours are being found on CPU. 3D parallel method disabled because of Array changes.
parent bf5e6fd6
Loading
Loading
Loading
Loading
+7 −2
Original line number Diff line number Diff line
@@ -113,6 +113,8 @@ T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double
template < typename T1 >
__cuda_callable__ void sortMinims( T1 pom[] );

template < typename Index >
void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY );

#ifdef HAVE_CUDA
template < typename Real, typename Device, typename Index >
@@ -130,8 +132,11 @@ template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
                                      int *BlockIterDevice, int oddEvenBlock);
__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks );
                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );

template < typename Index >
__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks );

/*template < typename Real, typename Device, typename Index >
__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );*/
+125 −74
Original line number Diff line number Diff line
@@ -235,13 +235,6 @@ solve( const MeshPointer& mesh,
      {
         // TODO: CUDA code
#ifdef HAVE_CUDA
          
          Real *dAux;
          cudaMalloc(&dAux, ( mesh->getDimensions().x() * mesh->getDimensions().y() ) * sizeof( Real ) );
          
          
          
          
          const int cudaBlockSize( 16 );
          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
          int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
@@ -250,18 +243,30 @@ solve( const MeshPointer& mesh,
          
          tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
          
          //aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 );
          
          //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) );
          TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter;
          BlockIter.setSize( numBlocksX * numBlocksY );
          BlockIter.setValue( 0 );
          /*int* BlockIter = (int*)malloc( ( numBlocksX * numBlocksY ) * sizeof( int ) );
          for( int i = 0; i < numBlocksX*numBlocksY +1; i++)
              BlockIter[i] = 1;*/
          
          int *BlockIterDevice;
          int BlockIterD = 1;
          
          TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
          BlockIterDevice.setSize( numBlocksX * numBlocksY );
          BlockIterDevice.setValue( 1 );
          /*int *BlockIterDevice;
          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );
          cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/
          
          int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
          int *dBlock;
          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
          int oddEvenBlock = 0;
          
          TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
          dBlock.setSize( nBlocks );
          dBlock.setValue( 0 );
          /*int *dBlock;
          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
          
          while( BlockIterD )
          {
           /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
@@ -270,89 +275,132 @@ solve( const MeshPointer& mesh,
            CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
                                                             interfaceMapPtr.template getData< Device >(),
                                                             auxPtr.template modifyData< Device>(),
                                                             BlockIterDevice,
                                                             oddEvenBlock );
                                                             BlockIterDevice );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
            CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
                                                             interfaceMapPtr.template getData< Device >(),
                                                             auxPtr.template modifyData< Device>(),
                                                             BlockIterDevice,
                                                             oddEvenBlock );
            
            BlockIter = BlockIterDevice;
            //cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost);
            GetNeighbours( BlockIter, numBlocksX, numBlocksY );
            
            BlockIterDevice = BlockIter;
            //cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
            
            
            CudaParallelReduc<<<  nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
            
            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
            
            cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
                                   
            /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
            
          }
          //aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 );
          cudaFree( dAux );
          cudaFree( BlockIterDevice );
          /*cudaFree( BlockIterDevice );
          cudaFree( dBlock );
          delete BlockIter;*/
          cudaDeviceSynchronize();
          
          TNL_CHECK_CUDA_DEVICE;
              
          //aux = *auxPtr;
          //interfaceMap = *interfaceMapPtr;
          aux = *auxPtr;
          interfaceMap = *interfaceMapPtr;
#endif
      }
      iteration++;
   }
   aux.save("aux-final.tnl");
}

#ifdef HAVE_CUDA
/*template < typename Real, typename Device, typename Index >
__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a )
template < typename Index >
void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY )
{
  // Expands the set of active blocks by one ring of 4-neighbours.
  // BlockIter is a numBlockX * numBlockY grid flattened row-major with row
  // length numBlockX (matching the kernels' `blIdy * gridDim.x + blIdx`
  // indexing): nonzero entries mark blocks whose cells changed in the last
  // sweep. On return, BlockIter marks the left/right/up/down neighbours of
  // those blocks, which are the blocks that must be updated next iteration.
  //
  // NOTE(review): BlockIter is taken by value; this relies on the TNL Array
  // copy sharing/propagating data back to the caller — confirm against the
  // TNL::Containers::Array copy semantics of this TNL version, otherwise the
  // result is silently lost and the solver never terminates correctly.
  TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom;
  BlockIterPom.setSize( numBlockX * numBlockY );
  BlockIterPom.setValue( 0 );

  for( int i = 0; i < numBlockX * numBlockY; i++ )
  {
      if( BlockIter[ i ] )
      {
          // Row-major decomposition: i = k * numBlockX + m.
          // (Fixed: the original divided/modded by numBlockY, which only
          // agrees with the rest of the function when the block grid is
          // square; rows have length numBlockX.)
          const int m = i % numBlockX;   // column index
          const int k = i / numBlockX;   // row index

          // Vertical neighbours: one full row (numBlockX entries) apart.
          if( k > 0 )
            BlockIterPom[ i - numBlockX ] = 1;
          if( k < numBlockY - 1 )
            BlockIterPom[ i + numBlockX ] = 1;

          // Horizontal neighbours within the same row.
          if( m < numBlockX - 1 )
              BlockIterPom[ i + 1 ] = 1;
          if( m > 0 )
              BlockIterPom[ i - 1 ] = 1;
      }
  }

  // Replace the active set with its neighbour set (the just-updated blocks
  // themselves are intentionally dropped; they get re-activated by the
  // kernel only if their cells change again).
  for( int i = 0; i < numBlockX * numBlockY; i++ )
        BlockIter[ i ] = BlockIterPom[ i ];
}

}*/

__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks )
#ifdef HAVE_CUDA
template < typename Index >
__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks )
{
    int i = threadIdx.x;
    int blId = blockIdx.x;
    /*if ( i == 0 && blId == 0 ){
            printf( "nBlocks = %d \n", nBlocks );
        for( int j = nBlocks-1; j > -1 ; j--){
            printf( "cislo = %d \n", BlockIterDevice[ j ] );
        }
    }*/
    __shared__ volatile int sArray[ 512 ];
    sArray[ i ] = false;
    if(blId * 1024 + i < nBlocks )
        sArray[ i ] = BlockIterDevice[ blId * 1024 + i ];
    sArray[ i ] = 0;
    if( blId * 512 + i < nBlocks )
        sArray[ i ] = BlockIterDevice[ blId * 512 + i ];
    __syncthreads();
    
    if (blockDim.x * blockDim.y == 1024) {
    if (blockDim.x == 1024) {
        if (i < 512)
            sArray[ i ] += sArray[ i ];
            sArray[ i ] += sArray[ i + 512 ];
    }
    __syncthreads();
    if (blockDim.x * blockDim.y >= 512) {
    if (blockDim.x >= 512) {
        if (i < 256) {
            sArray[ i ] += sArray[ i ];
            sArray[ i ] += sArray[ i + 256 ];
        }
    }
    if (blockDim.x * blockDim.y >= 256) {
    __syncthreads();
    if (blockDim.x >= 256) {
        if (i < 128) {
            sArray[ i ] += sArray[ i + 128 ];
        }
    }
    __syncthreads();
    if (blockDim.x * blockDim.y >= 128) {
    if (blockDim.x >= 128) {
        if (i < 64) {
            sArray[ i ] += sArray[ i + 64 ];
        }
@@ -360,12 +408,12 @@ __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlock
    __syncthreads();
    if (i < 32 )
    {
        if(  blockDim.x * blockDim.y >= 64 ) sArray[ i ] += sArray[ i + 32 ];
        if(  blockDim.x * blockDim.y >= 32 )  sArray[ i ] += sArray[ i + 16 ];
        if(  blockDim.x * blockDim.y >= 16 )  sArray[ i ] += sArray[ i + 8 ];
        if(  blockDim.x * blockDim.y >= 8 )  sArray[ i ] += sArray[ i + 4 ];
        if(  blockDim.x * blockDim.y >= 4 )  sArray[ i ] += sArray[ i + 2 ];
        if(  blockDim.x * blockDim.y >= 2 )  sArray[ i ] += sArray[ i + 1 ];
        if(  blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ];
        if(  blockDim.x >= 32 )  sArray[ i ] += sArray[ i + 16 ];
        if(  blockDim.x >= 16 )  sArray[ i ] += sArray[ i + 8 ];
        if(  blockDim.x >= 8 )  sArray[ i ] += sArray[ i + 4 ];
        if(  blockDim.x >= 4 )  sArray[ i ] += sArray[ i + 2 ];
        if(  blockDim.x >= 2 )  sArray[ i ] += sArray[ i + 1 ];
    }
    
    if( i == 0 )
@@ -378,14 +426,15 @@ template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
                                      int *BlockIterDevice, int oddEvenBlock )
                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
{
    int thri = threadIdx.x; int thrj = threadIdx.y;
    int blIdx = blockIdx.x; int blIdy = blockIdx.y;
    int i = thri + blockDim.x*blIdx;
    int j = blockDim.y*blIdy + thrj;
    int currentIndex = thrj * blockDim.x + thri;
    
    if( BlockIterDevice[ blIdy * gridDim.x + blIdx] )
    {
    //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
    __shared__ volatile bool changed[16*16];
    changed[ currentIndex ] = false;
@@ -424,12 +473,12 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<

        if( numOfBlocky -1 == blIdy )
            ykolik = dimY - (blIdy)*blockDim.y+1;
        BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
        //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
    }
    __syncthreads();
    
    if( (blIdy%2  + blIdx) % 2 == oddEvenBlock )
    {
        if(thri == 0 && thrj == 0 )
            BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;

        if( thri == 0 )
        {        
@@ -528,14 +577,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
                if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
                if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
            }
            if( changed[ 0 ] && thri == 0 && thrj == 0 )
            if( changed[ 0 ] && thri == 0 && thrj == 0 ){
                BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
            }
            __syncthreads();
        }

        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
            aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];

    }
    /*if( thri == 0 && thrj == 0 )
        printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/
}
#endif
+2 −2
Original line number Diff line number Diff line
@@ -272,8 +272,8 @@ solve( const MeshPointer& mesh,
                                                              interfaceMapPtr.template getData< Device >(),
                                                              auxPtr.template modifyData< Device>(),
                                                              BlockIterDevice );
            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
            //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
            //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
            
            cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);