Commit 3f14cbd0 authored by Matouš Fencl
Browse files

Merge branch 'hamilton-jacobi' of jlk.fjfi.cvut.cz:mmg/tnl-dev into hamilton-jacobi

parents 444e01c4 da336fb8
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -138,7 +138,11 @@ template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
<<<<<<< HEAD
                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 );
=======
                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf

template < typename Index >
__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+283 −1
Original line number Diff line number Diff line
@@ -187,6 +187,7 @@ solve( const MeshPointer& mesh,
    {
      // TODO: CUDA code
#ifdef HAVE_CUDA
<<<<<<< HEAD
      TNL_CHECK_CUDA_DEVICE;
      const int cudaBlockSize( 16 );
      int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
@@ -271,6 +272,86 @@ solve( const MeshPointer& mesh,
        /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
         BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
        //pocIter ++;
=======
          const int cudaBlockSize( 16 );
          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
          int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
          dim3 blockSize( cudaBlockSize, cudaBlockSize );
          dim3 gridSize( numBlocksX, numBlocksY );
          
          tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
          
          TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter;
          BlockIter.setSize( numBlocksX * numBlocksY );
          BlockIter.setValue( 0 );
          /*int* BlockIter = (int*)malloc( ( numBlocksX * numBlocksY ) * sizeof( int ) );
          for( int i = 0; i < numBlocksX*numBlocksY +1; i++)
              BlockIter[i] = 1;*/
          
          int BlockIterD = 1;
          
          TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
          BlockIterDevice.setSize( numBlocksX * numBlocksY );
          BlockIterDevice.setValue( 1 );
          /*int *BlockIterDevice;
          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );
          cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/
          
          int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
          
          TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
          dBlock.setSize( nBlocks );
          dBlock.setValue( 0 );
          /*int *dBlock;
          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
          
          while( BlockIterD )
          {
           /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
                BlockIter[ i ] = false;*/
                       
            CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
                                                             interfaceMapPtr.template getData< Device >(),
                                                             auxPtr.template modifyData< Device>(),
                                                             BlockIterDevice );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
            
            BlockIter = BlockIterDevice;
            //cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost);
            GetNeighbours( BlockIter, numBlocksX, numBlocksY );
            
            BlockIterDevice = BlockIter;
            //cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
            
            
            CudaParallelReduc<<<  nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
            
            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
            
            cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
                                   
            /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
            
          }
          /*cudaFree( BlockIterDevice );
          cudaFree( dBlock );
          delete BlockIter;*/
          cudaDeviceSynchronize();
          
          TNL_CHECK_CUDA_DEVICE;
              
          aux = *auxPtr;
          interfaceMap = *interfaceMapPtr;
#endif
>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
      }
      cudaDeviceSynchronize();
      TNL_CHECK_CUDA_DEVICE;
@@ -285,6 +366,7 @@ solve( const MeshPointer& mesh,
  }
  aux.save("aux-final.tnl");
}
<<<<<<< HEAD

#ifdef HAVE_CUDA
template < typename Index >
@@ -318,10 +400,60 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index
  }
}

=======
// Host-side one-ring dilation of the active-block mask used by the FIM
// (fast iterative method) sweep loop in solve(): every block flagged in
// BlockIter marks its 4-neighbours (up / down / left / right) as active
// for the next kernel launch; previously active blocks themselves are
// cleared unless re-marked by a neighbour (matches the original two-pass
// behaviour via the scratch array).
//
// Layout: the mask is row-major with width numBlockX, i.e.
// index = row * numBlockX + column — the same layout the CUDA kernels use
// when they write BlockIterDevice[ blIdy * gridDim.x + blIdx ].
//
// NOTE(review): two fixes against the merged original:
//  (1) BlockIter was passed BY VALUE, so the caller
//      (GetNeighbours(BlockIter,...); BlockIterDevice = BlockIter;) never
//      saw the computed mask — it is now passed by reference;
//  (2) the row/column decomposition used i % numBlockY / i / numBlockY
//      while the neighbour offsets used numBlockX; that only agreed for
//      square block grids.  The decomposition now matches the row-major
//      layout above.
template < typename Index >
void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index >& BlockIter, int numBlockX, int numBlockY )
{
    // Scratch mask: neighbours are recorded here so marks added during the
    // scan do not cascade within a single call.
    TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom;
    BlockIterPom.setSize( numBlockX * numBlockY );
    BlockIterPom.setValue( 0 );

    for( int i = 0; i < numBlockX * numBlockY; i++ )
    {
        if( BlockIter[ i ] )
        {
            const int m = i % numBlockX;   // column (x) of block i
            const int k = i / numBlockX;   // row (y) of block i

            if( k > 0 )
                BlockIterPom[ i - numBlockX ] = 1;   // block above
            if( k < numBlockY - 1 )
                BlockIterPom[ i + numBlockX ] = 1;   // block below
            if( m < numBlockX - 1 )
                BlockIterPom[ i + 1 ] = 1;           // block to the right
            if( m > 0 )
                BlockIterPom[ i - 1 ] = 1;           // block to the left
        }
    }

    // Publish the neighbour mask.  As in the original, a block stays active
    // only if one of its neighbours was active in the previous sweep.
    for( int i = 0; i < numBlockX * numBlockY; i++ )
        BlockIter[ i ] = BlockIterPom[ i ];
}

#ifdef HAVE_CUDA
>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
// Sum-reduction kernel: folds BlockIterDevice[ 0 .. nBlocks-1 ] into
// dBlock[ blockIdx.x ] (one partial sum per CUDA block).  Launched twice
// by solve(): first as <<< ceil(n/512), 512 >>>, then as <<< 1, nBlocks >>>
// to combine the partials — so blockDim.x may be any value up to 512 and
// need NOT be a power of two.
//
// Shared memory: 512 ints per block (static).  Grid: 1-D.
//
// NOTE(review): this block contained unresolved merge-conflict markers; the
// two sides were the same reduction and the HEAD side was truncated in the
// diff, so the complete incoming version was kept.  Three defects fixed:
//  (1) the blockDim.x == 1024 branch indexed sArray[ i + 512 ] past the
//      512-entry shared array (and a 1024-thread launch would already have
//      written out of bounds at the load) — launches above 512 threads are
//      not supported and the guard on the load now enforces i < 512;
//  (2) only sArray[ i ] for i < blockDim.x was zeroed, so for a
//      non-power-of-two blockDim.x the unrolled tail read uninitialized
//      shared memory — the whole scratch array is now zeroed;
//  (3) the `blockDim.x >= 512` guard dropped partial sums at indices
//      256..511 whenever 256 < blockDim.x < 512; with the array fully
//      zeroed the tree can simply run unconditionally.
template < typename Index >
__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks )
{
    int i = threadIdx.x;
    int blId = blockIdx.x;
    __shared__ volatile int sArray[ 512 ];

    // Zero the WHOLE scratch array (not just the first blockDim.x slots):
    // the tree below reads up to sArray[ 511 ].
    for( int j = i; j < 512; j += blockDim.x )
        sArray[ j ] = 0;
    __syncthreads();

    // Each thread loads one flag; i < 512 also guards against launches with
    // more than 512 threads, which this kernel does not support.
    if( i < 512 && blId * 512 + i < nBlocks )
        sArray[ i ] = BlockIterDevice[ blId * 512 + i ];
    __syncthreads();

    // Tree reduction.  Safe for any blockDim.x <= 512: slots that were
    // never loaded are zero, so the extra additions are no-ops.
    if( i < 256 ) sArray[ i ] += sArray[ i + 256 ];
    __syncthreads();
    if( i < 128 ) sArray[ i ] += sArray[ i + 128 ];
    __syncthreads();
    if( i < 64 )  sArray[ i ] += sArray[ i + 64 ];
    __syncthreads();

    // Final warp: unrolled tail on the volatile array (pre-Volta idiom).
    // NOTE(review): on Volta+ independent thread scheduling this pattern
    // needs __syncwarp() between steps — confirm the target SM.
    if( i < 32 )
    {
        sArray[ i ] += sArray[ i + 32 ];
        sArray[ i ] += sArray[ i + 16 ];
        sArray[ i ] += sArray[ i + 8 ];
        sArray[ i ] += sArray[ i + 4 ];
        sArray[ i ] += sArray[ i + 2 ];
        sArray[ i ] += sArray[ i + 1 ];
    }

    // Thread 0 publishes this block's partial sum.
    if( i == 0 )
        dBlock[ blId ] = sArray[ 0 ];
}


@@ -371,7 +554,11 @@ template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
<<<<<<< HEAD
                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne )
=======
                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
{
  int thri = threadIdx.x; int thrj = threadIdx.y;
  int blIdx = blockIdx.x; int blIdy = blockIdx.y;
@@ -408,7 +595,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
    int i = thri + blockDim.x*blIdx;
    int j = blockDim.y*blIdy + thrj;
    int currentIndex = thrj * blockDim.x + thri;
    
    if( BlockIterDevice[ blIdy * gridDim.x + blIdx] )
    {
    //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
    __shared__ volatile bool changed[16*16];
    changed[ currentIndex ] = false;
@@ -444,14 +632,24 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
        sArray[thrj+1][0] = std::numeric_limits< Real >::max();
    }
    
<<<<<<< HEAD
    if( thri == 2 )
    {
      if( dimY > (blIdy+1) * blockDim.y  && thri+1 < xkolik && NE == 1 )
        sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
      else
        sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
=======
        if( numOfBlockx - 1 == blIdx )
            xkolik = dimX - (blIdx)*blockDim.x+1;

        if( numOfBlocky -1 == blIdy )
            ykolik = dimY - (blIdy)*blockDim.y+1;
        //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
    }
    
<<<<<<< HEAD
    if( thri == 3 )
    {
      if( blIdy != 0 && thrj+1 < xkolik && NE == 1 )
@@ -477,6 +675,20 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
      if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
      {
        if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
=======
        if(thri == 0 && thrj == 0 )
            BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;

        if( thri == 0 )
        {        
            if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik )
                sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
            else
                sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
        }

        if( thri == 1 )
>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
        {
          changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
        }
@@ -512,6 +724,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
      {
        if( currentIndex < 64 )
        {
<<<<<<< HEAD
          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
        }
      }
@@ -533,5 +746,74 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
      aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
  }
=======
            __syncthreads();

            changed[ currentIndex] = false;

        //calculation of update cell
            if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
            {
                if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
                {
                    changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
                }
            }
            __syncthreads();

        //pyramid reduction
            if( blockDim.x*blockDim.y == 1024 )
            {
                if( currentIndex < 512 )
                {
                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
                }
            }
            __syncthreads();
            if( blockDim.x*blockDim.y >= 512 )
            {
                if( currentIndex < 256 )
                {
                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
                }
            }
            __syncthreads();
            if( blockDim.x*blockDim.y >= 256 )
            {
                if( currentIndex < 128 )
                {
                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
                }
            }
            __syncthreads();
            if( blockDim.x*blockDim.y >= 128 )
            {
                if( currentIndex < 64 )
                {
                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
                }
            }
            __syncthreads();
            if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
            {
                if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
                if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
                if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
                if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
                if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
                if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
            }
            if( changed[ 0 ] && thri == 0 && thrj == 0 ){
                BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
            }
            __syncthreads();
        }

        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
            aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
    }
    /*if( thri == 0 && thrj == 0 )
        printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/
>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
}
#endif
+5 −0
Original line number Diff line number Diff line
@@ -280,12 +280,17 @@ solve( const MeshPointer& mesh,
                                                              interfaceMapPtr.template getData< Device >(),
                                                              auxPtr.template modifyData< Device>(),
                                                              BlockIterDevice );
<<<<<<< HEAD
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
            
            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
=======
            //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
            //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
            
            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
            cudaDeviceSynchronize();