Commit e3c970e1 authored by Matouš Fencl's avatar Matouš Fencl Committed by Tomáš Oberhuber
Browse files

FIM implemented in 3D

parent 39b4889c
Loading
Loading
Loading
Loading
+8 −2
Original line number Diff line number Diff line
@@ -160,11 +160,17 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );

template < typename Real, typename Device, typename Index >
template < int sizeSArray, typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );

template < typename Index >
__global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,
        int numBlockX, int numBlockY, int numBlockZ );
#endif

#include "tnlDirectEikonalMethodsBase_impl.h"
+7 −9
Original line number Diff line number Diff line
@@ -85,7 +85,7 @@ solve( const MeshPointer& mesh,
  {
    if( std::is_same< DeviceType, Devices::Host >::value )
    {
      int numThreadsPerBlock = 1024;
      int numThreadsPerBlock = 16;
      
      
      int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
@@ -115,13 +115,13 @@ solve( const MeshPointer& mesh,
       }
       std::cout<<std::endl;*/
      unsigned int numWhile = 0;
      while( IsCalculationDone  )
      while( IsCalculationDone && numWhile < 1 )
      {      
        IsCalculationDone = 0;
        helpFunc1 = auxPtr;
        auxPtr = helpFunc;
        helpFunc = helpFunc1;
        this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
        this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
        
        //Reduction      
        for( int i = 0; i < BlockIterHost.getSize(); i++ ){
@@ -394,9 +394,7 @@ solve( const MeshPointer& mesh,
        numIter ++;
      }
      if( numIter == 1 ){
        helpFunc1 = auxPtr;
        auxPtr = helpFunc;
        helpFunc = helpFunc1;
      }
      /*cudaFree( BlockIterDevice );
       cudaFree( dBlock );
@@ -535,10 +533,10 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
    __syncthreads();
    /**-----------------------------------------*/
    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
    __shared__ volatile int dimX;
    __shared__ volatile int dimY;
    __shared__ volatile Real hx;
    __shared__ volatile Real hy;
    __shared__ int dimX;
    __shared__ int dimY;
    __shared__ Real hx;
    __shared__ Real hy;
    if( thri==0 && thrj == 0)
    {
      dimX = mesh.getDimensions().x();
+463 −418
Original line number Diff line number Diff line
@@ -264,6 +264,9 @@ solve( const MeshPointer& mesh,
      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
      BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ );
      BlockIterDevice.setValue( 1 );
      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
      BlockIterPom.setSize( numBlocksX * numBlocksY * numBlocksZ );
      BlockIterPom.setValue( 0 );
      /*int *BlockIterDevice;
       cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/
      int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
@@ -271,18 +274,38 @@ solve( const MeshPointer& mesh,
      TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
      dBlock.setSize( nBlocks );
      dBlock.setValue( 0 );
      
      int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0);
      /*int *dBlock;
       cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
      MeshFunctionPointer helpFunc1( mesh );      
      MeshFunctionPointer helpFunc( mesh );
      
      helpFunc1 = auxPtr;
      auxPtr = helpFunc;
      helpFunc = helpFunc1;
      int numIter = 0;
      
      while( BlockIterD )
      {
             CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
        helpFunc1 = auxPtr;
        auxPtr = helpFunc;
        helpFunc = helpFunc1;
        TNL_CHECK_CUDA_DEVICE;
        
        CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
                interfaceMapPtr.template getData< Device >(),
                                                              auxPtr.template modifyData< Device>(),
                auxPtr.template getData< Device>(),
                helpFunc.template modifyData< Device>(),
                BlockIterDevice );
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE;
        
        GetNeighbours3D<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY, numBlocksZ );
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE;
        BlockIterDevice = BlockIterPom;
        
        CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE;
@@ -291,11 +314,14 @@ solve( const MeshPointer& mesh,
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE;
        cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
                                   
        numIter++;
        /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
         BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
        
      }
      if( numIter == 1 ){
        auxPtr = helpFunc;
      }
      //cudaFree( BlockIterDevice );
      //cudaFree( dBlock );
      cudaDeviceSynchronize();
@@ -313,10 +339,43 @@ solve( const MeshPointer& mesh,
}

#ifdef HAVE_CUDA
template < typename Real, typename Device, typename Index >
template < typename Index >
__global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,
        int numBlockX, int numBlockY, int numBlockZ )
{
  int i = blockIdx.x * 1024 + threadIdx.x;
  
  if( i < numBlockX * numBlockY * numBlockZ )
  {
    int pom = 0;//BlockIterPom[ i ] = 0;
    int m=0, l=0, k=0;
    l = i/( numBlockX * numBlockY );
    k = (i-l*numBlockX * numBlockY )/(numBlockX );
    m = (i-l*numBlockX * numBlockY )%( numBlockX );
    if( m > 0 && BlockIterDevice[ i - 1 ] ){
      pom = 1;//BlockIterPom[ i ] = 1;
    }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){
      pom = 1;//BlockIterPom[ i ] = 1;
    }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){
      pom = 1;// BlockIterPom[ i ] = 1;
    }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){
      pom = 1;//BlockIterPom[ i ] = 1;
    }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){
      pom = 1;
    }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){
      pom = 1;
    }
    
    BlockIterPom[ i ] = pom;//BlockIterPom[ i ];
  }
}

template < int sizeSArray, typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
{
  int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
@@ -326,62 +385,54 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
  int k = blockDim.z*blockIdx.z + threadIdx.z;
  int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
  
    __shared__ volatile bool changed[8*8*8];
    changed[ currentIndex ] = false;
  if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] )
  {
    __syncthreads();
    
    __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)];
    
    changed[ currentIndex ] = false;
    if( thrj == 0 && thri == 0 && thrk == 0 )
      changed[ 0 ] = true;
    
    const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
    __shared__ Real hx;
    __shared__ Real hy;
    __shared__ Real hz;
    __shared__ Real hx; __shared__ int dimX;
    __shared__ Real hy; __shared__ int dimY;
    __shared__ Real hz; __shared__ int dimZ;
    
    if( thrj == 1 && thri == 1 && thrk == 1 )
    {
      hx = mesh.getSpaceSteps().x();
      hy = mesh.getSpaceSteps().y();
      hz = mesh.getSpaceSteps().z();
      dimX = mesh.getDimensions().x();
      dimY = mesh.getDimensions().y();
      dimZ = mesh.getDimensions().z();
      BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0;
    }
    __shared__ volatile Real sArray[10][10][10];
    sArray[thrk][thrj][thri] = std::numeric_limits< Real >::max();
    if(thri == 0 )
    {
        sArray[8][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
        sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
        sArray[thrk+1][thrj+1][8] = std::numeric_limits< Real >::max();
        sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max();
        sArray[thrj+1][8][thrk+1] = std::numeric_limits< Real >::max();
        sArray[thrj+1][9][thrk+1] = std::numeric_limits< Real >::max();
    }
    __shared__ volatile Real sArray[sizeSArray][sizeSArray][sizeSArray];
    sArray[thrk+1][thrj+1][thri+1] = std::numeric_limits< Real >::max();
    
    //filling sArray edges
    int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
    int dimZ = mesh.getDimensions().z();
    __shared__ volatile int numOfBlockx;
    __shared__ volatile int numOfBlocky;
    __shared__ volatile int numOfBlockz;
    __shared__ int xkolik;
    __shared__ int ykolik;
    __shared__ int zkolik;
    if( thri == 0 && thrj == 0 && thrk == 0 )
    {
    int numOfBlockx;
    int numOfBlocky;
    int numOfBlockz;
    int xkolik;
    int ykolik;
    int zkolik;
    xkolik = blockDim.x + 1;
    ykolik = blockDim.y + 1;
    zkolik = blockDim.z + 1;
        numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
        numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
        numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0);
    numOfBlockx = gridDim.x;
    numOfBlocky = gridDim.y;
    numOfBlockz = gridDim.z;
    
    if( numOfBlockx - 1 == blIdx )
      xkolik = dimX - (blIdx)*blockDim.x+1;

    if( numOfBlocky -1 == blIdy )
      ykolik = dimY - (blIdy)*blockDim.y+1;
    if( numOfBlockz-1 == blIdz )
      zkolik = dimZ - (blIdz)*blockDim.z+1;
        
        BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0;
    }
    __syncthreads();
    
    if( thri == 0 )
@@ -430,12 +481,10 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
        sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
    }
    
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
    if( i < dimX && j < dimY && k < dimZ )
    {
      sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
    }
    __shared__ volatile int loopcounter;
    loopcounter = 0;
    __syncthreads(); 
    while( changed[ 0 ] )
    {
@@ -510,17 +559,13 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       }*/
      if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
      {
            //loopcounter++;
        BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1;
      }
      __syncthreads();
        /*if(thri == 0 && thrj==0 && thrk==0)
            printf("%i \n",loopcounter);
        if(loopcounter == 500)
            break;*/
    }
    
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ k*dimX*dimY+j * mesh.getDimensions().x() + i ]) )
        aux[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
    if( i < dimX && j < dimY && k < dimZ )
      helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
  } 
}  
#endif