Commit 5d1fab6e authored by Matouš Fencl's avatar Matouš Fencl
Browse files

CUDA implemented for exact number of loops in 2D and 3D

parent 6d1a8d98
Loading
Loading
Loading
Loading
+12 −11
Original line number Diff line number Diff line
@@ -68,8 +68,6 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
      __cuda_callable__ void updateCell( MeshFunctionType& u,
                                         const MeshEntity& cell,
                                         const RealType velocity = 1.0 );      
   protected:
       
};

template< typename Real,
@@ -78,7 +76,6 @@ template< typename Real,
class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
{
   public:
      
      typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
      typedef Real RealType;
      typedef Device DevcieType;
@@ -96,11 +93,6 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
      __cuda_callable__ void updateCell( MeshFunctionType& u,
                                         const MeshEntity& cell,
                                         const RealType velocity = 1.0);
      
      /*Real sort( Real a, Real b, Real c,
                 const RealType& ha,
                 const RealType& hb,
                 const RealType& hc ); */
};

template < typename T1, typename T2 >
@@ -112,7 +104,8 @@ __cuda_callable__ void sortMinims( T1 pom[] );

#ifdef HAVE_CUDA
template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux );

template < typename Real, typename Device, typename Index >
@@ -120,7 +113,15 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap );

//__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, double, TNL::Devices::Cuda, int > >& input );
template < typename Real, typename Device, typename Index >
__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );

template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux );
#endif

#include "tnlDirectEikonalMethodsBase_impl.h"
+280 −176
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@

#include <iostream>
#include "tnlFastSweepingMethod.h"

template< typename Real,
          typename Device,
          typename Index >
@@ -81,12 +82,6 @@ initInterface( const MeshFunctionPointer& _input,
               MeshFunctionPointer& _output,
               InterfaceMapPointer& _interfaceMap )
{
    /*
     doplnit přepočty pro cudu:
     * overit is_same device
     * na kazdy bod jedno cuda vlakno
     */
    
            
    if( std::is_same< Device, Devices::Cuda >::value )
    {
@@ -102,7 +97,6 @@ initInterface( const MeshFunctionPointer& _input,
        CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
                                                   _output.template modifyData< Device >(),
                                                   _interfaceMap.template modifyData< Device >() );
        //CudaInitCaller<<< gridSize, blockSize >>>( input );
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE;
#endif
@@ -215,6 +209,7 @@ template< typename Real,
          typename Device,
          typename Index >
   template< typename MeshEntity >
__cuda_callable__
void
tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
updateCell( MeshFunctionType& u,
@@ -300,6 +295,29 @@ tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
initInterface( const MeshFunctionPointer& _input,
               MeshFunctionPointer& _output,
               InterfaceMapPointer& _interfaceMap  )
{
    if( std::is_same< Device, Devices::Cuda >::value )
    {
#ifdef HAVE_CUDA
        const MeshType& mesh = _input->getMesh();
        
        const int cudaBlockSize( 8 );
        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
        int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize );
        if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
            std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
        dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
        dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
        Devices::Cuda::synchronizeDevice();
        CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
                                                     _output.template modifyData< Device >(),
                                                     _interfaceMap.template modifyData< Device >() );
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE;
#endif
    }
    if( std::is_same< Device, Devices::Host >::value )
    {
        const MeshFunctionType& input =  _input.getData();
        MeshFunctionType& output =  _output.modifyData();
@@ -452,11 +470,13 @@ initInterface( const MeshFunctionPointer& _input,
                 interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245
              }
    }
}

template< typename Real,
          typename Device,
          typename Index >
   template< typename MeshEntity >
__cuda_callable__
void
tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
updateCell( MeshFunctionType& u,
@@ -628,12 +648,13 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
        Cell cell( mesh );
        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
        cell.refresh();
        const Index cind = cell.getIndex();


        output[ cell.getIndex() ] =
        output[ cind ] =
               input( cell ) >= 0 ? TypeInfo< Real >::getMaxValue() :
                                    - TypeInfo< Real >::getMaxValue();
        interfaceMap[ cell.getIndex() ] = false; 
        interfaceMap[ cind ] = false; 

        const Real& hx = mesh.getSpaceSteps().x();
        const Real& hy = mesh.getSpaceSteps().y();
@@ -651,44 +672,127 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
           if( c * input[ n ] <= 0 )
           {
               pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
               if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
                   output[ cell.getIndex() ] = pom;
               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
                   output[ cind ] = pom;

               interfaceMap[ cell.getIndex() ] = true;
           }
           if( c * input[ e ] <= 0 )
           {
               pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
               if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
                   output[ cell.getIndex() ] = pom;                       
               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
                   output[ cind ] = pom;                       

               interfaceMap[ cell.getIndex() ] = true;
               interfaceMap[ cind ] = true;
           }
           if( c * input[ w ] <= 0 )
           {
               pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
               if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
                   output[ cell.getIndex() ] = pom;
               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
                   output[ cind ] = pom;

               interfaceMap[ cell.getIndex() ] = true;
               interfaceMap[ cind ] = true;
           }
           if( c * input[ s ] <= 0 )
           {
               pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
               if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
                   output[ cell.getIndex() ] = pom;
               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
                   output[ cind ] = pom;

               interfaceMap[ cell.getIndex() ] = true;
               interfaceMap[ cind ] = true;
           }
        }
    }
}


/*__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, double, TNL::Devices::Cuda, int > >& input )
template < typename Real, typename Device, typename Index >
__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap )
{
    int i = threadIdx.x + blockDim.x*blockIdx.x;
    int j = blockDim.y*blockIdx.y + threadIdx.y;
    //const Meshes::Grid< 2, double, TNL::Devices::Cuda, int >& mesh = input.getMesh();
    int k = blockDim.z*blockIdx.z + threadIdx.z;
    const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
    
}*/
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
    {
        typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
        Cell cell( mesh );
        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
        cell.refresh();
        const Index cind = cell.getIndex();


        output[ cind ] =
               input( cell ) >= 0 ? TypeInfo< Real >::getMaxValue() :
                                    - TypeInfo< Real >::getMaxValue();
        interfaceMap[ cind ] = false; 
        cell.refresh();

        const Real& hx = mesh.getSpaceSteps().x();
        const Real& hy = mesh.getSpaceSteps().y();
        const Real& hz = mesh.getSpaceSteps().z();
        const Real& c = input( cell );
        if( ! cell.isBoundaryEntity()  )
        {
           auto neighbors = cell.getNeighborEntities();
           Real pom = 0;
           const Index e = neighbors.template getEntityIndex<  1, 0, 0 >();
           const Index w = neighbors.template getEntityIndex<  -1, 0, 0 >();
           const Index n = neighbors.template getEntityIndex<  0, 1, 0 >();
           const Index s = neighbors.template getEntityIndex<  0, -1, 0 >();
           const Index t = neighbors.template getEntityIndex<  0, 0, 1 >();
           const Index b = neighbors.template getEntityIndex<  0, 0, -1 >();
           
           if( c * input[ n ] <= 0 )
           {
               pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
                   output[ cind ] = pom;

               interfaceMap[ cind ] = true;
           }
           if( c * input[ e ] <= 0 )
           {
               pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
                   output[ cind ] = pom;                       

               interfaceMap[ cind ] = true;
           }
           if( c * input[ w ] <= 0 )
           {
               pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
                   output[ cind ] = pom;

               interfaceMap[ cind ] = true;
           }
           if( c * input[ s ] <= 0 )
           {
               pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
                   output[ cind ] = pom;

               interfaceMap[ cind ] = true;
           }
           if( c * input[ b ] <= 0 )
           {
               pom = TNL::sign( c )*( hz * c )/( c - input[ b ]);
               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
                   output[ cind ] = pom;

               interfaceMap[ cind ] = true;
           }
           if( c * input[ t ] <= 0 )
           {
               pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
                   output[ cind ] = pom;

               interfaceMap[ cind ] = true;
           }
        }
    }
}
+2 −0
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@
#include <TNL/SharedPointer.h>
#include "tnlDirectEikonalMethodsBase.h"


template< typename Mesh,
          typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > >
class FastSweepingMethod
@@ -145,6 +146,7 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
      const IndexType maxIterations;
};


#include "tnlFastSweepingMethod1D_impl.h"
#include "tnlFastSweepingMethod2D_impl.h"
#include "tnlFastSweepingMethod3D_impl.h"
+30 −19
Original line number Diff line number Diff line
@@ -14,6 +14,9 @@
#pragma once

#include "tnlFastSweepingMethod.h"
#include <TNL/TypeInfo.h>
#include <TNL/Devices/Cuda.h>


template< typename Real,
          typename Device,
@@ -212,15 +215,23 @@ solve( const MeshPointer& mesh,
      {
         // TODO: CUDA code
#ifdef HAVE_CUDA
          /*int numBlocks = 2;
          int threadsPerBlock;
          if( mesh->getDimensions().x() >= mesh->getDimensions().y() )
               threadsPerBlock = (int)( mesh->getDimensions().x() );
          else
               threadsPerBlock = (int)( mesh->getDimensions().y() );
          const int cudaBlockSize( 16 );
          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
          int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
          dim3 blockSize( cudaBlockSize, cudaBlockSize );
          dim3 gridSize( numBlocksX, numBlocksY );
          Devices::Cuda::synchronizeDevice();
          int DIM = mesh->getDimensions().x();
          
          CudaUpdateCellCaller< Real, Device, Index ><<< numBlocks, threadsPerBlock >>>( interfaceMap, aux );
          cudaDeviceSynchronize(); //copak dela?*/
          tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
          for( int k = 0; k < numBlocksX; k++)
            CudaUpdateCellCaller< Real, Device, Index ><<< gridSize, blockSize >>>( ptr,
                                                                                    interfaceMapPtr.template getData< Device >(),
                                                                                    auxPtr.template modifyData< Device>() );
          cudaDeviceSynchronize();
          TNL_CHECK_CUDA_DEVICE;
          aux = *auxPtr;
          interfaceMap = *interfaceMapPtr;
#endif
      }
      iteration++;
@@ -228,30 +239,30 @@ solve( const MeshPointer& mesh,
   aux.save("aux-final.tnl");
}

//#ifdef HAVE_CUDA
template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux )
{
    int i = threadIdx.x + blockDim.x*blockIdx.x;
    int j = threadIdx.y + blockDim.y*blockIdx.y;
    const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.getMesh();
    int j = blockDim.y*blockIdx.y + threadIdx.y;
    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
    
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
    {
        //make cell of aux from index
        typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
        Cell cell( mesh );
        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
        cell.refresh();

        //update cell value few times 
        //for( int i = 0; i < mesh.getDimensions() ; i++ )
        //{
            cell.refresh();
        //tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
        for( int k = 0; k < 16; k++ )
        {
            if( ! interfaceMap( cell ) )
            {
        //        tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::updateCell( aux, cell );
               ptr.updateCell( aux, cell );
            }
        }
        //}
    }
}
//#endif
 No newline at end of file
+218 −162
Original line number Diff line number Diff line
@@ -21,7 +21,7 @@ template< typename Real,
          typename Anisotropy >
FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
FastSweepingMethod()
: maxIterations( 1 )
: maxIterations( 2 )
{
   
}
@@ -64,6 +64,7 @@ solve( const MeshPointer& mesh,
   interfaceMapPtr->setMesh( mesh );
   std::cout << "Initiating the interface cells ..." << std::endl;
   BaseType::initInterface( u, auxPtr, interfaceMapPtr );
   cudaDeviceSynchronize();
   auxPtr->save( "aux-ini.tnl" );   
   
   typename MeshType::Cell cell( *mesh );
@@ -72,6 +73,8 @@ solve( const MeshPointer& mesh,
   MeshFunctionType aux = *auxPtr;
   InterfaceMapType interfaceMap = * interfaceMapPtr;
    while( iteration < this->maxIterations )
    {
        if( std::is_same< DeviceType, Devices::Host >::value )
        {
           for( cell.getCoordinates().z() = 0;
                cell.getCoordinates().z() < mesh->getDimensions().z();
@@ -237,6 +240,33 @@ solve( const MeshPointer& mesh,
                 }
              }
           }
      }
      if( std::is_same< DeviceType, Devices::Cuda >::value )
      {
         // TODO: CUDA code
#ifdef HAVE_CUDA
          const int cudaBlockSize( 8 );
          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
          int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
          int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); 
          if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
              std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
          dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
          dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
          Devices::Cuda::synchronizeDevice();
          
          tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
          for( int k = 0; k < numBlocksX; k++)
          CudaUpdateCellCaller< Real, Device, Index ><<< gridSize, blockSize >>>( ptr,
                                                                                  interfaceMapPtr.template getData< Device >(),
                                                                                  auxPtr.template modifyData< Device>() );
          cudaDeviceSynchronize();
          TNL_CHECK_CUDA_DEVICE;
          aux = *auxPtr;
          interfaceMap = *interfaceMapPtr;
#endif
      }
        
      //aux.save( "aux-8.tnl" );
      iteration++;
      
@@ -244,3 +274,29 @@ solve( const MeshPointer& mesh,
   aux.save("aux-final.tnl");
}

template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux )
{
    int i = threadIdx.x + blockDim.x*blockIdx.x;
    int j = blockDim.y*blockIdx.y + threadIdx.y;
    int k = blockDim.z*blockIdx.z + threadIdx.z;
    const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
    
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
    {
        typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
        Cell cell( mesh );
        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
        cell.refresh();
        //tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
        for( int l = 0; l < 8; l++ )
        {
            if( ! interfaceMap( cell ) )
            {
               ptr.updateCell( aux, cell );
            }
        }
    }
}
 No newline at end of file