Commit 9c196197 authored by Matouš Fencl's avatar Matouš Fencl
Browse files

Implementation of Blocks 18 * 18 using shared memory in 2D.

parent 5d1fab6e
Loading
Loading
Loading
Loading
+11 −1
Original line number Diff line number Diff line
@@ -68,6 +68,16 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
      __cuda_callable__ void updateCell( MeshFunctionType& u,
                                         const MeshEntity& cell,
                                         const RealType velocity = 1.0 );
      
      __cuda_callable__ void updateCell( Real sArray[18][18],
                                         int thri, int thrj, const Real hx, const Real hy,
                                         const Real velocity = 1.0 );
      
      __cuda_callable__ void setsArray( MeshFunctionType& aux, Real sArray[18][18],
                                            int dimX, int dimY, int i, int j );
      
      /*__cuda_callable__ void getsArray( MeshFunctionType& aux, Real sArray[18][18],
                                            int dimX, int dimY, int i, int j );*/
};

template< typename Real,
+229 −2
Original line number Diff line number Diff line
@@ -218,7 +218,6 @@ updateCell( MeshFunctionType& u,
{
   const auto& neighborEntities = cell.template getNeighborEntities< 2 >();
   const MeshType& mesh = cell.getMesh();
  
   const RealType& hx = mesh.getSpaceSteps().x();
   const RealType& hy = mesh.getSpaceSteps().y();
   const RealType value = u( cell );
@@ -796,3 +795,231 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
        }
    }
}

template< typename Real,
          typename Device,
          typename Index >
__cuda_callable__ void
tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
setsArray( MeshFunctionType& aux, Real sArray[18][18], int dimX, int dimY, int blockIdx, int blockIdy )
{
    int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0);
    int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0);
    int xkolik = 18;
    int ykolik = 18;
    int xOd = 0;
    int yOd = 0;
    
    if( blockIdx == 0 )
        xOd = 1;
    if( blockIdy == 0 )
        yOd = 1;
    
    if( numOfBlockx - 1 == blockIdx )
        xkolik = dimX - (numOfBlockx-1)*16+1;

    if( numOfBlocky -1 == blockIdy )
        ykolik = dimY - (numOfBlocky-1)*16+1;
    
    if( dimX > (blockIdx+1) * 16 )
    {
        for( int i = yOd; i < ykolik; i++ )
        {
            sArray[i][17] = aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + i*dimX + 17 ];
        }
    }
    if( blockIdx != 0 )
    {
        for( int i = yOd; i < ykolik; i++ )
        {
            sArray[i][0] = aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + i*dimX + 0];
        }
    } 
    if( dimY > (blockIdy+1) * 16 )
    {
        for( int i = xOd; i < xkolik; i++ )
        {
            sArray[17][i] = aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + 17*dimX + i ];
        }
    }
    if( blockIdy != 0 )
    {
        for( int i = xOd; i < xkolik; i++ )
        {
            sArray[0][i] = aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + 0*dimX + i ];
        }
    }
    
    /*int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0);
    int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0);
    int xkolik = 18;
    int ykolik = 18;

    if( numOfBlockx - 1 == blockIdx )
        xkolik = dimX - (numOfBlockx-1)*16+1;

    if( numOfBlocky -1 == blockIdy )
        ykolik = dimY - (numOfBlocky-1)*16+1;


    if( numOfBlockx == 0 || numOfBlocky == 0 )
        return;
    else
    {
        if( blockIdx == 0 )
        {
            if( blockIdy == 0 )
            {
                for( int j = 0; j < ykolik-1; j++ )
                  for( int i = 0; i < xkolik-1; i++ )
                    {
                      sArray[ j+1 ][ i+1 ] = aux[ j*dimX + i ];
                    }

                //vypis( *sArray );
            }
            else
            {
                for( int j = 0; j < ykolik; j++ )
                  for( int i = 0; i < xkolik-1; i++ )
                    {
                      sArray[ j ][ i+1 ] = aux[ blockIdy*16*dimX - dimX + j*dimX + i ];
                    }
            }
        }
        else if( blockIdy == 0 )
        {
            for( int j = 0; j < ykolik-1; j++ )
              for( int i = 0; i < xkolik; i++ )
                 {
                   sArray[ j+1 ][ i ] = aux[ blockIdx*16 - 1 + j*dimX + i ];
                 }
        }
        else
        {
           for( int j = 0; j < ykolik; j++ )
              for( int i = 0; i < xkolik; i++ )
                {
                  //if( dimX*dimY-1 >  blockIdy*16*dimX - dimX + blockIdx*16 - 1 + j*dimX + i )
                    sArray[ j ][ i ] = aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + j*dimX + i ];
                }
        }
    }*/
}

/*template< typename Real,
          typename Device,
          typename Index >
__cuda_callable__ void
tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
getsArray( MeshFunctionType& aux, Real sArray[18][18], int dimX, int dimY, int blockIdx, int blockIdy )
{
    int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0);
    int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0);
    int xkolik = 18;
    int ykolik = 18;
    
    if( numOfBlockx - 1 == blockIdx )
        xkolik = dimX - (numOfBlockx-1)*16+2;

    if( numOfBlocky -1 == blockIdy )
        ykolik = dimY - (numOfBlocky-1)*16+2;

    if( numOfBlockx == 0 || numOfBlocky == 0 )
        return;
    else
    {
        if( blockIdx == 0 )
        {
            if( blockIdy == 0 )
            {
               for( int j = 0; j < ykolik-2; j++ )
                  for( int i = 0; i < xkolik-2; i++ )
                    {
                      aux[ j*dimX + i ] = sArray[ j+1 ][ i+1 ];
                    }
            }
            else
            {
               for( int j = 1; j < ykolik-1; j++ )
                  for( int i = 0; i < xkolik-2; i++ )
                    {
                      aux[ blockIdy*16*dimX - dimX + j*dimX + i ] = sArray[ j ][ i+1 ];
                    }
            }
        }
        else if( blockIdy == 0 )
        {
            for( int j = 0; j < ykolik-2; j++ )
               for( int i = 1; i < xkolik-1; i++ )
                 {
                   aux[ blockIdx*16 - 1 + j*dimX + i ] = sArray[ j+1 ][ i ];
                 }
        }
        else
        {
            for( int j = 1; j < ykolik-1; j++ )    
              for( int i = 1; i < xkolik-1; i++ )
                {
                    aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + j*dimX + i ] = sArray[ j ][ i ];
                }
        }
    }
}*/

template< typename Real,
          typename Device,
          typename Index >
__cuda_callable__
void
tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
updateCell( Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy,
            const Real v )
{
   
   //const Meshes::Grid< 2, Real, Device, Index >& mesh = u.template getMesh< Devices::Cuda >();
   /*typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
   Cell cell( mesh );
   cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
   cell.refresh();
   
   const auto& neighborEntities = cell.template getNeighborEntities< 2 >();
   const RealType& hx = mesh.getSpaceSteps().x();
   const RealType& hy = mesh.getSpaceSteps().y();*/
   const RealType value = sArray[ thrj ][ thri ];//u( cell );
   RealType a, b, tmp = TypeInfo< RealType >::getMaxValue();
   
   /*if( value != u[ cell.getIndex() ] )
       return;*/
   
   b = TNL::argAbsMin( sArray[ thrj+1 ][ thri ],
                        sArray[ thrj-1 ][ thri ] );
    
   a = TNL::argAbsMin( sArray[ thrj ][ thri+1 ],
                        sArray[ thrj ][ thri-1 ] );


    
    if( fabs( a ) == TypeInfo< RealType >::getMaxValue() && 
        fabs( b ) == TypeInfo< RealType >::getMaxValue() )
       return;
   
    RealType pom[6] = { a, b, TypeInfo< Real >::getMaxValue(), (RealType)hx, (RealType)hy, 0.0 };
    sortMinims( pom );
    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
    
                                
    if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
    {
        //u[ cell.getIndex() ]= argAbsMin( value, tmp );
        sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
    }
    else
    {
        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
        //u[ cell.getIndex() ]= argAbsMin( value, tmp );
        sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
    }
}
 No newline at end of file
+0 −1
Original line number Diff line number Diff line
@@ -99,7 +99,6 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
                  const AnisotropyPointer& anisotropy,
                  MeshFunctionPointer& u );
      
      
   protected:
      
      const IndexType maxIterations;
+34 −13
Original line number Diff line number Diff line
@@ -221,11 +221,12 @@ solve( const MeshPointer& mesh,
          dim3 blockSize( cudaBlockSize, cudaBlockSize );
          dim3 gridSize( numBlocksX, numBlocksY );
          Devices::Cuda::synchronizeDevice();
          int DIM = mesh->getDimensions().x();
          
          tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
          for( int k = 0; k < numBlocksX; k++)
            CudaUpdateCellCaller< Real, Device, Index ><<< gridSize, blockSize >>>( ptr,
          int nBlockIter = numBlocksX > numBlocksY ? numBlocksX : numBlocksY;
          nBlockIter = numBlocksX == numBlocksY ? nBlockIter + 1 : nBlockIter;
          for( int k = 0; k < nBlockIter; k++)
            CudaUpdateCellCaller<<< gridSize, blockSize, 18 * 18 * sizeof( Real ) >>>( ptr,
                                                                                    interfaceMapPtr.template getData< Device >(),
                                                                                    auxPtr.template modifyData< Device>() );
          cudaDeviceSynchronize();
@@ -245,24 +246,44 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux )
{
    int i = threadIdx.x + blockDim.x*blockIdx.x;
    int j = blockDim.y*blockIdx.y + threadIdx.y;
    int thri = threadIdx.x; int thrj = threadIdx.y; // nelze ke stejnym pristupovat znovu pres threadIdx (vzdy da jine hodnoty)
    int blIdx = blockIdx.x; int blIdy = blockIdx.y;
    int i = thri + blockDim.x*blIdx;
    int j = blockDim.y*blIdy + thrj;
    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
    const Real hx = mesh.getSpaceSteps().x();
    const Real hy = mesh.getSpaceSteps().y();
    
    __shared__ Real sArray[ 18 ][ 18 ];
    for( int k = 0; k < 18; k++ )
        for( int l = 0; l < 18; l++ )
            sArray[ k ][ l ] = TypeInfo< Real >::getMaxValue();
    __syncthreads();
    /*//filling shared array
    ptr.setsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy );
    __syncthreads();*/
    
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
    {
        typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
        Cell cell( mesh );
        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
        cell.refresh();
        //tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
        for( int k = 0; k < 16; k++ )
        sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
        ptr.setsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy ); //fill edges of sArray
        __syncthreads();
            
        
        if( ! interfaceMap[ j*mesh.getDimensions().x() + i ] ) 
        {
            if( ! interfaceMap( cell ) )
            for( int k = 0; k < 17; k++ )
            {
               ptr.updateCell( aux, cell );
                ptr.updateCell( sArray, thri+1, thrj+1, hx, hy );
                __syncthreads();
            }
        }
        /*for( int k = 0; k < mesh.getDimensions().x(); k++)
            for( int l = 0; l < mesh.getDimensions().y(); l++)
                aux[ k*mesh.getDimensions().x() + l ] = TypeInfo< Real >::getMaxValue();*/
        aux[j*mesh.getDimensions().x() + i] = sArray[thrj+1][thri+1];
        __syncthreads();
    }
    //ptr.getsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy );
}
//#endif
 No newline at end of file
+3 −3
Original line number Diff line number Diff line
@@ -21,7 +21,7 @@ template< typename Real,
          typename Anisotropy >
FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
FastSweepingMethod()
: maxIterations( 2 )
: maxIterations( 1 )
{
   
}
@@ -257,7 +257,7 @@ solve( const MeshPointer& mesh,
          
          tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
          for( int k = 0; k < numBlocksX; k++)
          CudaUpdateCellCaller< Real, Device, Index ><<< gridSize, blockSize >>>( ptr,
          CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
                                                                                  interfaceMapPtr.template getData< Device >(),
                                                                                  auxPtr.template modifyData< Device>() );
          cudaDeviceSynchronize();
@@ -291,7 +291,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
        cell.refresh();
        //tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
        for( int l = 0; l < 8; l++ )
        for( int l = 0; l < 10; l++ )
        {
            if( ! interfaceMap( cell ) )
            {