Commit 223c69a7 authored by Matouš Fencl's avatar Matouš Fencl
Browse files

Parallel version for CUDA of FSM is implemented.

parent 68f3276e
Loading
Loading
Loading
Loading
+11 −3
Original line number Diff line number Diff line
@@ -101,6 +101,10 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
      __cuda_callable__ void updateCell( MeshFunctionType& u,
                                         const MeshEntity& cell,
                                         const RealType velocity = 1.0);
      
      __cuda_callable__ bool updateCell( volatile Real sArray[10][10][10],
                                         int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz,
                                         const Real velocity = 1.0 );
};

template < typename T1, typename T2 >
@@ -125,10 +129,13 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
                                      Real *aux,
                                      int *BlockIterDevice);
__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks );

template < typename Real, typename Device, typename Index >
__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );

template < typename Real, typename Device, typename Index >
__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
@@ -142,7 +149,8 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux );
                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
                                      int *BlockIterDevice );
#endif

#include "tnlDirectEikonalMethodsBase_impl.h"
+74 −0
Original line number Diff line number Diff line
@@ -994,4 +994,78 @@ updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
    else
        return false;
}

template< typename Real,
          typename Device,
          typename Index >
__cuda_callable__ 
bool 
tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
        const Real hx, const Real hy, const Real hz, const Real v )
{
   const RealType value = sArray[thrk][thrj][thri];
   //std::cout << value << std::endl;
   RealType a, b, c, tmp = TypeInfo< RealType >::getMaxValue();
   
   c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ],
                        sArray[ thrk-1 ][ thrj ][ thri ] );
    
   b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ],
                        sArray[ thrk ][ thrj-1 ][ thri ] );
   
   a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ],
                        sArray[ thrk ][ thrj ][ thri-1 ] );
   
   
   if( fabs( a ) == TypeInfo< RealType >::getMaxValue() && 
       fabs( b ) == TypeInfo< RealType >::getMaxValue() &&
       fabs( c ) == TypeInfo< RealType >::getMaxValue() )
      return false;
   
    RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
    
    sortMinims( pom );
    
    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
    if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
    {
        sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
        tmp = value - sArray[ thrk ][ thrj ][ thri ];
        if ( fabs( tmp ) >  0.01*hx )
            return true;
        else
            return false;
    }
    else
    {
        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
        if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
        {
            sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
            tmp = value - sArray[ thrk ][ thrj ][ thri ];
            if ( fabs( tmp ) > 0.01*hx )
                return true;
            else
                return false;
        }
        else
        {
            tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
                TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
                hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
                hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
            sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
            tmp = value - sArray[ thrk ][ thrj ][ thri ];
            if ( fabs( tmp ) > 0.01*hx )
                return true;
            else
                return false;
        }
    }
    
    return false;
}
#endif
+53 −20
Original line number Diff line number Diff line
@@ -27,7 +27,7 @@ template< typename Real,
          typename Anisotropy >
FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
FastSweepingMethod()
: maxIterations( 1 )
: maxIterations( 100 )
{
   
}
@@ -236,6 +236,13 @@ solve( const MeshPointer& mesh,
      {
         // TODO: CUDA code
#ifdef HAVE_CUDA
          
          Real *dAux;
          cudaMalloc(&dAux, ( mesh->getDimensions().x() * mesh->getDimensions().y() ) * sizeof( Real ) );
          
          
          
          
          const int cudaBlockSize( 16 );
          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
          int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
@@ -244,7 +251,7 @@ solve( const MeshPointer& mesh,
          
          tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
          
          
          aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 );
          
          //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) );

@@ -263,7 +270,7 @@ solve( const MeshPointer& mesh,
                       
            CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
                                                             interfaceMapPtr.template getData< Device >(),
                                                             auxPtr.template modifyData< Device>(),
                                                             dAux,
                                                             BlockIterDevice );
            
            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
@@ -275,14 +282,16 @@ solve( const MeshPointer& mesh,
                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
            
          }
          aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 );
          cudaFree( dAux );
          cudaFree( BlockIterDevice );
          cudaFree( dBlock );
          cudaDeviceSynchronize();
          
          TNL_CHECK_CUDA_DEVICE;
              
          aux = *auxPtr;
          interfaceMap = *interfaceMapPtr;
          //aux = *auxPtr;
          //interfaceMap = *interfaceMapPtr;
#endif
      }
      iteration++;
@@ -291,6 +300,23 @@ solve( const MeshPointer& mesh,
}

#ifdef HAVE_CUDA
template < typename Real, typename Device, typename Index >
__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a )
{
    int i = threadIdx.x + blockDim.x*blockIdx.x;
    int j = blockDim.y*blockIdx.y + threadIdx.y;
    const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 1 )
    {    
        dAux[ j*mesh.getDimensions().x() + i ] = aux[ j*mesh.getDimensions().x() + i ];
    }
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 0 )
    {    
        aux[ j*mesh.getDimensions().x() + i ] = dAux[ j*mesh.getDimensions().x() + i ];
    }
    
}

__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks )
{
    int i = threadIdx.x;
@@ -341,16 +367,17 @@ __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlock
template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
                                      Real *aux,
                                      int *BlockIterDevice )
{
    int thri = threadIdx.x; int thrj = threadIdx.y;
    int blIdx = blockIdx.x; int blIdy = blockIdx.y;
    int i = thri + blockDim.x*blIdx;
    int j = blockDim.y*blIdy + thrj;
    int currentIndex = thrj * 16 + thri;
    int currentIndex = thrj * blockDim.x + thri;
    
    __shared__ volatile bool changed[256];
    //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
    __shared__ volatile bool changed[16*16];
    changed[ currentIndex ] = false;
    
    if( thrj == 0 && thri == 0 )
@@ -365,6 +392,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
        hy = mesh.getSpaceSteps().y();
    }
    
    //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
    __shared__ volatile Real sArray[18][18];
    sArray[thrj][thri] = TypeInfo< Real >::getMaxValue();
    
@@ -445,18 +473,23 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
        }
        __syncthreads();
        
        /*if( thri == 0 && thrj == 0 && blIdx == 1 && blIdy == 3 ){
            for( int h = 15; h > -1; h-- ){
                for( int g = 0; g < 16; g++ ){
                    printf( "%d\t", changed[h*16+g] );
    //pyramid reduction
        if( blockDim.x*blockDim.y == 1024 )
        {
            if( currentIndex < 512 )
            {
                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
            }
        }
                printf("\n");
        __syncthreads();
        if( blockDim.x*blockDim.y >= 512 )
        {
            if( currentIndex < 256 )
            {
                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
            }
            printf("\n");
        }
        __syncthreads();*/
        
    //pyramid reduction
        __syncthreads();
        if( blockDim.x*blockDim.y >= 256 )
        {
            if( currentIndex < 128 )
+221 −16
Original line number Diff line number Diff line
@@ -255,13 +255,34 @@ solve( const MeshPointer& mesh,
              std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
          dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
          dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
          Devices::Cuda::synchronizeDevice();
                 
          tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
          for( int k = 0; k < numBlocksX; k++)
          
          int *BlockIterDevice;
          int BlockIterD = 1;
          
          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );
          int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
          int *dBlock;
          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
          
          while( BlockIterD )
          {
             CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
                                                              interfaceMapPtr.template getData< Device >(),
                                                                                  auxPtr.template modifyData< Device>() );
                                                              auxPtr.template modifyData< Device>(),
                                                              BlockIterDevice );
            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
            
            cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
                                   
            /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
            
          }
          cudaFree( BlockIterDevice );
          cudaFree( dBlock );
          cudaDeviceSynchronize();
          TNL_CHECK_CUDA_DEVICE;
          aux = *auxPtr;
@@ -280,27 +301,211 @@ solve( const MeshPointer& mesh,
template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux )
                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
                                      int *BlockIterDevice )
{
    int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
    int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
    int i = threadIdx.x + blockDim.x*blockIdx.x;
    int j = blockDim.y*blockIdx.y + threadIdx.y;
    int k = blockDim.z*blockIdx.z + threadIdx.z;
    int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
    
    __shared__ volatile bool changed[8*8*8];
    changed[ currentIndex ] = false;
    
    if( thrj == 0 && thri == 0 && thrk == 0 )
        changed[ 0 ] = true;
    
    const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
    __shared__ Real hx;
    __shared__ Real hy;
    __shared__ Real hz;
    if( thrj == 1 && thri == 1 && thrk == 1 )
    {
        hx = mesh.getSpaceSteps().x();
        hy = mesh.getSpaceSteps().y();
        hz = mesh.getSpaceSteps().z();
    }
    __shared__ volatile Real sArray[10][10][10];
    sArray[thrk][thrj][thri] = TypeInfo< Real >::getMaxValue();
    if(thri == 0 )
    {
        sArray[8][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
        sArray[9][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
        sArray[thrk+1][thrj+1][8] = TypeInfo< Real >::getMaxValue();
        sArray[thrk+1][thrj+1][9] = TypeInfo< Real >::getMaxValue();
        sArray[thrj+1][8][thrk+1] = TypeInfo< Real >::getMaxValue();
        sArray[thrj+1][9][thrk+1] = TypeInfo< Real >::getMaxValue();
    }
            
    //filling sArray edges
    int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
    int dimZ = mesh.getDimensions().z();
    __shared__ volatile int numOfBlockx;
    __shared__ volatile int numOfBlocky;
    __shared__ volatile int numOfBlockz;
    __shared__ int xkolik;
    __shared__ int ykolik;
    __shared__ int zkolik;
    if( thri == 0 && thrj == 0 && thrk == 0 )
    {
        xkolik = blockDim.x + 1;
        ykolik = blockDim.y + 1;
        zkolik = blockDim.z + 1;
        numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
        numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
        numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0);
        
        if( numOfBlockx - 1 == blIdx )
            xkolik = dimX - (blIdx)*blockDim.x+1;

        if( numOfBlocky -1 == blIdy )
            ykolik = dimY - (blIdy)*blockDim.y+1;
        if( numOfBlockz-1 == blIdz )
            zkolik = dimZ - (blIdz)*blockDim.z+1;
        
        BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0;
    }
    __syncthreads();
    
    if( thri == 0 )
    {        
        if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
            sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
        else
            sArray[thrk+1][thrj+1][0] = TypeInfo< Real >::getMaxValue();
    }
    
    if( thri == 1 )
    {
        if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
            sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
        else
            sArray[thrk+1][thrj+1][9] = TypeInfo< Real >::getMaxValue();
    }
    if( thri == 2 )
    {        
        if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
            sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
        else
            sArray[thrk+1][0][thrj+1] = TypeInfo< Real >::getMaxValue();
    }
    
    if( thri == 3 )
    {
        if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
            sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
        else
            sArray[thrk+1][9][thrj+1] = TypeInfo< Real >::getMaxValue();
    }
    if( thri == 4 )
    {        
        if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
            sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
        else
            sArray[0][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
    }
    
    if( thri == 5 )
    {
        if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
            sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
        else
            sArray[9][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
    }
    
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
    {
        typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
        Cell cell( mesh );
        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
        cell.refresh();
        //tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
        for( int l = 0; l < 10; l++ )
        sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
    }
    __shared__ volatile int loopcounter;
    loopcounter = 0;
    __syncthreads(); 
    while( changed[ 0 ] )
    {
            if( ! interfaceMap( cell ) )
        __syncthreads();
        
        changed[ currentIndex ] = false;
        
    //calculation of update cell
        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ )
        {
            if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] )
            {
                changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
            }
        }
        __syncthreads();
        
    //pyramid reduction
        if( blockDim.x*blockDim.y*blockDim.z == 1024 )
        {
            if( currentIndex < 512 )
            {
                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
            }
        }
        __syncthreads();
        if( blockDim.x*blockDim.y*blockDim.z >= 512 )
        {
            if( currentIndex < 256 )
            {
                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
            }
        }
        __syncthreads();
        if( blockDim.x*blockDim.y*blockDim.z >= 256 )
        {
            if( currentIndex < 128 )
            {
                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
            }
        }
        __syncthreads();
        if( blockDim.x*blockDim.y*blockDim.z >= 128 )
        {
            if( currentIndex < 64 )
            {
                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
            }
        }
        __syncthreads();
        if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
        {
            if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
            if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
            if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
            if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
            if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
            if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
        }
        __syncthreads();
        
        /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0)
        {
               ptr.updateCell( aux, cell );
            for(int m = 0; m < 8; m++){
                for(int n = 0; n<8; n++){
                    for(int b=0; b<8; b++)
                        printf(" %i ", changed[m*64 + n*8 + b]);
                    printf("\n");
                }
                printf("\n \n");
            }
        }*/
        if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
        {
            //loopcounter++;
            BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1;
        }
        __syncthreads();
        /*if(thri == 0 && thrj==0 && thrk==0)
            printf("%i \n",loopcounter);
        if(loopcounter == 500)
            break;*/
    }
  
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ k*dimX*dimY+j * mesh.getDimensions().x() + i ]) )
        aux[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
}   
#endif