Parallel version for CUDA of FSM is implemented. (223c69a7) · Commits · TNL / tnl-dev

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h

+11 −3

Original line number	Diff line number	Diff line
		@@ -101,6 +101,10 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
		__cuda_callable__ void updateCell( MeshFunctionType& u,
		const MeshEntity& cell,
		const RealType velocity = 1.0);

		__cuda_callable__ bool updateCell( volatile Real sArray[10][10][10],
		int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz,
		const Real velocity = 1.0 );
		};

		template < typename T1, typename T2 >
		@@ -125,10 +129,13 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
		template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
		Real *aux,
		int *BlockIterDevice);
		__global__ void CudaParallelReduc( int BlockIterDevice, int dBlock, int nBlocks );

		template < typename Real, typename Device, typename Index >
		__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );

		template < typename Real, typename Device, typename Index >
		__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input,
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
		@@ -142,7 +149,8 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
		template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux );
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
		int *BlockIterDevice );
		#endif

		#include "tnlDirectEikonalMethodsBase_impl.h"

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h

+74 −0

Original line number	Diff line number	Diff line
		@@ -994,4 +994,78 @@ updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
		else
		return false;
		}

		template< typename Real,
		typename Device,
		typename Index >
		__cuda_callable__
		bool
		tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
		updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
		const Real hx, const Real hy, const Real hz, const Real v )
		{
		const RealType value = sArray[thrk][thrj][thri];
		//std::cout << value << std::endl;
		RealType a, b, c, tmp = TypeInfo< RealType >::getMaxValue();

		c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ],
		sArray[ thrk-1 ][ thrj ][ thri ] );

		b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ],
		sArray[ thrk ][ thrj-1 ][ thri ] );

		a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ],
		sArray[ thrk ][ thrj ][ thri-1 ] );


		if( fabs( a ) == TypeInfo< RealType >::getMaxValue() &&
		fabs( b ) == TypeInfo< RealType >::getMaxValue() &&
		fabs( c ) == TypeInfo< RealType >::getMaxValue() )
		return false;

		RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};

		sortMinims( pom );

		tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
		if( fabs( tmp ) < fabs( pom[ 1 ] ) )
		{
		sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
		tmp = value - sArray[ thrk ][ thrj ][ thri ];
		if ( fabs( tmp ) > 0.01*hx )
		return true;
		else
		return false;
		}
		else
		{
		tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] +
		TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) -
		( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
		if( fabs( tmp ) < fabs( pom[ 2 ]) )
		{
		sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
		tmp = value - sArray[ thrk ][ thrj ][ thri ];
		if ( fabs( tmp ) > 0.01*hx )
		return true;
		else
		return false;
		}
		else
		{
		tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
		TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) -
		hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
		hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
		sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
		tmp = value - sArray[ thrk ][ thrj ][ thri ];
		if ( fabs( tmp ) > 0.01*hx )
		return true;
		else
		return false;
		}
		}

		return false;
		}
		#endif

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h

+53 −20

Original line number	Diff line number	Diff line
		@@ -27,7 +27,7 @@ template< typename Real,
		typename Anisotropy >
		FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
		FastSweepingMethod()
		: maxIterations( 1 )
		: maxIterations( 100 )
		{

		}
		@@ -236,6 +236,13 @@ solve( const MeshPointer& mesh,
		{
		// TODO: CUDA code
		#ifdef HAVE_CUDA

		Real *dAux;
		cudaMalloc(&dAux, ( mesh->getDimensions().x() * mesh->getDimensions().y() ) * sizeof( Real ) );




		const int cudaBlockSize( 16 );
		int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
		int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
		@@ -244,7 +251,7 @@ solve( const MeshPointer& mesh,

		tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;


		aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 );

		//int BlockIter = 1;// = (bool)malloc( ( numBlocksX numBlocksY ) * sizeof( bool ) );

		@@ -263,7 +270,7 @@ solve( const MeshPointer& mesh,

		CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
		interfaceMapPtr.template getData< Device >(),
		auxPtr.template modifyData< Device>(),
		dAux,
		BlockIterDevice );

		CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
		@@ -275,14 +282,16 @@ solve( const MeshPointer& mesh,
		BlockIter[ 0 ] = BlockIter[ 0 ] \|\| BlockIter[ i ];*/

		}
		aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 );
		cudaFree( dAux );
		cudaFree( BlockIterDevice );
		cudaFree( dBlock );
		cudaDeviceSynchronize();

		TNL_CHECK_CUDA_DEVICE;

		aux = *auxPtr;
		interfaceMap = *interfaceMapPtr;
		//aux = *auxPtr;
		//interfaceMap = *interfaceMapPtr;
		#endif
		}
		iteration++;
		@@ -291,6 +300,23 @@ solve( const MeshPointer& mesh,
		}

		#ifdef HAVE_CUDA
		template < typename Real, typename Device, typename Index >
		__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a )
		{
		int i = threadIdx.x + blockDim.x*blockIdx.x;
		int j = blockDim.y*blockIdx.y + threadIdx.y;
		const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 1 )
		{
		dAux[ jmesh.getDimensions().x() + i ] = aux[ jmesh.getDimensions().x() + i ];
		}
		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 0 )
		{
		aux[ jmesh.getDimensions().x() + i ] = dAux[ jmesh.getDimensions().x() + i ];
		}

		}

		__global__ void CudaParallelReduc( int BlockIterDevice, int dBlock, int nBlocks )
		{
		int i = threadIdx.x;
		@@ -341,16 +367,17 @@ __global__ void CudaParallelReduc( int BlockIterDevice, int dBlock, int nBlock
		template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
		Real *aux,
		int *BlockIterDevice )
		{
		int thri = threadIdx.x; int thrj = threadIdx.y;
		int blIdx = blockIdx.x; int blIdy = blockIdx.y;
		int i = thri + blockDim.x*blIdx;
		int j = blockDim.y*blIdy + thrj;
		int currentIndex = thrj * 16 + thri;
		int currentIndex = thrj * blockDim.x + thri;

		__shared__ volatile bool changed[256];
		//__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
		__shared__ volatile bool changed[16*16];
		changed[ currentIndex ] = false;

		if( thrj == 0 && thri == 0 )
		@@ -365,6 +392,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
		hy = mesh.getSpaceSteps().y();
		}

		//__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
		__shared__ volatile Real sArray[18][18];
		sArray[thrj][thri] = TypeInfo< Real >::getMaxValue();

		@@ -445,18 +473,23 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
		}
		__syncthreads();

		/*if( thri == 0 && thrj == 0 && blIdx == 1 && blIdy == 3 ){
		for( int h = 15; h > -1; h-- ){
		for( int g = 0; g < 16; g++ ){
		printf( "%d\t", changed[h*16+g] );
		//pyramid reduction
		if( blockDim.x*blockDim.y == 1024 )
		{
		if( currentIndex < 512 )
		{
		changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 512 ];
		}
		}
		printf("\n");
		__syncthreads();
		if( blockDim.x*blockDim.y >= 512 )
		{
		if( currentIndex < 256 )
		{
		changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 256 ];
		}
		printf("\n");
		}
		__syncthreads();*/

		//pyramid reduction
		__syncthreads();
		if( blockDim.x*blockDim.y >= 256 )
		{
		if( currentIndex < 128 )

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h

+221 −16

Original line number	Diff line number	Diff line
		@@ -255,13 +255,34 @@ solve( const MeshPointer& mesh,
		std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
		dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
		dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
		Devices::Cuda::synchronizeDevice();

		tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
		for( int k = 0; k < numBlocksX; k++)

		int *BlockIterDevice;
		int BlockIterD = 1;

		cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );
		int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
		int *dBlock;
		cudaMalloc(&dBlock, nBlocks * sizeof( int ) );

		while( BlockIterD )
		{
		CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
		interfaceMapPtr.template getData< Device >(),
		auxPtr.template modifyData< Device>() );
		auxPtr.template modifyData< Device>(),
		BlockIterDevice );
		CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
		CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );

		cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);

		/for( int i = 1; i < numBlocksX numBlocksY; i++ )
		BlockIter[ 0 ] = BlockIter[ 0 ] \|\| BlockIter[ i ];*/

		}
		cudaFree( BlockIterDevice );
		cudaFree( dBlock );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		aux = *auxPtr;
		@@ -280,27 +301,211 @@ solve( const MeshPointer& mesh,
		template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux )
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
		int *BlockIterDevice )
		{
		int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
		int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
		int i = threadIdx.x + blockDim.x*blockIdx.x;
		int j = blockDim.y*blockIdx.y + threadIdx.y;
		int k = blockDim.z*blockIdx.z + threadIdx.z;
		int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;

		__shared__ volatile bool changed[888];
		changed[ currentIndex ] = false;

		if( thrj == 0 && thri == 0 && thrk == 0 )
		changed[ 0 ] = true;

		const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
		__shared__ Real hx;
		__shared__ Real hy;
		__shared__ Real hz;
		if( thrj == 1 && thri == 1 && thrk == 1 )
		{
		hx = mesh.getSpaceSteps().x();
		hy = mesh.getSpaceSteps().y();
		hz = mesh.getSpaceSteps().z();
		}
		__shared__ volatile Real sArray[10][10][10];
		sArray[thrk][thrj][thri] = TypeInfo< Real >::getMaxValue();
		if(thri == 0 )
		{
		sArray[8][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
		sArray[9][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
		sArray[thrk+1][thrj+1][8] = TypeInfo< Real >::getMaxValue();
		sArray[thrk+1][thrj+1][9] = TypeInfo< Real >::getMaxValue();
		sArray[thrj+1][8][thrk+1] = TypeInfo< Real >::getMaxValue();
		sArray[thrj+1][9][thrk+1] = TypeInfo< Real >::getMaxValue();
		}

		//filling sArray edges
		int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
		int dimZ = mesh.getDimensions().z();
		__shared__ volatile int numOfBlockx;
		__shared__ volatile int numOfBlocky;
		__shared__ volatile int numOfBlockz;
		__shared__ int xkolik;
		__shared__ int ykolik;
		__shared__ int zkolik;
		if( thri == 0 && thrj == 0 && thrk == 0 )
		{
		xkolik = blockDim.x + 1;
		ykolik = blockDim.y + 1;
		zkolik = blockDim.z + 1;
		numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
		numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
		numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0);

		if( numOfBlockx - 1 == blIdx )
		xkolik = dimX - (blIdx)*blockDim.x+1;

		if( numOfBlocky -1 == blIdy )
		ykolik = dimY - (blIdy)*blockDim.y+1;
		if( numOfBlockz-1 == blIdz )
		zkolik = dimZ - (blIdz)*blockDim.z+1;

		BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0;
		}
		__syncthreads();

		if( thri == 0 )
		{
		if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
		sArray[thrk+1][thrj+1][0] = aux[ blIdzblockDim.z dimX * dimY + blIdy * blockDim.ydimX + blIdxblockDim.x + thrj * dimX -1 + thrkdimXdimY ];
		else
		sArray[thrk+1][thrj+1][0] = TypeInfo< Real >::getMaxValue();
		}

		if( thri == 1 )
		{
		if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
		sArray[thrk+1][thrj+1][9] = aux[ blIdzblockDim.z dimX * dimY + blIdy blockDim.ydimX+ blIdxblockDim.x + blockDim.x + thrj dimX + thrkdimXdimY ];
		else
		sArray[thrk+1][thrj+1][9] = TypeInfo< Real >::getMaxValue();
		}
		if( thri == 2 )
		{
		if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
		sArray[thrk+1][0][thrj+1] = aux[ blIdzblockDim.z dimX * dimY + blIdy * blockDim.ydimX + blIdxblockDim.x - dimX + thrj + thrkdimXdimY ];
		else
		sArray[thrk+1][0][thrj+1] = TypeInfo< Real >::getMaxValue();
		}

		if( thri == 3 )
		{
		if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
		sArray[thrk+1][9][thrj+1] = aux[ blIdzblockDim.z dimX * dimY + (blIdy+1) * blockDim.ydimX + blIdxblockDim.x + thrj + thrkdimXdimY ];
		else
		sArray[thrk+1][9][thrj+1] = TypeInfo< Real >::getMaxValue();
		}
		if( thri == 4 )
		{
		if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
		sArray[0][thrj+1][thrk+1] = aux[ blIdzblockDim.z dimX * dimY + blIdy * blockDim.ydimX + blIdxblockDim.x - dimX * dimY + thrj * dimX + thrk ];
		else
		sArray[0][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
		}

		if( thri == 5 )
		{
		if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
		sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)blockDim.z dimX * dimY + blIdy * blockDim.ydimX + blIdxblockDim.x + thrj * dimX + thrk ];
		else
		sArray[9][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
		}

		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
		{
		typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
		Cell cell( mesh );
		cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
		cell.refresh();
		//tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
		for( int l = 0; l < 10; l++ )
		sArray[thrk+1][thrj+1][thri+1] = aux[ kdimXdimY + j*dimX + i ];
		}
		__shared__ volatile int loopcounter;
		loopcounter = 0;
		__syncthreads();
		while( changed[ 0 ] )
		{
		if( ! interfaceMap( cell ) )
		__syncthreads();

		changed[ currentIndex ] = false;

		//calculation of update cell
		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ )
		{
		if( ! interfaceMap[ kdimXdimY + j * mesh.getDimensions().x() + i ] )
		{
		changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
		}
		}
		__syncthreads();

		//pyramid reduction
		if( blockDim.xblockDim.yblockDim.z == 1024 )
		{
		if( currentIndex < 512 )
		{
		changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 512 ];
		}
		}
		__syncthreads();
		if( blockDim.xblockDim.yblockDim.z >= 512 )
		{
		if( currentIndex < 256 )
		{
		changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 256 ];
		}
		}
		__syncthreads();
		if( blockDim.xblockDim.yblockDim.z >= 256 )
		{
		if( currentIndex < 128 )
		{
		changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 128 ];
		}
		}
		__syncthreads();
		if( blockDim.xblockDim.yblockDim.z >= 128 )
		{
		if( currentIndex < 64 )
		{
		changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 64 ];
		}
		}
		__syncthreads();
		if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
		{
		if( true ) changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 32 ];
		if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 16 ];
		if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 8 ];
		if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 4 ];
		if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 2 ];
		if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 1 ];
		}
		__syncthreads();

		/*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0)
		{
		ptr.updateCell( aux, cell );
		for(int m = 0; m < 8; m++){
		for(int n = 0; n<8; n++){
		for(int b=0; b<8; b++)
		printf(" %i ", changed[m64 + n8 + b]);
		printf("\n");
		}
		printf("\n \n");
		}
		}*/
		if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
		{
		//loopcounter++;
		BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1;
		}
		__syncthreads();
		/*if(thri == 0 && thrj==0 && thrk==0)
		printf("%i \n",loopcounter);
		if(loopcounter == 500)
		break;*/
		}

		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ kdimXdimY+j * mesh.getDimensions().x() + i ]) )
		aux[ kdimXdimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
		}
		#endif