Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +11 −3 Original line number Diff line number Diff line Loading @@ -101,6 +101,10 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > __cuda_callable__ void updateCell( MeshFunctionType& u, const MeshEntity& cell, const RealType velocity = 1.0); __cuda_callable__ bool updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz, const Real velocity = 1.0 ); }; template < typename T1, typename T2 > Loading @@ -125,10 +129,13 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *aux, int *BlockIterDevice); __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks ); template < typename Real, typename Device, typename Index > __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, Loading @@ -142,7 +149,8 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3 template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux ); Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, int *BlockIterDevice ); #endif #include "tnlDirectEikonalMethodsBase_impl.h" src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +74 −0 Original line number Diff line number Diff line Loading @@ -994,4 +994,78 @@ updateCell( volatile Real sArray[18], int thri, const Real h, const Real v ) else return false; } template< typename Real, typename Device, typename Index > __cuda_callable__ bool tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz, const Real v ) { const RealType value = sArray[thrk][thrj][thri]; //std::cout << value << std::endl; RealType a, b, c, tmp = TypeInfo< RealType >::getMaxValue(); c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ], sArray[ thrk-1 ][ thrj ][ thri ] ); b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ], sArray[ thrk ][ thrj-1 ][ thri ] ); a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ], sArray[ thrk ][ thrj ][ thri-1 ] ); if( fabs( a ) == TypeInfo< RealType >::getMaxValue() && fabs( b ) == TypeInfo< RealType >::getMaxValue() && fabs( c ) == TypeInfo< RealType >::getMaxValue() ) return false; RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; sortMinims( pom ); tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; if( fabs( tmp ) < fabs( pom[ 1 ] ) ) { sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); tmp = value - sArray[ thrk ][ thrj ][ thri ]; if ( fabs( tmp ) > 0.01*hx ) return true; else return false; } else { tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); if( fabs( tmp ) < fabs( pom[ 2 ]) ) { sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); tmp = value - sArray[ thrk ][ thrj ][ thri ]; if ( fabs( tmp ) > 0.01*hx ) return true; else return false; } else { tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); tmp = value - sArray[ thrk ][ thrj ][ thri ]; if ( fabs( tmp ) > 0.01*hx ) return true; else return false; } } return false; } #endif src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +53 −20 Original line number Diff line number Diff line Loading @@ -27,7 +27,7 @@ template< typename Real, typename Anisotropy > FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: FastSweepingMethod() : maxIterations( 1 ) : maxIterations( 100 ) { } Loading Loading @@ -236,6 +236,13 @@ solve( const MeshPointer& mesh, { // TODO: CUDA code #ifdef HAVE_CUDA Real *dAux; cudaMalloc(&dAux, ( mesh->getDimensions().x() * mesh->getDimensions().y() ) * sizeof( Real ) ); const int cudaBlockSize( 16 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); Loading @@ -244,7 +251,7 @@ solve( const MeshPointer& mesh, tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 ); //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) ); Loading @@ -263,7 +270,7 @@ solve( const MeshPointer& mesh, CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), dAux, BlockIterDevice ); CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); Loading @@ -275,14 +282,16 @@ solve( const MeshPointer& mesh, BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 ); cudaFree( dAux ); cudaFree( BlockIterDevice ); cudaFree( dBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; interfaceMap = *interfaceMapPtr; //aux = *auxPtr; //interfaceMap = *interfaceMapPtr; #endif } iteration++; Loading @@ -291,6 +300,23 @@ solve( const MeshPointer& mesh, } #ifdef HAVE_CUDA template < typename Real, typename Device, typename Index > __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a ) { int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 1 ) { dAux[ j*mesh.getDimensions().x() + i ] = aux[ j*mesh.getDimensions().x() + i ]; } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 0 ) { aux[ j*mesh.getDimensions().x() + i ] = dAux[ j*mesh.getDimensions().x() + i ]; } } __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks ) { int i = threadIdx.x; Loading Loading @@ -341,16 +367,17 @@ __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlock template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *aux, int *BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; int blIdx = blockIdx.x; int blIdy = blockIdx.y; int i = thri + blockDim.x*blIdx; int j = blockDim.y*blIdy + thrj; int currentIndex = thrj * 16 + thri; int currentIndex = thrj * blockDim.x + thri; __shared__ volatile bool changed[256]; //__shared__ volatile bool changed[ blockDim.x*blockDim.y ]; __shared__ volatile bool changed[16*16]; changed[ currentIndex ] = false; if( thrj == 0 && thri == 0 ) Loading @@ -365,6 +392,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< hy = mesh.getSpaceSteps().y(); } //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ]; __shared__ volatile Real sArray[18][18]; sArray[thrj][thri] = TypeInfo< Real >::getMaxValue(); Loading Loading @@ -445,18 +473,23 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< } __syncthreads(); /*if( thri == 0 && thrj == 0 && blIdx == 1 && blIdy == 3 ){ for( int h = 15; h > -1; h-- ){ for( int g = 0; g < 16; g++ ){ printf( "%d\t", changed[h*16+g] ); //pyramid reduction if( blockDim.x*blockDim.y == 1024 ) { if( currentIndex < 512 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; } } printf("\n"); __syncthreads(); if( blockDim.x*blockDim.y >= 512 ) { if( currentIndex < 256 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; } printf("\n"); } __syncthreads();*/ //pyramid reduction __syncthreads(); if( blockDim.x*blockDim.y >= 256 ) { if( currentIndex < 128 ) Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +221 −16 Original line number Diff line number Diff line Loading @@ -255,13 +255,34 @@ solve( const MeshPointer& mesh, std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); Devices::Cuda::synchronizeDevice(); tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; for( int k = 0; k < numBlocksX; k++) int *BlockIterDevice; int BlockIterD = 1; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) ); int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0); int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) ); while( BlockIterD ) { CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>() ); auxPtr.template modifyData< Device>(), BlockIterDevice ); CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } cudaFree( BlockIterDevice ); cudaFree( dBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; Loading @@ -280,27 +301,211 @@ solve( const MeshPointer& mesh, template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux ) Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, int *BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z; int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; int k = blockDim.z*blockIdx.z + threadIdx.z; int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri; __shared__ volatile bool changed[8*8*8]; changed[ currentIndex ] = false; if( thrj == 0 && thri == 0 && thrk == 0 ) changed[ 0 ] = true; const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); __shared__ Real hx; __shared__ Real hy; __shared__ Real hz; if( thrj == 1 && thri == 1 && thrk == 1 ) { hx = mesh.getSpaceSteps().x(); hy = mesh.getSpaceSteps().y(); hz = mesh.getSpaceSteps().z(); } __shared__ volatile Real sArray[10][10][10]; sArray[thrk][thrj][thri] = TypeInfo< Real >::getMaxValue(); if(thri == 0 ) { sArray[8][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue(); sArray[9][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue(); sArray[thrk+1][thrj+1][8] = TypeInfo< Real >::getMaxValue(); sArray[thrk+1][thrj+1][9] = TypeInfo< Real >::getMaxValue(); sArray[thrj+1][8][thrk+1] = TypeInfo< Real >::getMaxValue(); sArray[thrj+1][9][thrk+1] = TypeInfo< Real >::getMaxValue(); } //filling sArray edges int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); int dimZ = mesh.getDimensions().z(); __shared__ volatile int numOfBlockx; __shared__ volatile int numOfBlocky; __shared__ volatile int numOfBlockz; __shared__ int xkolik; __shared__ int ykolik; __shared__ int zkolik; if( thri == 0 && thrj == 0 && thrk == 0 ) { xkolik = blockDim.x + 1; ykolik = blockDim.y + 1; zkolik = blockDim.z + 1; numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0); if( numOfBlockx - 1 == blIdx ) xkolik = dimX - (blIdx)*blockDim.x+1; if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*blockDim.y+1; if( numOfBlockz-1 == blIdz ) zkolik = dimZ - (blIdz)*blockDim.z+1; BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0; } __syncthreads(); if( thri == 0 ) { if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik ) sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ]; else sArray[thrk+1][thrj+1][0] = TypeInfo< Real >::getMaxValue(); } if( thri == 1 ) { if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik ) sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ]; else sArray[thrk+1][thrj+1][9] = TypeInfo< Real >::getMaxValue(); } if( thri == 2 ) { if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik ) sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ]; else sArray[thrk+1][0][thrj+1] = TypeInfo< Real >::getMaxValue(); } if( thri == 3 ) { if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik ) sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ]; else sArray[thrk+1][9][thrj+1] = TypeInfo< Real >::getMaxValue(); } if( thri == 4 ) { if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik ) sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ]; else sArray[0][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue(); } if( thri == 5 ) { if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik ) sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ]; else sArray[9][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue(); } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) { typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell; Cell cell( mesh ); cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k; cell.refresh(); //tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; for( int l = 0; l < 10; l++ ) sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ]; } __shared__ volatile int loopcounter; loopcounter = 0; __syncthreads(); while( changed[ 0 ] ) { if( ! interfaceMap( cell ) ) __syncthreads(); changed[ currentIndex ] = false; //calculation of update cell if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ ) { if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] ) { changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); } } __syncthreads(); //pyramid reduction if( blockDim.x*blockDim.y*blockDim.z == 1024 ) { if( currentIndex < 512 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; } } __syncthreads(); if( blockDim.x*blockDim.y*blockDim.z >= 512 ) { if( currentIndex < 256 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; } } __syncthreads(); if( blockDim.x*blockDim.y*blockDim.z >= 256 ) { if( currentIndex < 128 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; } } __syncthreads(); if( blockDim.x*blockDim.y*blockDim.z >= 128 ) { if( currentIndex < 64 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; } } __syncthreads(); if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU { if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; } __syncthreads(); /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0) { ptr.updateCell( aux, cell ); for(int m = 0; m < 8; m++){ for(int n = 0; n<8; n++){ for(int b=0; b<8; b++) printf(" %i ", changed[m*64 + n*8 + b]); printf("\n"); } printf("\n \n"); } }*/ if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 ) { //loopcounter++; BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1; } __syncthreads(); /*if(thri == 0 && thrj==0 && thrk==0) printf("%i \n",loopcounter); if(loopcounter == 500) break;*/ } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ k*dimX*dimY+j * mesh.getDimensions().x() + i ]) ) aux[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ]; } #endif Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +11 −3 Original line number Diff line number Diff line Loading @@ -101,6 +101,10 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > __cuda_callable__ void updateCell( MeshFunctionType& u, const MeshEntity& cell, const RealType velocity = 1.0); __cuda_callable__ bool updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz, const Real velocity = 1.0 ); }; template < typename T1, typename T2 > Loading @@ -125,10 +129,13 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *aux, int *BlockIterDevice); __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks ); template < typename Real, typename Device, typename Index > __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, Loading @@ -142,7 +149,8 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3 template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux ); Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, int *BlockIterDevice ); #endif #include "tnlDirectEikonalMethodsBase_impl.h"
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +74 −0 Original line number Diff line number Diff line Loading @@ -994,4 +994,78 @@ updateCell( volatile Real sArray[18], int thri, const Real h, const Real v ) else return false; } template< typename Real, typename Device, typename Index > __cuda_callable__ bool tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz, const Real v ) { const RealType value = sArray[thrk][thrj][thri]; //std::cout << value << std::endl; RealType a, b, c, tmp = TypeInfo< RealType >::getMaxValue(); c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ], sArray[ thrk-1 ][ thrj ][ thri ] ); b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ], sArray[ thrk ][ thrj-1 ][ thri ] ); a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ], sArray[ thrk ][ thrj ][ thri-1 ] ); if( fabs( a ) == TypeInfo< RealType >::getMaxValue() && fabs( b ) == TypeInfo< RealType >::getMaxValue() && fabs( c ) == TypeInfo< RealType >::getMaxValue() ) return false; RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; sortMinims( pom ); tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; if( fabs( tmp ) < fabs( pom[ 1 ] ) ) { sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); tmp = value - sArray[ thrk ][ thrj ][ thri ]; if ( fabs( tmp ) > 0.01*hx ) return true; else return false; } else { tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); if( fabs( tmp ) < fabs( pom[ 2 ]) ) { sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); tmp = value - sArray[ thrk ][ thrj ][ thri ]; if ( fabs( tmp ) > 0.01*hx ) return true; else return false; } else { tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); tmp = value - sArray[ thrk ][ thrj ][ thri ]; if ( fabs( tmp ) > 0.01*hx ) return true; else return false; } } return false; } #endif
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +53 −20 Original line number Diff line number Diff line Loading @@ -27,7 +27,7 @@ template< typename Real, typename Anisotropy > FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: FastSweepingMethod() : maxIterations( 1 ) : maxIterations( 100 ) { } Loading Loading @@ -236,6 +236,13 @@ solve( const MeshPointer& mesh, { // TODO: CUDA code #ifdef HAVE_CUDA Real *dAux; cudaMalloc(&dAux, ( mesh->getDimensions().x() * mesh->getDimensions().y() ) * sizeof( Real ) ); const int cudaBlockSize( 16 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); Loading @@ -244,7 +251,7 @@ solve( const MeshPointer& mesh, tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 ); //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) ); Loading @@ -263,7 +270,7 @@ solve( const MeshPointer& mesh, CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), dAux, BlockIterDevice ); CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); Loading @@ -275,14 +282,16 @@ solve( const MeshPointer& mesh, BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 ); cudaFree( dAux ); cudaFree( BlockIterDevice ); cudaFree( dBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; interfaceMap = *interfaceMapPtr; //aux = *auxPtr; //interfaceMap = *interfaceMapPtr; #endif } iteration++; Loading @@ -291,6 +300,23 @@ solve( const MeshPointer& mesh, } #ifdef HAVE_CUDA template < typename Real, typename Device, typename Index > __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a ) { int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 1 ) { dAux[ j*mesh.getDimensions().x() + i ] = aux[ j*mesh.getDimensions().x() + i ]; } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 0 ) { aux[ j*mesh.getDimensions().x() + i ] = dAux[ j*mesh.getDimensions().x() + i ]; } } __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks ) { int i = threadIdx.x; Loading Loading @@ -341,16 +367,17 @@ __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlock template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *aux, int *BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; int blIdx = blockIdx.x; int blIdy = blockIdx.y; int i = thri + blockDim.x*blIdx; int j = blockDim.y*blIdy + thrj; int currentIndex = thrj * 16 + thri; int currentIndex = thrj * blockDim.x + thri; __shared__ volatile bool changed[256]; //__shared__ volatile bool changed[ blockDim.x*blockDim.y ]; __shared__ volatile bool changed[16*16]; changed[ currentIndex ] = false; if( thrj == 0 && thri == 0 ) Loading @@ -365,6 +392,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< hy = mesh.getSpaceSteps().y(); } //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ]; __shared__ volatile Real sArray[18][18]; sArray[thrj][thri] = TypeInfo< Real >::getMaxValue(); Loading Loading @@ -445,18 +473,23 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< } __syncthreads(); /*if( thri == 0 && thrj == 0 && blIdx == 1 && blIdy == 3 ){ for( int h = 15; h > -1; h-- ){ for( int g = 0; g < 16; g++ ){ printf( "%d\t", changed[h*16+g] ); //pyramid reduction if( blockDim.x*blockDim.y == 1024 ) { if( currentIndex < 512 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; } } printf("\n"); __syncthreads(); if( blockDim.x*blockDim.y >= 512 ) { if( currentIndex < 256 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; } printf("\n"); } __syncthreads();*/ //pyramid reduction __syncthreads(); if( blockDim.x*blockDim.y >= 256 ) { if( currentIndex < 128 ) Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +221 −16 Original line number Diff line number Diff line Loading @@ -255,13 +255,34 @@ solve( const MeshPointer& mesh, std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); Devices::Cuda::synchronizeDevice(); tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; for( int k = 0; k < numBlocksX; k++) int *BlockIterDevice; int BlockIterD = 1; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) ); int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0); int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) ); while( BlockIterD ) { CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>() ); auxPtr.template modifyData< Device>(), BlockIterDevice ); CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } cudaFree( BlockIterDevice ); cudaFree( dBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; Loading @@ -280,27 +301,211 @@ solve( const MeshPointer& mesh, template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux ) Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, int *BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z; int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; int k = blockDim.z*blockIdx.z + threadIdx.z; int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri; __shared__ volatile bool changed[8*8*8]; changed[ currentIndex ] = false; if( thrj == 0 && thri == 0 && thrk == 0 ) changed[ 0 ] = true; const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); __shared__ Real hx; __shared__ Real hy; __shared__ Real hz; if( thrj == 1 && thri == 1 && thrk == 1 ) { hx = mesh.getSpaceSteps().x(); hy = mesh.getSpaceSteps().y(); hz = mesh.getSpaceSteps().z(); } __shared__ volatile Real sArray[10][10][10]; sArray[thrk][thrj][thri] = TypeInfo< Real >::getMaxValue(); if(thri == 0 ) { sArray[8][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue(); sArray[9][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue(); sArray[thrk+1][thrj+1][8] = TypeInfo< Real >::getMaxValue(); sArray[thrk+1][thrj+1][9] = TypeInfo< Real >::getMaxValue(); sArray[thrj+1][8][thrk+1] = TypeInfo< Real >::getMaxValue(); sArray[thrj+1][9][thrk+1] = TypeInfo< Real >::getMaxValue(); } //filling sArray edges int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); int dimZ = mesh.getDimensions().z(); __shared__ volatile int numOfBlockx; __shared__ volatile int numOfBlocky; __shared__ volatile int numOfBlockz; __shared__ int xkolik; __shared__ int ykolik; __shared__ int zkolik; if( thri == 0 && thrj == 0 && thrk == 0 ) { xkolik = blockDim.x + 1; ykolik = blockDim.y + 1; zkolik = blockDim.z + 1; numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0); if( numOfBlockx - 1 == blIdx ) xkolik = dimX - (blIdx)*blockDim.x+1; if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*blockDim.y+1; if( numOfBlockz-1 == blIdz ) zkolik = dimZ - (blIdz)*blockDim.z+1; BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0; } __syncthreads(); if( thri == 0 ) { if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik ) sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ]; else sArray[thrk+1][thrj+1][0] = TypeInfo< Real >::getMaxValue(); } if( thri == 1 ) { if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik ) sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ]; else sArray[thrk+1][thrj+1][9] = TypeInfo< Real >::getMaxValue(); } if( thri == 2 ) { if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik ) sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ]; else sArray[thrk+1][0][thrj+1] = TypeInfo< Real >::getMaxValue(); } if( thri == 3 ) { if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik ) sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ]; else sArray[thrk+1][9][thrj+1] = TypeInfo< Real >::getMaxValue(); } if( thri == 4 ) { if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik ) sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ]; else sArray[0][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue(); } if( thri == 5 ) { if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik ) sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ]; else sArray[9][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue(); } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) { typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell; Cell cell( mesh ); cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k; cell.refresh(); //tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; for( int l = 0; l < 10; l++ ) sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ]; } __shared__ volatile int loopcounter; loopcounter = 0; __syncthreads(); while( changed[ 0 ] ) { if( ! interfaceMap( cell ) ) __syncthreads(); changed[ currentIndex ] = false; //calculation of update cell if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ ) { if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] ) { changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); } } __syncthreads(); //pyramid reduction if( blockDim.x*blockDim.y*blockDim.z == 1024 ) { if( currentIndex < 512 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; } } __syncthreads(); if( blockDim.x*blockDim.y*blockDim.z >= 512 ) { if( currentIndex < 256 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; } } __syncthreads(); if( blockDim.x*blockDim.y*blockDim.z >= 256 ) { if( currentIndex < 128 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; } } __syncthreads(); if( blockDim.x*blockDim.y*blockDim.z >= 128 ) { if( currentIndex < 64 ) { changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; } } __syncthreads(); if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU { if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; } __syncthreads(); /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0) { ptr.updateCell( aux, cell ); for(int m = 0; m < 8; m++){ for(int n = 0; n<8; n++){ for(int b=0; b<8; b++) printf(" %i ", changed[m*64 + n*8 + b]); printf("\n"); } printf("\n \n"); } }*/ if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 ) { //loopcounter++; BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1; } __syncthreads(); /*if(thri == 0 && thrj==0 && thrk==0) printf("%i \n",loopcounter); if(loopcounter == 500) break;*/ } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ k*dimX*dimY+j * mesh.getDimensions().x() + i ]) ) aux[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ]; } #endif