Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +8 −2 Original line number Diff line number Diff line Loading @@ -160,11 +160,17 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3 Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ); template < typename Real, typename Device, typename Index > template < int sizeSArray, typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ); template < typename Index > __global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY, int numBlockZ ); #endif #include "tnlDirectEikonalMethodsBase_impl.h" src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +7 −9 Original line number Diff line number Diff line Loading @@ -85,7 +85,7 @@ solve( const MeshPointer& mesh, { if( std::is_same< DeviceType, Devices::Host >::value ) { int numThreadsPerBlock = 1024; int numThreadsPerBlock = 16; int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); Loading Loading @@ -115,13 +115,13 @@ solve( const MeshPointer& mesh, } std::cout<<std::endl;*/ unsigned int numWhile = 0; while( IsCalculationDone ) while( IsCalculationDone && numWhile < 1 ) { IsCalculationDone = 0; helpFunc1 = auxPtr; auxPtr = helpFunc; helpFunc = helpFunc1; this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); //Reduction for( int i = 0; i < BlockIterHost.getSize(); i++ ){ Loading Loading @@ -394,9 +394,7 @@ solve( const MeshPointer& mesh, numIter ++; } if( numIter == 1 ){ helpFunc1 = auxPtr; auxPtr = helpFunc; helpFunc = helpFunc1; } /*cudaFree( BlockIterDevice ); cudaFree( dBlock ); Loading Loading @@ -535,10 +533,10 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< __syncthreads(); /**-----------------------------------------*/ const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); __shared__ volatile int dimX; __shared__ volatile int dimY; __shared__ volatile Real hx; __shared__ volatile Real hy; __shared__ int dimX; __shared__ int dimY; __shared__ Real hx; __shared__ Real hy; if( thri==0 && thrj == 0) { dimX = mesh.getDimensions().x(); Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +463 −418 Original line number Diff line number Diff line Loading @@ -264,6 +264,9 @@ solve( const MeshPointer& mesh, TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ ); BlockIterDevice.setValue( 1 ); TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; BlockIterPom.setSize( numBlocksX * numBlocksY * numBlocksZ ); BlockIterPom.setValue( 0 ); /*int *BlockIterDevice; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/ int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0); Loading @@ -271,18 +274,38 @@ solve( const MeshPointer& mesh, TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; dBlock.setSize( nBlocks ); dBlock.setValue( 0 ); int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0); /*int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ MeshFunctionPointer helpFunc1( mesh ); MeshFunctionPointer helpFunc( mesh ); helpFunc1 = auxPtr; auxPtr = helpFunc; helpFunc = helpFunc1; int numIter = 0; while( BlockIterD ) { CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, helpFunc1 = auxPtr; auxPtr = helpFunc; helpFunc = helpFunc1; TNL_CHECK_CUDA_DEVICE; CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(), BlockIterDevice ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; GetNeighbours3D<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY, numBlocksZ ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; BlockIterDevice = BlockIterPom; CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; Loading @@ -291,11 +314,14 @@ solve( const MeshPointer& mesh, cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); numIter++; /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } if( numIter == 1 ){ auxPtr = helpFunc; } //cudaFree( BlockIterDevice ); //cudaFree( dBlock ); cudaDeviceSynchronize(); Loading @@ -313,10 +339,43 @@ solve( const MeshPointer& mesh, } #ifdef HAVE_CUDA template < typename Real, typename Device, typename Index > template < typename Index > __global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY, int numBlockZ ) { int i = blockIdx.x * 1024 + threadIdx.x; if( i < numBlockX * numBlockY * numBlockZ ) { int pom = 0;//BlockIterPom[ i ] = 0; int m=0, l=0, k=0; l = i/( numBlockX * numBlockY ); k = (i-l*numBlockX * numBlockY )/(numBlockX ); m = (i-l*numBlockX * numBlockY )%( numBlockX ); if( m > 0 && BlockIterDevice[ i - 1 ] ){ pom = 1;//BlockIterPom[ i ] = 1; }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ pom = 1;//BlockIterPom[ i ] = 1; }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ pom = 1;// BlockIterPom[ i ] = 1; }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ pom = 1;//BlockIterPom[ i ] = 1; }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){ pom = 1; }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){ pom = 1; } BlockIterPom[ i ] = pom;//BlockIterPom[ i ]; } } template < int sizeSArray, typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; Loading @@ -326,62 +385,54 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< int k = blockDim.z*blockIdx.z + threadIdx.z; int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri; __shared__ volatile bool changed[8*8*8]; changed[ currentIndex ] = false; if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] ) { __syncthreads(); __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)]; changed[ currentIndex ] = false; if( thrj == 0 && thri == 0 && thrk == 0 ) changed[ 0 ] = true; const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); __shared__ Real hx; __shared__ Real hy; __shared__ Real hz; __shared__ Real hx; __shared__ int dimX; __shared__ Real hy; __shared__ int dimY; __shared__ Real hz; __shared__ int dimZ; if( thrj == 1 && thri == 1 && thrk == 1 ) { hx = mesh.getSpaceSteps().x(); hy = mesh.getSpaceSteps().y(); hz = mesh.getSpaceSteps().z(); dimX = mesh.getDimensions().x(); dimY = mesh.getDimensions().y(); dimZ = mesh.getDimensions().z(); BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0; } __shared__ volatile Real sArray[10][10][10]; sArray[thrk][thrj][thri] = std::numeric_limits< Real >::max(); if(thri == 0 ) { sArray[8][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); sArray[thrk+1][thrj+1][8] = std::numeric_limits< Real >::max(); sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max(); sArray[thrj+1][8][thrk+1] = std::numeric_limits< Real >::max(); sArray[thrj+1][9][thrk+1] = std::numeric_limits< Real >::max(); } __shared__ volatile Real sArray[sizeSArray][sizeSArray][sizeSArray]; sArray[thrk+1][thrj+1][thri+1] = std::numeric_limits< Real >::max(); //filling sArray edges int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); int dimZ = mesh.getDimensions().z(); __shared__ volatile int numOfBlockx; __shared__ volatile int numOfBlocky; __shared__ volatile int numOfBlockz; __shared__ int xkolik; __shared__ int ykolik; __shared__ int zkolik; if( thri == 0 && thrj == 0 && thrk == 0 ) { int numOfBlockx; int numOfBlocky; int numOfBlockz; int xkolik; int ykolik; int zkolik; xkolik = blockDim.x + 1; ykolik = blockDim.y + 1; zkolik = blockDim.z + 1; numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0); numOfBlockx = gridDim.x; numOfBlocky = gridDim.y; numOfBlockz = gridDim.z; if( numOfBlockx - 1 == blIdx ) xkolik = dimX - (blIdx)*blockDim.x+1; if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*blockDim.y+1; if( numOfBlockz-1 == blIdz ) zkolik = dimZ - (blIdz)*blockDim.z+1; BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0; } __syncthreads(); if( thri == 0 ) Loading Loading @@ -430,12 +481,10 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) if( i < dimX && j < dimY && k < dimZ ) { sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ]; } __shared__ volatile int loopcounter; loopcounter = 0; __syncthreads(); while( changed[ 0 ] ) { Loading Loading @@ -510,17 +559,13 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< }*/ if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 ) { //loopcounter++; BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1; } __syncthreads(); /*if(thri == 0 && thrj==0 && thrk==0) printf("%i \n",loopcounter); if(loopcounter == 500) break;*/ } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ k*dimX*dimY+j * mesh.getDimensions().x() + i ]) ) aux[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ]; if( i < dimX && j < dimY && k < dimZ ) helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ]; } } #endif Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +8 −2 Original line number Diff line number Diff line Loading @@ -160,11 +160,17 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3 Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ); template < typename Real, typename Device, typename Index > template < int sizeSArray, typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ); template < typename Index > __global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY, int numBlockZ ); #endif #include "tnlDirectEikonalMethodsBase_impl.h"
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +7 −9 Original line number Diff line number Diff line Loading @@ -85,7 +85,7 @@ solve( const MeshPointer& mesh, { if( std::is_same< DeviceType, Devices::Host >::value ) { int numThreadsPerBlock = 1024; int numThreadsPerBlock = 16; int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); Loading Loading @@ -115,13 +115,13 @@ solve( const MeshPointer& mesh, } std::cout<<std::endl;*/ unsigned int numWhile = 0; while( IsCalculationDone ) while( IsCalculationDone && numWhile < 1 ) { IsCalculationDone = 0; helpFunc1 = auxPtr; auxPtr = helpFunc; helpFunc = helpFunc1; this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); //Reduction for( int i = 0; i < BlockIterHost.getSize(); i++ ){ Loading Loading @@ -394,9 +394,7 @@ solve( const MeshPointer& mesh, numIter ++; } if( numIter == 1 ){ helpFunc1 = auxPtr; auxPtr = helpFunc; helpFunc = helpFunc1; } /*cudaFree( BlockIterDevice ); cudaFree( dBlock ); Loading Loading @@ -535,10 +533,10 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< __syncthreads(); /**-----------------------------------------*/ const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); __shared__ volatile int dimX; __shared__ volatile int dimY; __shared__ volatile Real hx; __shared__ volatile Real hy; __shared__ int dimX; __shared__ int dimY; __shared__ Real hx; __shared__ Real hy; if( thri==0 && thrj == 0) { dimX = mesh.getDimensions().x(); Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +463 −418 Original line number Diff line number Diff line Loading @@ -264,6 +264,9 @@ solve( const MeshPointer& mesh, TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ ); BlockIterDevice.setValue( 1 ); TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; BlockIterPom.setSize( numBlocksX * numBlocksY * numBlocksZ ); BlockIterPom.setValue( 0 ); /*int *BlockIterDevice; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/ int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0); Loading @@ -271,18 +274,38 @@ solve( const MeshPointer& mesh, TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; dBlock.setSize( nBlocks ); dBlock.setValue( 0 ); int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0); /*int *dBlock; cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ MeshFunctionPointer helpFunc1( mesh ); MeshFunctionPointer helpFunc( mesh ); helpFunc1 = auxPtr; auxPtr = helpFunc; helpFunc = helpFunc1; int numIter = 0; while( BlockIterD ) { CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, helpFunc1 = auxPtr; auxPtr = helpFunc; helpFunc = helpFunc1; TNL_CHECK_CUDA_DEVICE; CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(), BlockIterDevice ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; GetNeighbours3D<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY, numBlocksZ ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; BlockIterDevice = BlockIterPom; CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; Loading @@ -291,11 +314,14 @@ solve( const MeshPointer& mesh, cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); numIter++; /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } if( numIter == 1 ){ auxPtr = helpFunc; } //cudaFree( BlockIterDevice ); //cudaFree( dBlock ); cudaDeviceSynchronize(); Loading @@ -313,10 +339,43 @@ solve( const MeshPointer& mesh, } #ifdef HAVE_CUDA template < typename Real, typename Device, typename Index > template < typename Index > __global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY, int numBlockZ ) { int i = blockIdx.x * 1024 + threadIdx.x; if( i < numBlockX * numBlockY * numBlockZ ) { int pom = 0;//BlockIterPom[ i ] = 0; int m=0, l=0, k=0; l = i/( numBlockX * numBlockY ); k = (i-l*numBlockX * numBlockY )/(numBlockX ); m = (i-l*numBlockX * numBlockY )%( numBlockX ); if( m > 0 && BlockIterDevice[ i - 1 ] ){ pom = 1;//BlockIterPom[ i ] = 1; }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ pom = 1;//BlockIterPom[ i ] = 1; }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ pom = 1;// BlockIterPom[ i ] = 1; }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ pom = 1;//BlockIterPom[ i ] = 1; }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){ pom = 1; }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){ pom = 1; } BlockIterPom[ i ] = pom;//BlockIterPom[ i ]; } } template < int sizeSArray, typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; Loading @@ -326,62 +385,54 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< int k = blockDim.z*blockIdx.z + threadIdx.z; int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri; __shared__ volatile bool changed[8*8*8]; changed[ currentIndex ] = false; if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] ) { __syncthreads(); __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)]; changed[ currentIndex ] = false; if( thrj == 0 && thri == 0 && thrk == 0 ) changed[ 0 ] = true; const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); __shared__ Real hx; __shared__ Real hy; __shared__ Real hz; __shared__ Real hx; __shared__ int dimX; __shared__ Real hy; __shared__ int dimY; __shared__ Real hz; __shared__ int dimZ; if( thrj == 1 && thri == 1 && thrk == 1 ) { hx = mesh.getSpaceSteps().x(); hy = mesh.getSpaceSteps().y(); hz = mesh.getSpaceSteps().z(); dimX = mesh.getDimensions().x(); dimY = mesh.getDimensions().y(); dimZ = mesh.getDimensions().z(); BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0; } __shared__ volatile Real sArray[10][10][10]; sArray[thrk][thrj][thri] = std::numeric_limits< Real >::max(); if(thri == 0 ) { sArray[8][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); sArray[thrk+1][thrj+1][8] = std::numeric_limits< Real >::max(); sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max(); sArray[thrj+1][8][thrk+1] = std::numeric_limits< Real >::max(); sArray[thrj+1][9][thrk+1] = std::numeric_limits< Real >::max(); } __shared__ volatile Real sArray[sizeSArray][sizeSArray][sizeSArray]; sArray[thrk+1][thrj+1][thri+1] = std::numeric_limits< Real >::max(); //filling sArray edges int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); int dimZ = mesh.getDimensions().z(); __shared__ volatile int numOfBlockx; __shared__ volatile int numOfBlocky; __shared__ volatile int numOfBlockz; __shared__ int xkolik; __shared__ int ykolik; __shared__ int zkolik; if( thri == 0 && thrj == 0 && thrk == 0 ) { int numOfBlockx; int numOfBlocky; int numOfBlockz; int xkolik; int ykolik; int zkolik; xkolik = blockDim.x + 1; ykolik = blockDim.y + 1; zkolik = blockDim.z + 1; numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0); numOfBlockx = gridDim.x; numOfBlocky = gridDim.y; numOfBlockz = gridDim.z; if( numOfBlockx - 1 == blIdx ) xkolik = dimX - (blIdx)*blockDim.x+1; if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*blockDim.y+1; if( numOfBlockz-1 == blIdz ) zkolik = dimZ - (blIdz)*blockDim.z+1; BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0; } __syncthreads(); if( thri == 0 ) Loading Loading @@ -430,12 +481,10 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) if( i < dimX && j < dimY && k < dimZ ) { sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ]; } __shared__ volatile int loopcounter; loopcounter = 0; __syncthreads(); while( changed[ 0 ] ) { Loading Loading @@ -510,17 +559,13 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< }*/ if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 ) { //loopcounter++; BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1; } __syncthreads(); /*if(thri == 0 && thrj==0 && thrk==0) printf("%i \n",loopcounter); if(loopcounter == 500) break;*/ } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ k*dimX*dimY+j * mesh.getDimensions().x() + i ]) ) aux[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ]; if( i < dimX && j < dimY && k < dimZ ) helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ]; } } #endif