Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +2 −2 Original line number Diff line number Diff line Loading @@ -69,7 +69,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > const MeshEntity& cell, const RealType velocity = 1.0 ); __cuda_callable__ bool updateCell( volatile Real sArray[18][18], __cuda_callable__ double updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy, const Real velocity = 1.0 ); Loading Loading @@ -117,7 +117,7 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, bool *BlockIterDevice ); Real *BlockIterDevice ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +15 −10 Original line number Diff line number Diff line Loading @@ -859,7 +859,7 @@ template< typename Real, typename Device, typename Index > __cuda_callable__ bool double tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy, const Real v ) Loading @@ -885,9 +885,9 @@ updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, con if( fabs( a ) == TypeInfo< RealType >::getMaxValue() && fabs( b ) == TypeInfo< RealType >::getMaxValue() ) return false; return 0; RealType pom[6] = { a, b, TypeInfo< Real >::getMaxValue(), (RealType)hx, (RealType)hy, TypeInfo< Real >::getMaxValue() }; RealType pom[6] = { a, b, TypeInfo< Real >::getMaxValue(), (RealType)hx, (RealType)hy, 0.0 }; sortMinims( pom ); tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; Loading @@ -895,9 +895,11 @@ updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, con if( fabs( tmp ) < fabs( pom[ 1 ] ) ) { sArray[ thrj ][ thri ] = argAbsMin( value, tmp ); if ( fabs( value - sArray[ thrj ][ thri ] ) > 0.001 ) return true; return false; tmp = value - sArray[ thrj ][ thri ]; if ( fabs( tmp ) > 0.1 ) return 10; else return 0; } else { Loading @@ -905,10 +907,13 @@ updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, con TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); sArray[ thrj ][ thri ] = argAbsMin( value, tmp ); if ( fabs( value - sArray[ thrj ][ thri ] ) > 0.001 ) return true; return false; tmp = value - sArray[ thrj ][ thri ]; if ( fabs( tmp ) > 0.1 ) return 10; else return 0; } return false; return 0; } #endif src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +135 −87 Original line number Diff line number Diff line Loading @@ -225,34 +225,41 @@ solve( const MeshPointer& mesh, Devices::Cuda::synchronizeDevice(); tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; int nBlockIter = numBlocksX > numBlocksY ? numBlocksX : numBlocksY; nBlockIter = numBlocksX == numBlocksY ? nBlockIter + 1 : nBlockIter; bool isNotDone = true; bool* BlockIter = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) ); bool *BlockIterDevice; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ) ); /*int nBlockIter = numBlocksX > numBlocksY ? numBlocksX : numBlocksY; nBlockIter = numBlocksX == numBlocksY ? nBlockIter + 1 : nBlockIter;*/ while( isNotDone ) bool isNotDone = true; numBlocksX = 16; numBlocksY = 16; double* BlockIter = (double*)malloc( ( numBlocksX * numBlocksY ) * sizeof( double ) ); double *BlockIterDevice; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( double ) ); //while( isNotDone ) { for( int i = 0; i < numBlocksX * numBlocksY; i++ ) BlockIter[ i ] = false; BlockIter[ i ] = 0.0; cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( double ), cudaMemcpyHostToDevice); isNotDone = false; CudaUpdateCellCaller<<< gridSize, blockSize, 18 * 18 * sizeof( Real ) >>>( ptr, CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice ); cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( double ), cudaMemcpyDeviceToHost); for( int j = numBlocksY-1; j > -1; j-- ) { for( int i = 0; i < numBlocksX; i++ ) { for( int j = 0; j < numBlocksY; j++ ) { if( BlockIter[ j * numBlocksY + i ] ) std::cout << "true." << "\t"; else std::cout << "false." << "\t"; //if( BlockIter[ j * numBlocksX + i ] ) std::cout << BlockIter[ j * numBlocksY + i ] << "\t"; //else // std::cout << "0" << " "; } std::cout << std::endl; } Loading Loading @@ -282,76 +289,99 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, bool *BlockIterDevice ) Real *BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; // nelze ke stejnym pristupovat znovu pres threadIdx (vzdy da jine hodnoty) int thri = threadIdx.x; int thrj = threadIdx.y; int blIdx = blockIdx.x; int blIdy = blockIdx.y; int i = thri + blockDim.x*blIdx; int j = blockDim.y*blIdy + thrj; int currentIndex = thrj * 16 + thri; __shared__ volatile Real changed[256]; changed[ currentIndex ] = 0.0; __syncthreads(); __shared__ volatile bool changed[256]; changed[ thrj * blockDim.x + thri ] = false; __shared__ volatile bool SmallCycleBoolin; if( thrj == 0 && thri == 0 ) SmallCycleBoolin = true; changed[ 0 ] = 10.0; __syncthreads(); //SmallCycleBoolin = true; const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); const Real hx = mesh.getSpaceSteps().x(); const Real hy = mesh.getSpaceSteps().y(); __shared__ Real hx; __shared__ Real hy; if( thrj == 0 && thri == 0 ) { hx = mesh.getSpaceSteps().x(); hy = mesh.getSpaceSteps().y(); } __syncthreads(); __shared__ volatile Real sArray[ 18 ][ 18 ]; sArray[thrj][thri] = TypeInfo< Real >::getMaxValue(); __syncthreads(); //filling sArray edges int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0); int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0); int xkolik = 17; int ykolik = 17; __shared__ volatile int numOfBlockx; __shared__ volatile int numOfBlocky; __shared__ int xkolik; __shared__ int ykolik; if( thri == 0 && thrj == 0 ) { xkolik = blockDim.x + 1; ykolik = blockDim.y + 1; numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); if( numOfBlockx - 1 == blIdx ) xkolik = dimX - (blIdx)*16+1; xkolik = dimX - (blIdx)*blockDim.x+1; if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*16+1; ykolik = dimY - (blIdy)*blockDim.y+1; //filling BlockIterDevice //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0.0; } /*if( blIdx == 2 && blIdy == 3 ) BlockIterDevice[ currentIndex ] = 0.0;*/ __syncthreads(); if( thri == 0 ) { if( dimX > (blIdx+1) * 16 && thrj+1 < ykolik ) sArray[thrj+1][xkolik] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + (thrj+1)*dimX + 17 ]; if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik ) sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ]; else sArray[thrj+1][xkolik] = TypeInfo< Real >::getMaxValue(); } __syncthreads(); if( thri == 1 ) { if( blIdx != 0 && thrj+1 < ykolik ) sArray[thrj+1][0] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + (thrj+1)*dimX + 0]; sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ]; else sArray[thrj+1][0] = TypeInfo< Real >::getMaxValue(); } __syncthreads(); if( thrj == 0 ) { if( dimY > (blIdy+1) * 16 && thri+1 < xkolik ) sArray[ykolik][thri+1] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + 17*dimX + thri+1 ]; if( dimY > (blIdy+1) * blockDim.y && thri+1 < xkolik ) sArray[ykolik][thri+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thri+1 ]; else sArray[ykolik][thri+1] = TypeInfo< Real >::getMaxValue(); } __syncthreads(); if( thrj == 1 ) { if( blIdy != 0 && thri+1 < xkolik ) sArray[0][thri+1] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + 0*dimX + thri+1 ]; sArray[0][thri+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thri+1 ]; else sArray[0][thri+1] = TypeInfo< Real >::getMaxValue(); } //filling BlockIterDevice BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = false; __syncthreads(); if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) Loading @@ -360,16 +390,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< __syncthreads(); } __shared__ int loopcounter; if( thri == 0 && thrj == 0 ) int loopcounter; loopcounter = 0; //if( blIdx == 5 && blIdy == 4 ) while( SmallCycleBoolin ) { if( thri == 1 && thrj == 1 ) SmallCycleBoolin = false; __syncthreads(); changed[ thrj * 16 + thri ] = false; /*if( ( blIdx == 4 && blIdy == 5 ) || ( blIdx == 2 && blIdy == 2 ) || ( blIdx == 2 && blIdy == 3 ) || ( blIdx == 5 && blIdy == 5 ) || ( blIdx == 6 && blIdy == 5 ) || ( blIdx == 5 && blIdy == 4 ) || ( blIdx == 5 && blIdy == 2 ) || ( blIdx == 3 && blIdy == 2 ) || ( blIdx == 4 && blIdy == 2 ) )*/ while( changed[ 0 ] > 0.1 ) { changed[ currentIndex] = 0.0; __syncthreads(); //calculation of update cell Loading @@ -377,53 +407,71 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< { if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] ) { changed[ thrj * 16 + thri ] = ptr.updateCell( sArray, thri+1, thrj+1, hx, hy ); changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx, hy ); __syncthreads(); } } __syncthreads(); //pyramid reduction if( blockDim.x*blockDim.y >= 256 ) { if( (thrj * 16 + thri) < 128 ) if( currentIndex < 128 ) { changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 128 ]; changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 128 ]; } __syncthreads(); } __syncthreads(); if( blockDim.x*blockDim.y >= 128 ) { if( (thrj * 16 + thri) < 64 ) if( currentIndex < 64 ) { changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 64 ]; changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 64 ]; } __syncthreads(); } if( (thrj * 16 + thri) < 32 ) __syncthreads(); if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU { changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 32 ]; changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 16 ]; changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 8 ]; changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 4 ]; changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 2 ]; changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 1 ]; if( true ) changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 32 ]; if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 16 ]; if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 8 ]; if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 4 ]; if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 2 ]; if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 1 ]; } __syncthreads(); if( thrj == 1 && thri == 1 ) { loopcounter++; if( loopcounter > 1000 ) break; SmallCycleBoolin = changed[ 0 ]; //if( SmallCycleBoolin ) BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = SmallCycleBoolin; __syncthreads(); if( currentIndex != 0 ) changed[ currentIndex ] = 0.0; __syncthreads(); if( loopcounter > 200 ) changed[ currentIndex ] = 0.0; __syncthreads(); } /*__syncthreads(); if( blIdy == 3 && blIdx == 1 ) { BlockIterDevice[ currentIndex ] = changed[ currentIndex ]; BlockIterDevice[ 0 ] = loopcounter; }*/ __syncthreads(); if( blIdx == 2 && blIdy == 3 ) { //SmallCycleBoolin = changed[ 0 ]; //if( changed[ 0 ] ) BlockIterDevice[ currentIndex ] = changed[ currentIndex ];// loopcounter; BlockIterDevice[ 0 ] = loopcounter; } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) ) aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; __syncthreads(); } #endif Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +2 −2 Original line number Diff line number Diff line Loading @@ -69,7 +69,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > const MeshEntity& cell, const RealType velocity = 1.0 ); __cuda_callable__ bool updateCell( volatile Real sArray[18][18], __cuda_callable__ double updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy, const Real velocity = 1.0 ); Loading Loading @@ -117,7 +117,7 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, bool *BlockIterDevice ); Real *BlockIterDevice ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +15 −10 Original line number Diff line number Diff line Loading @@ -859,7 +859,7 @@ template< typename Real, typename Device, typename Index > __cuda_callable__ bool double tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy, const Real v ) Loading @@ -885,9 +885,9 @@ updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, con if( fabs( a ) == TypeInfo< RealType >::getMaxValue() && fabs( b ) == TypeInfo< RealType >::getMaxValue() ) return false; return 0; RealType pom[6] = { a, b, TypeInfo< Real >::getMaxValue(), (RealType)hx, (RealType)hy, TypeInfo< Real >::getMaxValue() }; RealType pom[6] = { a, b, TypeInfo< Real >::getMaxValue(), (RealType)hx, (RealType)hy, 0.0 }; sortMinims( pom ); tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; Loading @@ -895,9 +895,11 @@ updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, con if( fabs( tmp ) < fabs( pom[ 1 ] ) ) { sArray[ thrj ][ thri ] = argAbsMin( value, tmp ); if ( fabs( value - sArray[ thrj ][ thri ] ) > 0.001 ) return true; return false; tmp = value - sArray[ thrj ][ thri ]; if ( fabs( tmp ) > 0.1 ) return 10; else return 0; } else { Loading @@ -905,10 +907,13 @@ updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, con TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); sArray[ thrj ][ thri ] = argAbsMin( value, tmp ); if ( fabs( value - sArray[ thrj ][ thri ] ) > 0.001 ) return true; return false; tmp = value - sArray[ thrj ][ thri ]; if ( fabs( tmp ) > 0.1 ) return 10; else return 0; } return false; return 0; } #endif
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +135 −87 Original line number Diff line number Diff line Loading @@ -225,34 +225,41 @@ solve( const MeshPointer& mesh, Devices::Cuda::synchronizeDevice(); tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; int nBlockIter = numBlocksX > numBlocksY ? numBlocksX : numBlocksY; nBlockIter = numBlocksX == numBlocksY ? nBlockIter + 1 : nBlockIter; bool isNotDone = true; bool* BlockIter = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) ); bool *BlockIterDevice; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ) ); /*int nBlockIter = numBlocksX > numBlocksY ? numBlocksX : numBlocksY; nBlockIter = numBlocksX == numBlocksY ? nBlockIter + 1 : nBlockIter;*/ while( isNotDone ) bool isNotDone = true; numBlocksX = 16; numBlocksY = 16; double* BlockIter = (double*)malloc( ( numBlocksX * numBlocksY ) * sizeof( double ) ); double *BlockIterDevice; cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( double ) ); //while( isNotDone ) { for( int i = 0; i < numBlocksX * numBlocksY; i++ ) BlockIter[ i ] = false; BlockIter[ i ] = 0.0; cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( double ), cudaMemcpyHostToDevice); isNotDone = false; CudaUpdateCellCaller<<< gridSize, blockSize, 18 * 18 * sizeof( Real ) >>>( ptr, CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice ); cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( double ), cudaMemcpyDeviceToHost); for( int j = numBlocksY-1; j > -1; j-- ) { for( int i = 0; i < numBlocksX; i++ ) { for( int j = 0; j < numBlocksY; j++ ) { if( BlockIter[ j * numBlocksY + i ] ) std::cout << "true." << "\t"; else std::cout << "false." << "\t"; //if( BlockIter[ j * numBlocksX + i ] ) std::cout << BlockIter[ j * numBlocksY + i ] << "\t"; //else // std::cout << "0" << " "; } std::cout << std::endl; } Loading Loading @@ -282,76 +289,99 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, bool *BlockIterDevice ) Real *BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; // nelze ke stejnym pristupovat znovu pres threadIdx (vzdy da jine hodnoty) int thri = threadIdx.x; int thrj = threadIdx.y; int blIdx = blockIdx.x; int blIdy = blockIdx.y; int i = thri + blockDim.x*blIdx; int j = blockDim.y*blIdy + thrj; int currentIndex = thrj * 16 + thri; __shared__ volatile Real changed[256]; changed[ currentIndex ] = 0.0; __syncthreads(); __shared__ volatile bool changed[256]; changed[ thrj * blockDim.x + thri ] = false; __shared__ volatile bool SmallCycleBoolin; if( thrj == 0 && thri == 0 ) SmallCycleBoolin = true; changed[ 0 ] = 10.0; __syncthreads(); //SmallCycleBoolin = true; const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); const Real hx = mesh.getSpaceSteps().x(); const Real hy = mesh.getSpaceSteps().y(); __shared__ Real hx; __shared__ Real hy; if( thrj == 0 && thri == 0 ) { hx = mesh.getSpaceSteps().x(); hy = mesh.getSpaceSteps().y(); } __syncthreads(); __shared__ volatile Real sArray[ 18 ][ 18 ]; sArray[thrj][thri] = TypeInfo< Real >::getMaxValue(); __syncthreads(); //filling sArray edges int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0); int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0); int xkolik = 17; int ykolik = 17; __shared__ volatile int numOfBlockx; __shared__ volatile int numOfBlocky; __shared__ int xkolik; __shared__ int ykolik; if( thri == 0 && thrj == 0 ) { xkolik = blockDim.x + 1; ykolik = blockDim.y + 1; numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); if( numOfBlockx - 1 == blIdx ) xkolik = dimX - (blIdx)*16+1; xkolik = dimX - (blIdx)*blockDim.x+1; if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*16+1; ykolik = dimY - (blIdy)*blockDim.y+1; //filling BlockIterDevice //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0.0; } /*if( blIdx == 2 && blIdy == 3 ) BlockIterDevice[ currentIndex ] = 0.0;*/ __syncthreads(); if( thri == 0 ) { if( dimX > (blIdx+1) * 16 && thrj+1 < ykolik ) sArray[thrj+1][xkolik] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + (thrj+1)*dimX + 17 ]; if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik ) sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ]; else sArray[thrj+1][xkolik] = TypeInfo< Real >::getMaxValue(); } __syncthreads(); if( thri == 1 ) { if( blIdx != 0 && thrj+1 < ykolik ) sArray[thrj+1][0] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + (thrj+1)*dimX + 0]; sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ]; else sArray[thrj+1][0] = TypeInfo< Real >::getMaxValue(); } __syncthreads(); if( thrj == 0 ) { if( dimY > (blIdy+1) * 16 && thri+1 < xkolik ) sArray[ykolik][thri+1] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + 17*dimX + thri+1 ]; if( dimY > (blIdy+1) * blockDim.y && thri+1 < xkolik ) sArray[ykolik][thri+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thri+1 ]; else sArray[ykolik][thri+1] = TypeInfo< Real >::getMaxValue(); } __syncthreads(); if( thrj == 1 ) { if( blIdy != 0 && thri+1 < xkolik ) sArray[0][thri+1] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + 0*dimX + thri+1 ]; sArray[0][thri+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thri+1 ]; else sArray[0][thri+1] = TypeInfo< Real >::getMaxValue(); } //filling BlockIterDevice BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = false; __syncthreads(); if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) Loading @@ -360,16 +390,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< __syncthreads(); } __shared__ int loopcounter; if( thri == 0 && thrj == 0 ) int loopcounter; loopcounter = 0; //if( blIdx == 5 && blIdy == 4 ) while( SmallCycleBoolin ) { if( thri == 1 && thrj == 1 ) SmallCycleBoolin = false; __syncthreads(); changed[ thrj * 16 + thri ] = false; /*if( ( blIdx == 4 && blIdy == 5 ) || ( blIdx == 2 && blIdy == 2 ) || ( blIdx == 2 && blIdy == 3 ) || ( blIdx == 5 && blIdy == 5 ) || ( blIdx == 6 && blIdy == 5 ) || ( blIdx == 5 && blIdy == 4 ) || ( blIdx == 5 && blIdy == 2 ) || ( blIdx == 3 && blIdy == 2 ) || ( blIdx == 4 && blIdy == 2 ) )*/ while( changed[ 0 ] > 0.1 ) { changed[ currentIndex] = 0.0; __syncthreads(); //calculation of update cell Loading @@ -377,53 +407,71 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< { if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] ) { changed[ thrj * 16 + thri ] = ptr.updateCell( sArray, thri+1, thrj+1, hx, hy ); changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx, hy ); __syncthreads(); } } __syncthreads(); //pyramid reduction if( blockDim.x*blockDim.y >= 256 ) { if( (thrj * 16 + thri) < 128 ) if( currentIndex < 128 ) { changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 128 ]; changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 128 ]; } __syncthreads(); } __syncthreads(); if( blockDim.x*blockDim.y >= 128 ) { if( (thrj * 16 + thri) < 64 ) if( currentIndex < 64 ) { changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 64 ]; changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 64 ]; } __syncthreads(); } if( (thrj * 16 + thri) < 32 ) __syncthreads(); if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU { changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 32 ]; changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 16 ]; changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 8 ]; changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 4 ]; changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 2 ]; changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 1 ]; if( true ) changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 32 ]; if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 16 ]; if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 8 ]; if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 4 ]; if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 2 ]; if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] + changed[ currentIndex + 1 ]; } __syncthreads(); if( thrj == 1 && thri == 1 ) { loopcounter++; if( loopcounter > 1000 ) break; SmallCycleBoolin = changed[ 0 ]; //if( SmallCycleBoolin ) BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = SmallCycleBoolin; __syncthreads(); if( currentIndex != 0 ) changed[ currentIndex ] = 0.0; __syncthreads(); if( loopcounter > 200 ) changed[ currentIndex ] = 0.0; __syncthreads(); } /*__syncthreads(); if( blIdy == 3 && blIdx == 1 ) { BlockIterDevice[ currentIndex ] = changed[ currentIndex ]; BlockIterDevice[ 0 ] = loopcounter; }*/ __syncthreads(); if( blIdx == 2 && blIdy == 3 ) { //SmallCycleBoolin = changed[ 0 ]; //if( changed[ 0 ] ) BlockIterDevice[ currentIndex ] = changed[ currentIndex ];// loopcounter; BlockIterDevice[ 0 ] = loopcounter; } if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) ) aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; __syncthreads(); } #endif