Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +1 −1 Original line number Diff line number Diff line Loading @@ -148,7 +148,7 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I template < typename Index > __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY ); TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +36 −36 Original line number Diff line number Diff line Loading @@ -134,6 +134,7 @@ updateBlocks( InterfaceMapType interfaceMap, Real hy = mesh.getSpaceSteps().y(); bool changed = false; BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0; Real *sArray; Loading @@ -143,41 +144,40 @@ updateBlocks( InterfaceMapType interfaceMap, for( int thri = 0; thri < sizeSArray; thri++ ){ for( int thrj = 0; thrj < sizeSArray; thrj++ ) sArray/*[i]*/[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max(); sArray[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max(); } BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0; for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ ) { if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik ) sArray/*[i]*/[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ]; sArray[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ]; if( blIdx != 0 && thrj+1 < ykolik ) sArray/*[i]*/[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ]; sArray[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ]; if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik ) sArray/*[i]*/[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ]; sArray[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ]; if( blIdy != 0 && thrj+1 < xkolik ) sArray/*[i]*/[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ]; sArray[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ]; } for( int k = 0; k < numThreadsPerBlock; k++ ){ for( int l = 0; l < numThreadsPerBlock; l++ ) if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) sArray/*[i]*/[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]; sArray[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]; } bool pom = false; for( int k = 0; k < numThreadsPerBlock; k++ ){ for( int l = 0; l < numThreadsPerBlock; l++ ){ if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){ //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl; if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) { pom = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy); changed = changed || pom; changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy) || changed; } } } Loading Loading @@ -254,7 +254,7 @@ updateBlocks( InterfaceMapType interfaceMap, for( int k = 0; k < numThreadsPerBlock; k++ ){ for( int l = 0; l < numThreadsPerBlock; l++ ) { if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray/*[i]*/[ (k + 1)* sizeSArray + l + 1 ]; helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray[ (k + 1)* sizeSArray + l + 1 ]; //std::cout<< sArray[k+1][l+1]; } //std::cout<<std::endl; Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +88 −77 Original line number Diff line number Diff line Loading @@ -123,6 +123,7 @@ solve( const MeshPointer& mesh, helpFunc = helpFunc1; this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); //Reduction for( int i = 0; i < BlockIterHost.getSize(); i++ ){ if( IsCalculationDone == 0 ){ IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; Loading @@ -130,6 +131,7 @@ solve( const MeshPointer& mesh, } } numWhile++; std::cout <<"numWhile = "<< numWhile <<std::endl; for( int j = numBlocksY-1; j>-1; j-- ){ for( int i = 0; i < numBlocksX; i++ ) Loading @@ -146,7 +148,6 @@ solve( const MeshPointer& mesh, std::cout << std::endl; } std::cout << std::endl;*/ //Reduction //std::cout<<std::endl; string s( "aux-"+ std::to_string(numWhile) + ".tnl"); Loading Loading @@ -261,9 +262,9 @@ solve( const MeshPointer& mesh, TNL_CHECK_CUDA_DEVICE; /*TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; BlockIterPom.setSize( numBlocksX * numBlocksY ); BlockIterPom.setValue( 0 );*/ BlockIterPom.setValue( 0 ); /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1; BlockIterPom1.setSize( numBlocksX * numBlocksY ); BlockIterPom1.setValue( 0 );*/ Loading @@ -284,9 +285,7 @@ solve( const MeshPointer& mesh, cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/ MeshFunctionPointer helpFunc1; helpFunc1->setMesh(mesh); MeshFunctionPointer helpFunc1( mesh ); MeshFunctionPointer helpFunc( mesh ); helpFunc1 = auxPtr; Loading Loading @@ -343,6 +342,7 @@ solve( const MeshPointer& mesh, helpFunc1 = auxPtr; auxPtr = helpFunc; helpFunc = helpFunc1; TNL_CHECK_CUDA_DEVICE; //int pocBloku = 0; Devices::Cuda::synchronizeDevice(); Loading @@ -355,10 +355,20 @@ solve( const MeshPointer& mesh, TNL_CHECK_CUDA_DEVICE; //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl; //BlockIterPom1 = BlockIterDevice; ///for( int i =0; i< numBlocksX; i++ ){ // for( int j = 0; j < numBlocksY; j++ ) // { // std::cout << BlockIterPom1[j*numBlocksX + i]; // } // std::cout << std::endl; //} //std::cout << std::endl; GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, numBlocksX, numBlocksY ); GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; BlockIterDevice = BlockIterPom; //std::cout<< "Probehlo" << std::endl; Loading Loading @@ -392,7 +402,6 @@ solve( const MeshPointer& mesh, cudaFree( dBlock ); delete BlockIter;*/ cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; Loading @@ -410,7 +419,7 @@ solve( const MeshPointer& mesh, template < typename Index > __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY ) TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY ) { int i = blockIdx.x * 1024 + threadIdx.x; Loading @@ -430,7 +439,7 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index pom = 1;//BlockIterPom[ i ] = 1; } BlockIterDevice[ i ] = pom;//BlockIterPom[ i ]; BlockIterPom[ i ] = pom;//BlockIterPom[ i ]; } } Loading Loading @@ -514,14 +523,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; /** FOR CHESS METHOD */ if( (blockIdx.y%2 + blockIdx.x) % 2 == oddEvenBlock ) { /**-----------------------------------------*/ //if( (blockIdx.y%2 + blockIdx.x) % 2 == oddEvenBlock ) //{ /**------------------------------------------*/ /** FOR FIM METHOD */ /*if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x] ) {*/ if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] ) { __syncthreads(); /**-----------------------------------------*/ const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); __shared__ volatile int dimX; Loading Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +1 −1 Original line number Diff line number Diff line Loading @@ -148,7 +148,7 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I template < typename Index > __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY ); TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +36 −36 Original line number Diff line number Diff line Loading @@ -134,6 +134,7 @@ updateBlocks( InterfaceMapType interfaceMap, Real hy = mesh.getSpaceSteps().y(); bool changed = false; BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0; Real *sArray; Loading @@ -143,41 +144,40 @@ updateBlocks( InterfaceMapType interfaceMap, for( int thri = 0; thri < sizeSArray; thri++ ){ for( int thrj = 0; thrj < sizeSArray; thrj++ ) sArray/*[i]*/[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max(); sArray[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max(); } BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0; for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ ) { if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik ) sArray/*[i]*/[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ]; sArray[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ]; if( blIdx != 0 && thrj+1 < ykolik ) sArray/*[i]*/[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ]; sArray[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ]; if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik ) sArray/*[i]*/[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ]; sArray[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ]; if( blIdy != 0 && thrj+1 < xkolik ) sArray/*[i]*/[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ]; sArray[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ]; } for( int k = 0; k < numThreadsPerBlock; k++ ){ for( int l = 0; l < numThreadsPerBlock; l++ ) if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) sArray/*[i]*/[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]; sArray[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]; } bool pom = false; for( int k = 0; k < numThreadsPerBlock; k++ ){ for( int l = 0; l < numThreadsPerBlock; l++ ){ if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){ //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl; if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) { pom = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy); changed = changed || pom; changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy) || changed; } } } Loading Loading @@ -254,7 +254,7 @@ updateBlocks( InterfaceMapType interfaceMap, for( int k = 0; k < numThreadsPerBlock; k++ ){ for( int l = 0; l < numThreadsPerBlock; l++ ) { if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray/*[i]*/[ (k + 1)* sizeSArray + l + 1 ]; helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray[ (k + 1)* sizeSArray + l + 1 ]; //std::cout<< sArray[k+1][l+1]; } //std::cout<<std::endl; Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +88 −77 Original line number Diff line number Diff line Loading @@ -123,6 +123,7 @@ solve( const MeshPointer& mesh, helpFunc = helpFunc1; this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); //Reduction for( int i = 0; i < BlockIterHost.getSize(); i++ ){ if( IsCalculationDone == 0 ){ IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; Loading @@ -130,6 +131,7 @@ solve( const MeshPointer& mesh, } } numWhile++; std::cout <<"numWhile = "<< numWhile <<std::endl; for( int j = numBlocksY-1; j>-1; j-- ){ for( int i = 0; i < numBlocksX; i++ ) Loading @@ -146,7 +148,6 @@ solve( const MeshPointer& mesh, std::cout << std::endl; } std::cout << std::endl;*/ //Reduction //std::cout<<std::endl; string s( "aux-"+ std::to_string(numWhile) + ".tnl"); Loading Loading @@ -261,9 +262,9 @@ solve( const MeshPointer& mesh, TNL_CHECK_CUDA_DEVICE; /*TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; BlockIterPom.setSize( numBlocksX * numBlocksY ); BlockIterPom.setValue( 0 );*/ BlockIterPom.setValue( 0 ); /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1; BlockIterPom1.setSize( numBlocksX * numBlocksY ); BlockIterPom1.setValue( 0 );*/ Loading @@ -284,9 +285,7 @@ solve( const MeshPointer& mesh, cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/ MeshFunctionPointer helpFunc1; helpFunc1->setMesh(mesh); MeshFunctionPointer helpFunc1( mesh ); MeshFunctionPointer helpFunc( mesh ); helpFunc1 = auxPtr; Loading Loading @@ -343,6 +342,7 @@ solve( const MeshPointer& mesh, helpFunc1 = auxPtr; auxPtr = helpFunc; helpFunc = helpFunc1; TNL_CHECK_CUDA_DEVICE; //int pocBloku = 0; Devices::Cuda::synchronizeDevice(); Loading @@ -355,10 +355,20 @@ solve( const MeshPointer& mesh, TNL_CHECK_CUDA_DEVICE; //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl; //BlockIterPom1 = BlockIterDevice; ///for( int i =0; i< numBlocksX; i++ ){ // for( int j = 0; j < numBlocksY; j++ ) // { // std::cout << BlockIterPom1[j*numBlocksX + i]; // } // std::cout << std::endl; //} //std::cout << std::endl; GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, numBlocksX, numBlocksY ); GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; BlockIterDevice = BlockIterPom; //std::cout<< "Probehlo" << std::endl; Loading Loading @@ -392,7 +402,6 @@ solve( const MeshPointer& mesh, cudaFree( dBlock ); delete BlockIter;*/ cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; Loading @@ -410,7 +419,7 @@ solve( const MeshPointer& mesh, template < typename Index > __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY ) TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY ) { int i = blockIdx.x * 1024 + threadIdx.x; Loading @@ -430,7 +439,7 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index pom = 1;//BlockIterPom[ i ] = 1; } BlockIterDevice[ i ] = pom;//BlockIterPom[ i ]; BlockIterPom[ i ] = pom;//BlockIterPom[ i ]; } } Loading Loading @@ -514,14 +523,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; /** FOR CHESS METHOD */ if( (blockIdx.y%2 + blockIdx.x) % 2 == oddEvenBlock ) { /**-----------------------------------------*/ //if( (blockIdx.y%2 + blockIdx.x) % 2 == oddEvenBlock ) //{ /**------------------------------------------*/ /** FOR FIM METHOD */ /*if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x] ) {*/ if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] ) { __syncthreads(); /**-----------------------------------------*/ const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); __shared__ volatile int dimX; Loading