Loading src/mesh/grids/tnlTraverser_Grid2D_impl.h +7 −7 Original line number Diff line number Diff line Loading @@ -271,16 +271,16 @@ __global__ void tnlTraverserGrid2DCells( const tnlGrid< 2, Real, tnlCuda, Index const int CellDimensions = GridType::Dimensions; typename GridType::template GridEntity< CellDimensions > entity( *grid ); typedef typename GridType::CoordinatesType CoordinatesType; CoordinatesType& coordinates = entity.getCoordinates(); //CoordinatesType& coordinates = entity.getCoordinates(); const Index& xSize = grid->getDimensions().x(); const Index& ySize = grid->getDimensions().y(); /*const Index& xSize = grid->getDimensions().x(); const Index& ySize = grid->getDimensions().y();*/ coordinates.x() = ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.y() = ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; entity.getCoordinates().x() = ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; entity.getCoordinates().y() = ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( coordinates.x() < grid->getDimensions().x() && coordinates.y() < grid->getDimensions().y() ) if( entity.getCoordinates().x() < grid->getDimensions().x() && entity.getCoordinates().y() < grid->getDimensions().y() ) { entity.setIndex( grid->getEntityIndex( entity ) ); if( processAllEntities || entity.isBoundaryEntity() == processBoundaryEntities ) Loading src/solvers/ode/tnlEulerSolver_impl.h +0 −1 Original line number Diff line number Diff line Loading @@ -158,7 +158,6 @@ bool tnlEulerSolver< Problem > :: solve( DofVectorType& u ) currentTau = Min( currentTau, this->getMaxTau() ); } } return false; }; template< typename Problem > Loading tests/benchmarks/tnl-benchmark-simple-heat-equation.h +265 −30 Original line number Diff line number Diff line Loading @@ -28,20 +28,178 @@ using namespace std; #ifdef HAVE_CUDA template< typename Real, typename Index > __device__ void computeBlockResidue( Real* du, Real* blockResidue, Index n ) { if( n == 128 || n == 64 || n == 32 || n == 16 || n == 8 || n == 4 || n == 2 || n == 256 || n == 512 ) { if( blockDim.x >= 512 ) { if( threadIdx.x < 256 ) du[ threadIdx.x ] += du[ threadIdx.x + 256 ]; __syncthreads(); } if( blockDim.x >= 256 ) { if( threadIdx.x < 128 ) du[ threadIdx.x ] += du[ threadIdx.x + 128 ]; __syncthreads(); } if( blockDim.x >= 128 ) { if( threadIdx.x < 64 ) du[ threadIdx.x ] += du[ threadIdx.x + 64 ]; __syncthreads(); } /*** * This runs in one warp so it is synchronized implicitly. */ if ( threadIdx.x < 32) { if( blockDim.x >= 64 ) du[ threadIdx.x ] += du[ threadIdx.x + 32 ]; if( blockDim.x >= 32 ) du[ threadIdx.x ] += du[ threadIdx.x + 16 ]; if( blockDim.x >= 16 ) du[ threadIdx.x ] += du[ threadIdx.x + 8 ]; if( blockDim.x >= 8 ) du[ threadIdx.x ] += du[ threadIdx.x + 4 ]; if( blockDim.x >= 4 ) du[ threadIdx.x ] += du[ threadIdx.x + 2 ]; if( blockDim.x >= 2 ) du[ threadIdx.x ] += du[ threadIdx.x + 1 ]; } } else { int s; if( n >= 512 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; __syncthreads(); if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; __syncthreads(); } if( n >= 256 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; __syncthreads(); if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; __syncthreads(); } if( n >= 128 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; __syncthreads(); if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; __syncthreads(); } if( n >= 64 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; __syncthreads(); if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; __syncthreads(); } if( n >= 32 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; __syncthreads(); if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; __syncthreads(); } /*** * This runs in one warp so it is synchronised implicitly. */ if( n >= 16 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; } if( n >= 8 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; } if( n >= 4 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; } if( n >= 2 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; } } if( threadIdx.x == 0 ) blockResidue[ blockIdx.x ] = du[ 0 ]; } template< typename Real, typename Index > __global__ void boundaryConditionsKernel( const Real* u, Real* aux, const Index gridXSize, const Index gridYSize ) { const Index i = ( blockIdx.x ) * blockDim.x + threadIdx.x; const Index j = ( blockIdx.y ) * blockDim.y + threadIdx.y; if( i == 0 && j < gridYSize ) /*if( i == 0 && j < gridYSize ) aux[ j * gridXSize ] = 0.0; //u[ j * gridXSize + 1 ]; if( i == gridXSize - 1 && j < gridYSize ) /*if( i == gridXSize - 1 && j < gridYSize ) aux[ j * gridXSize + gridXSize - 2 ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ]; if( j == 0 && i < gridXSize ) aux[ j * gridXSize ] = 0.0; //u[ j * gridXSize + 1 ]; if( j == gridYSize -1 && i < gridXSize ) aux[ j * gridXSize + gridXSize - 2 ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ]; */ } Loading @@ -59,20 +217,58 @@ __global__ void heatEquationKernel( const Real* u, if( i > 0 && i < gridXSize - 1 && j > 0 && j < gridYSize - 1 ) { printf( "( %d, %d ) ", i, j ); const Index c = j * gridXSize + i; aux[ c ] = u[ c ] + tau * ( ( u[ c - 1 ] - 2.0 * u[ c ] + u[ c + 1 ] ) * hx_inv + aux[ c ] = tau * ( ( u[ c - 1 ] - 2.0 * u[ c ] + u[ c + 1 ] ) * hx_inv + ( u[ c - gridXSize ] - 2.0 * u[ c ] + u[ c + gridXSize ] ) * hy_inv ); } } template< typename Real, typename Index > __global__ void updateKernel( Real* u, const Real* aux, Real* aux, Real* cudaBlockResidue, const Index dofs ) { Index idx = blockIdx.x * blockDim.x + threadIdx.x; if( idx < dofs ) u[ idx ] += aux[ idx ]; const Index blockOffset = blockIdx.x * blockDim.x; Index idx = blockOffset + threadIdx.x; //if( idx < dofs ) if( threadIdx.x == 0 && idx < dofs ) printf( "%d %d %d -> %d \n", blockIdx.x, blockDim.x, blockOffset, idx ); // u[ idx ] += aux[ idx ]; __syncthreads(); const Index rest = dofs - blockOffset; Index n = rest < blockDim.x ? rest : blockDim.x; //computeBlockResidue< Real, Index >( aux, // cudaBlockResidue, // n ); } template< typename Real, typename Index > bool writeFunction( char* fileName, const Real* data, const Index xSize, const Index ySize, const Real& hx, const Real& hy ) { fstream file; file.open( fileName, ios::out ); if( ! file ) { cerr << "Unable to open file " << fileName << "." << endl; return false; } for( Index i = 0; i < xSize; i++ ) { for( Index j = 0; j < ySize; j++ ) file << i * hx << " " << j * hy << " " << data[ j * xSize + i ] << endl; file << endl; } } template< typename Real, typename Index > Loading @@ -86,18 +282,22 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters ) Real tau = parameters.getParameter< double >( "time-step" ); const Real finalTime = parameters.getParameter< double >( "final-time" ); const bool verbose = parameters.getParameter< bool >( "verbose" ); const Index dofsCount = gridXSize * gridYSize; dim3 cudaUpdateBlocks( dofsCount / 256 + ( dofsCount % 256 != 0 ) ); dim3 cudaUpdateBlockSize( 256 ); /**** * Initiation */ Real* u = new Real[ gridXSize * gridYSize ]; Real* aux = new Real[ gridXSize * gridYSize ]; Real* u = new Real[ dofsCount ]; Real* aux = new Real[ dofsCount ]; Real* max_du = new Real[ cudaUpdateBlocks.x ]; if( ! u || ! aux ) { cerr << "I am not able to allocate grid function for grid size " << gridXSize << "x" << gridYSize << "." << endl; return false; } const Index dofsCount = gridXSize * gridYSize; const Real hx = domainXSize / ( Real ) gridXSize; const Real hy = domainYSize / ( Real ) gridYSize; const Real hx_inv = 1.0 / ( hx * hx ); Loading @@ -121,15 +321,22 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters ) const Real y = j * hy - domainYSize / 2.0; u[ j * gridXSize + i ] = exp( - sigma * ( x * x + y * y ) ); } writeFunction( "initial", u, gridXSize, gridYSize, hx, hy ); /**** * Allocate data on the CUDA device */ Real *cuda_u, *cuda_aux; cudaMalloc( &cuda_u, gridXSize * gridYSize * sizeof( Real ) ); cudaMalloc( &cuda_aux, gridXSize * gridYSize * sizeof( Real ) ); cudaMemcpy( cuda_u, u, gridXSize * gridYSize * sizeof( Real ), cudaMemcpyHostToDevice ); int cudaErr; Real *cuda_u, *cuda_aux, *cuda_max_du; cudaMalloc( &cuda_u, dofsCount * sizeof( Real ) ); cudaMalloc( &cuda_aux, dofsCount * sizeof( Real ) ); cudaMemcpy( cuda_u, u, dofsCount * sizeof( Real ), cudaMemcpyHostToDevice ); cudaMalloc( &cuda_max_du, cudaUpdateBlocks.x * sizeof( Real ) ); if( ( cudaErr = cudaGetLastError() ) != cudaSuccess ) { cerr << "Allocation failed. " << cudaErr << endl; return false; } /**** * Explicit Euler solver Loading @@ -138,7 +345,8 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters ) dim3 cudaBlockSize( 16, 16 ); dim3 cudaGridSize( gridXSize / 16 + ( gridXSize % 16 != 0 ), gridYSize / 16 + ( gridYSize % 16 != 0 ) ); const Index dofs = gridXSize * gridYSize; cout << "Setting grid size to " << cudaGridSize.x << "," << cudaGridSize.y << "," << cudaGridSize.z << endl; cout << "Setting block size to " << cudaBlockSize.x << "," << cudaBlockSize.y << "," << cudaBlockSize.z << endl; if( verbose ) cout << "Starting the solver main loop..." << endl; Loading @@ -155,26 +363,49 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters ) /**** * Neumann boundary conditions */ boundaryConditionsKernel<<< cudaGridSize, cudaBlockSize >>>( u, aux, gridXSize, gridYSize ); //cout << "Setting boundary conditions ... " << endl; boundaryConditionsKernel<<< cudaGridSize, cudaBlockSize >>>( cuda_u, cuda_aux, gridXSize, gridYSize ); if( ( cudaErr = cudaGetLastError() ) != cudaSuccess ) { cerr << "Setting of boundary conditions failed. " << cudaErr << endl; return false; } /**** * Laplace operator */ heatEquationKernel<<< cudaGridSize, cudaBlockSize >>> ( u, aux, tau, hx_inv, hy_inv, gridXSize, gridYSize ); cout << "Laplace operator ... " << endl; //heatEquationKernel<<< cudaGridSize, cudaBlockSize >>> // ( cuda_u, cuda_aux, tau, hx_inv, hy_inv, gridXSize, gridYSize ); if( cudaGetLastError() != cudaSuccess ) { cerr << "Laplace operator failed." << endl; return false; } /**** * Update */ updateKernel<<< dofs / 256 + ( dofs % 256 != 0 ), 256 >>>( u, aux, dofs ); /*Real absMax( 0.0 ); for( Index i = 0; i < dofsCount; i++ ) cout << "Update ... " << endl; updateKernel<<< cudaUpdateBlocks, cudaUpdateBlockSize >>>( u, aux, cuda_max_du, dofsCount ); if( cudaGetLastError() != cudaSuccess ) { const Real a = fabs( aux[ i ] ); absMax = a > absMax ? a : absMax; }*/ cerr << "Update failed." << endl; return false; } cudaThreadSynchronize(); cudaMemcpy( max_du, cuda_max_du, cudaUpdateBlocks.x * sizeof( Real ), cudaMemcpyDeviceToHost ); if( ( cudaErr = cudaGetLastError() ) != cudaSuccess ) { cerr << "Copying max_du failed. " << cudaErr << endl; return false; } Real absMax( 0.0 ); for( Index i = 0; i < cudaUpdateBlocks.x; i++ ) { const Real a = fabs( max_du[ i ] ); absMax = a > absMax ? a : absMax; } time += currentTau; iteration++; Loading @@ -185,6 +416,8 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters ) if( verbose ) cout << endl << "Finished..." << endl; cout << "Computation time is " << timer.getTime() << " sec. i.e. " << timer.getTime() / ( double ) iteration << "sec. per iteration." << endl; cudaMemcpy( u, cuda_u, dofsCount * sizeof( Real ), cudaMemcpyDeviceToHost ); writeFunction( "final", u, gridXSize, gridYSize, hx, hy ); /*** * Freeing allocated memory Loading @@ -193,8 +426,10 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters ) cout << "Freeing allocated memory..." << endl; delete[] u; delete[] aux; delete[] max_du; cudaFree( cuda_u ); cudaFree( cuda_aux ); cudaFree( cuda_max_du ); return true; } #endif Loading Loading
src/mesh/grids/tnlTraverser_Grid2D_impl.h +7 −7 Original line number Diff line number Diff line Loading @@ -271,16 +271,16 @@ __global__ void tnlTraverserGrid2DCells( const tnlGrid< 2, Real, tnlCuda, Index const int CellDimensions = GridType::Dimensions; typename GridType::template GridEntity< CellDimensions > entity( *grid ); typedef typename GridType::CoordinatesType CoordinatesType; CoordinatesType& coordinates = entity.getCoordinates(); //CoordinatesType& coordinates = entity.getCoordinates(); const Index& xSize = grid->getDimensions().x(); const Index& ySize = grid->getDimensions().y(); /*const Index& xSize = grid->getDimensions().x(); const Index& ySize = grid->getDimensions().y();*/ coordinates.x() = ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.y() = ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; entity.getCoordinates().x() = ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; entity.getCoordinates().y() = ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( coordinates.x() < grid->getDimensions().x() && coordinates.y() < grid->getDimensions().y() ) if( entity.getCoordinates().x() < grid->getDimensions().x() && entity.getCoordinates().y() < grid->getDimensions().y() ) { entity.setIndex( grid->getEntityIndex( entity ) ); if( processAllEntities || entity.isBoundaryEntity() == processBoundaryEntities ) Loading
src/solvers/ode/tnlEulerSolver_impl.h +0 −1 Original line number Diff line number Diff line Loading @@ -158,7 +158,6 @@ bool tnlEulerSolver< Problem > :: solve( DofVectorType& u ) currentTau = Min( currentTau, this->getMaxTau() ); } } return false; }; template< typename Problem > Loading
tests/benchmarks/tnl-benchmark-simple-heat-equation.h +265 −30 Original line number Diff line number Diff line Loading @@ -28,20 +28,178 @@ using namespace std; #ifdef HAVE_CUDA template< typename Real, typename Index > __device__ void computeBlockResidue( Real* du, Real* blockResidue, Index n ) { if( n == 128 || n == 64 || n == 32 || n == 16 || n == 8 || n == 4 || n == 2 || n == 256 || n == 512 ) { if( blockDim.x >= 512 ) { if( threadIdx.x < 256 ) du[ threadIdx.x ] += du[ threadIdx.x + 256 ]; __syncthreads(); } if( blockDim.x >= 256 ) { if( threadIdx.x < 128 ) du[ threadIdx.x ] += du[ threadIdx.x + 128 ]; __syncthreads(); } if( blockDim.x >= 128 ) { if( threadIdx.x < 64 ) du[ threadIdx.x ] += du[ threadIdx.x + 64 ]; __syncthreads(); } /*** * This runs in one warp so it is synchronized implicitly. */ if ( threadIdx.x < 32) { if( blockDim.x >= 64 ) du[ threadIdx.x ] += du[ threadIdx.x + 32 ]; if( blockDim.x >= 32 ) du[ threadIdx.x ] += du[ threadIdx.x + 16 ]; if( blockDim.x >= 16 ) du[ threadIdx.x ] += du[ threadIdx.x + 8 ]; if( blockDim.x >= 8 ) du[ threadIdx.x ] += du[ threadIdx.x + 4 ]; if( blockDim.x >= 4 ) du[ threadIdx.x ] += du[ threadIdx.x + 2 ]; if( blockDim.x >= 2 ) du[ threadIdx.x ] += du[ threadIdx.x + 1 ]; } } else { int s; if( n >= 512 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; __syncthreads(); if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; __syncthreads(); } if( n >= 256 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; __syncthreads(); if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; __syncthreads(); } if( n >= 128 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; __syncthreads(); if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; __syncthreads(); } if( n >= 64 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; __syncthreads(); if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; __syncthreads(); } if( n >= 32 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; __syncthreads(); if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; __syncthreads(); } /*** * This runs in one warp so it is synchronised implicitly. */ if( n >= 16 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; } if( n >= 8 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; } if( n >= 4 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; } if( n >= 2 ) { s = n / 2; if( threadIdx.x < s ) du[ threadIdx.x ] += du[ threadIdx.x + s ]; if( 2 * s < n && threadIdx.x == n - 1 ) du[ 0 ] += du[ threadIdx.x ]; n = s; } } if( threadIdx.x == 0 ) blockResidue[ blockIdx.x ] = du[ 0 ]; } template< typename Real, typename Index > __global__ void boundaryConditionsKernel( const Real* u, Real* aux, const Index gridXSize, const Index gridYSize ) { const Index i = ( blockIdx.x ) * blockDim.x + threadIdx.x; const Index j = ( blockIdx.y ) * blockDim.y + threadIdx.y; if( i == 0 && j < gridYSize ) /*if( i == 0 && j < gridYSize ) aux[ j * gridXSize ] = 0.0; //u[ j * gridXSize + 1 ]; if( i == gridXSize - 1 && j < gridYSize ) /*if( i == gridXSize - 1 && j < gridYSize ) aux[ j * gridXSize + gridXSize - 2 ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ]; if( j == 0 && i < gridXSize ) aux[ j * gridXSize ] = 0.0; //u[ j * gridXSize + 1 ]; if( j == gridYSize -1 && i < gridXSize ) aux[ j * gridXSize + gridXSize - 2 ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ]; */ } Loading @@ -59,20 +217,58 @@ __global__ void heatEquationKernel( const Real* u, if( i > 0 && i < gridXSize - 1 && j > 0 && j < gridYSize - 1 ) { printf( "( %d, %d ) ", i, j ); const Index c = j * gridXSize + i; aux[ c ] = u[ c ] + tau * ( ( u[ c - 1 ] - 2.0 * u[ c ] + u[ c + 1 ] ) * hx_inv + aux[ c ] = tau * ( ( u[ c - 1 ] - 2.0 * u[ c ] + u[ c + 1 ] ) * hx_inv + ( u[ c - gridXSize ] - 2.0 * u[ c ] + u[ c + gridXSize ] ) * hy_inv ); } } template< typename Real, typename Index > __global__ void updateKernel( Real* u, const Real* aux, Real* aux, Real* cudaBlockResidue, const Index dofs ) { Index idx = blockIdx.x * blockDim.x + threadIdx.x; if( idx < dofs ) u[ idx ] += aux[ idx ]; const Index blockOffset = blockIdx.x * blockDim.x; Index idx = blockOffset + threadIdx.x; //if( idx < dofs ) if( threadIdx.x == 0 && idx < dofs ) printf( "%d %d %d -> %d \n", blockIdx.x, blockDim.x, blockOffset, idx ); // u[ idx ] += aux[ idx ]; __syncthreads(); const Index rest = dofs - blockOffset; Index n = rest < blockDim.x ? rest : blockDim.x; //computeBlockResidue< Real, Index >( aux, // cudaBlockResidue, // n ); } template< typename Real, typename Index > bool writeFunction( char* fileName, const Real* data, const Index xSize, const Index ySize, const Real& hx, const Real& hy ) { fstream file; file.open( fileName, ios::out ); if( ! file ) { cerr << "Unable to open file " << fileName << "." << endl; return false; } for( Index i = 0; i < xSize; i++ ) { for( Index j = 0; j < ySize; j++ ) file << i * hx << " " << j * hy << " " << data[ j * xSize + i ] << endl; file << endl; } } template< typename Real, typename Index > Loading @@ -86,18 +282,22 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters ) Real tau = parameters.getParameter< double >( "time-step" ); const Real finalTime = parameters.getParameter< double >( "final-time" ); const bool verbose = parameters.getParameter< bool >( "verbose" ); const Index dofsCount = gridXSize * gridYSize; dim3 cudaUpdateBlocks( dofsCount / 256 + ( dofsCount % 256 != 0 ) ); dim3 cudaUpdateBlockSize( 256 ); /**** * Initiation */ Real* u = new Real[ gridXSize * gridYSize ]; Real* aux = new Real[ gridXSize * gridYSize ]; Real* u = new Real[ dofsCount ]; Real* aux = new Real[ dofsCount ]; Real* max_du = new Real[ cudaUpdateBlocks.x ]; if( ! u || ! aux ) { cerr << "I am not able to allocate grid function for grid size " << gridXSize << "x" << gridYSize << "." << endl; return false; } const Index dofsCount = gridXSize * gridYSize; const Real hx = domainXSize / ( Real ) gridXSize; const Real hy = domainYSize / ( Real ) gridYSize; const Real hx_inv = 1.0 / ( hx * hx ); Loading @@ -121,15 +321,22 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters ) const Real y = j * hy - domainYSize / 2.0; u[ j * gridXSize + i ] = exp( - sigma * ( x * x + y * y ) ); } writeFunction( "initial", u, gridXSize, gridYSize, hx, hy ); /**** * Allocate data on the CUDA device */ Real *cuda_u, *cuda_aux; cudaMalloc( &cuda_u, gridXSize * gridYSize * sizeof( Real ) ); cudaMalloc( &cuda_aux, gridXSize * gridYSize * sizeof( Real ) ); cudaMemcpy( cuda_u, u, gridXSize * gridYSize * sizeof( Real ), cudaMemcpyHostToDevice ); int cudaErr; Real *cuda_u, *cuda_aux, *cuda_max_du; cudaMalloc( &cuda_u, dofsCount * sizeof( Real ) ); cudaMalloc( &cuda_aux, dofsCount * sizeof( Real ) ); cudaMemcpy( cuda_u, u, dofsCount * sizeof( Real ), cudaMemcpyHostToDevice ); cudaMalloc( &cuda_max_du, cudaUpdateBlocks.x * sizeof( Real ) ); if( ( cudaErr = cudaGetLastError() ) != cudaSuccess ) { cerr << "Allocation failed. " << cudaErr << endl; return false; } /**** * Explicit Euler solver Loading @@ -138,7 +345,8 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters ) dim3 cudaBlockSize( 16, 16 ); dim3 cudaGridSize( gridXSize / 16 + ( gridXSize % 16 != 0 ), gridYSize / 16 + ( gridYSize % 16 != 0 ) ); const Index dofs = gridXSize * gridYSize; cout << "Setting grid size to " << cudaGridSize.x << "," << cudaGridSize.y << "," << cudaGridSize.z << endl; cout << "Setting block size to " << cudaBlockSize.x << "," << cudaBlockSize.y << "," << cudaBlockSize.z << endl; if( verbose ) cout << "Starting the solver main loop..." << endl; Loading @@ -155,26 +363,49 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters ) /**** * Neumann boundary conditions */ boundaryConditionsKernel<<< cudaGridSize, cudaBlockSize >>>( u, aux, gridXSize, gridYSize ); //cout << "Setting boundary conditions ... " << endl; boundaryConditionsKernel<<< cudaGridSize, cudaBlockSize >>>( cuda_u, cuda_aux, gridXSize, gridYSize ); if( ( cudaErr = cudaGetLastError() ) != cudaSuccess ) { cerr << "Setting of boundary conditions failed. " << cudaErr << endl; return false; } /**** * Laplace operator */ heatEquationKernel<<< cudaGridSize, cudaBlockSize >>> ( u, aux, tau, hx_inv, hy_inv, gridXSize, gridYSize ); cout << "Laplace operator ... " << endl; //heatEquationKernel<<< cudaGridSize, cudaBlockSize >>> // ( cuda_u, cuda_aux, tau, hx_inv, hy_inv, gridXSize, gridYSize ); if( cudaGetLastError() != cudaSuccess ) { cerr << "Laplace operator failed." << endl; return false; } /**** * Update */ updateKernel<<< dofs / 256 + ( dofs % 256 != 0 ), 256 >>>( u, aux, dofs ); /*Real absMax( 0.0 ); for( Index i = 0; i < dofsCount; i++ ) cout << "Update ... " << endl; updateKernel<<< cudaUpdateBlocks, cudaUpdateBlockSize >>>( u, aux, cuda_max_du, dofsCount ); if( cudaGetLastError() != cudaSuccess ) { const Real a = fabs( aux[ i ] ); absMax = a > absMax ? a : absMax; }*/ cerr << "Update failed." << endl; return false; } cudaThreadSynchronize(); cudaMemcpy( max_du, cuda_max_du, cudaUpdateBlocks.x * sizeof( Real ), cudaMemcpyDeviceToHost ); if( ( cudaErr = cudaGetLastError() ) != cudaSuccess ) { cerr << "Copying max_du failed. " << cudaErr << endl; return false; } Real absMax( 0.0 ); for( Index i = 0; i < cudaUpdateBlocks.x; i++ ) { const Real a = fabs( max_du[ i ] ); absMax = a > absMax ? a : absMax; } time += currentTau; iteration++; Loading @@ -185,6 +416,8 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters ) if( verbose ) cout << endl << "Finished..." << endl; cout << "Computation time is " << timer.getTime() << " sec. i.e. " << timer.getTime() / ( double ) iteration << "sec. per iteration." << endl; cudaMemcpy( u, cuda_u, dofsCount * sizeof( Real ), cudaMemcpyDeviceToHost ); writeFunction( "final", u, gridXSize, gridYSize, hx, hy ); /*** * Freeing allocated memory Loading @@ -193,8 +426,10 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters ) cout << "Freeing allocated memory..." << endl; delete[] u; delete[] aux; delete[] max_du; cudaFree( cuda_u ); cudaFree( cuda_aux ); cudaFree( cuda_max_du ); return true; } #endif Loading