Commit ac305460 authored by Tomáš Oberhuber

Fixed passing of Arrays by ArrayView.

parent 008601ad
Merge request !43: Hamilton jacobi rebase
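The commit changes CUDA kernels and host helpers to take TNL::Containers::ArrayView (or the new ArrayContainerView alias) instead of TNL::Containers::Array. Kernel arguments are passed by value, so handing over a non-owning, trivially copyable view avoids copying or referencing the owning Array; callers obtain the view with getView(). A minimal sketch of the pattern, assuming the TNL API used throughout this diff:

```cpp
#include <TNL/Containers/Array.h>
#include <TNL/Containers/ArrayView.h>
#include <TNL/Devices/Cuda.h>

using TNL::Containers::Array;
using TNL::Containers::ArrayView;
using TNL::Devices::Cuda;

template< typename Index >
__global__ void setToOne( ArrayView< int, Cuda, Index > view )
{
   // ArrayView is a lightweight handle, so passing it by value to a kernel is cheap
   Index i = blockIdx.x * blockDim.x + threadIdx.x;
   if( i < view.getSize() )
      view[ i ] = 1;
}

void example()
{
   Array< int, Cuda, int > blockIndicator( 1024 );      // owning device array
   setToOne<<< 1, 1024 >>>( blockIndicator.getView() ); // pass a view, not the Array
   cudaDeviceSynchronize();
}
```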
@@ -353,8 +353,8 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
 template < typename Index >
-__global__ void GetNeighbours( const TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
-                               TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY )
+__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
+                               TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY )
 {
    int i = blockIdx.x * 1024 + threadIdx.x;
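GetNeighbours launches one thread per grid block: a thread whose block was updated in the last pass flags the neighbouring blocks so that they are processed in the next pass. The kernel body is not part of this hunk; the following is only a hedged sketch of that marking logic, with assumed row-major block indexing and illustrative names:

```cpp
#include <TNL/Containers/ArrayView.h>
#include <TNL/Devices/Cuda.h>

// Hypothetical sketch of the 2D neighbour-marking idea behind GetNeighbours.
template< typename Index >
__global__ void markNeighbours( TNL::Containers::ArrayView< int, TNL::Devices::Cuda, Index > calculated,
                                TNL::Containers::ArrayView< int, TNL::Devices::Cuda, Index > next,
                                int numBlockX, int numBlockY )
{
   int i = blockIdx.x * 1024 + threadIdx.x;   // one thread per grid block
   if( i < numBlockX * numBlockY && calculated[ i ] )
   {
      int bx = i % numBlockX, by = i / numBlockX;   // row-major block coordinates
      if( bx > 0 )             next[ i - 1 ] = 1;          // west neighbour
      if( bx < numBlockX - 1 ) next[ i + 1 ] = 1;          // east neighbour
      if( by > 0 )             next[ i - numBlockX ] = 1;  // south neighbour
      if( by < numBlockY - 1 ) next[ i + numBlockX ] = 1;  // north neighbour
   }
}
```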
@@ -389,7 +389,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
+                                      TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
                                       const Containers::StaticVector< 2, Index > vecLowerOverlaps,
                                       const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock )
 {
@@ -598,7 +598,7 @@ tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 updateBlocks( InterfaceMapType interfaceMap,
               MeshFunctionType aux,
               MeshFunctionType helpFunc,
-              ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
+              ArrayContainerView BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
 {
 #pragma omp parallel for schedule( dynamic )
    for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
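updateBlocks now receives the block-indicator array as a view and walks it with an OpenMP loop using dynamic scheduling, which keeps threads busy when only a few blocks are still active. A small self-contained illustration of that host-side pattern (helper names are hypothetical, not taken from the commit):

```cpp
#include <omp.h>
#include <vector>

// Process only the blocks flagged in blockIter, in parallel with dynamic scheduling.
void updateActiveBlocks( const std::vector< int >& blockIter, int numBlockX,
                         void (*updateOneBlock)( int blockX, int blockY ) )
{
#pragma omp parallel for schedule( dynamic )
   for( int i = 0; i < (int) blockIter.size(); i++ )
      if( blockIter[ i ] )   // only blocks marked for this sweep are updated
         updateOneBlock( i % numBlockX, i / numBlockX );
}
```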
@@ -769,7 +769,7 @@ template< typename Real,
           typename Index >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )
+getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY )
 {
    int* BlockIterPom;
    BlockIterPom = new int [numBlockX * numBlockY];
...
@@ -480,8 +480,8 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
 template < typename Index >
-__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                               TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,
+__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+                               TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
                                int numBlockX, int numBlockY, int numBlockZ )
 {
    int i = blockIdx.x * 1024 + threadIdx.x;
@@ -520,7 +520,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
                                       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
                                       Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+                                      TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
                                       Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps )
 {
    int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
@@ -1056,7 +1056,7 @@ template< typename Real,
           typename Index >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
+getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
 {
    int* BlockIterPom;
    BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ];
...
@@ -62,6 +62,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
       typedef Functions::MeshFunction< MeshType > MeshFunctionType;
       typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
       typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+      using ArrayContainerView = typename ArrayContainer::ViewType;
       typedef Containers::StaticVector< 2, Index > StaticVector;
       using MeshPointer = Pointers::SharedPointer< MeshType >;
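The new ArrayContainerView alias is the view type matching ArrayContainer; it is what the host methods below now accept. A minimal sketch of the alias and of obtaining a view (concrete template arguments chosen only for illustration):

```cpp
#include <TNL/Containers/Array.h>
#include <TNL/Containers/ArrayView.h>
#include <TNL/Devices/Cuda.h>

// Every TNL Array exposes a matching non-owning ViewType, so the solver can
// name the view without repeating the element/device/index arguments.
using ArrayContainer     = TNL::Containers::Array< int, TNL::Devices::Cuda, int >;
using ArrayContainerView = ArrayContainer::ViewType;   // expected to be ArrayView< int, Devices::Cuda, int >

ArrayContainerView getViewOf( ArrayContainer& array )
{
   return array.getView();   // cheap, non-owning handle suitable as a kernel argument
}
```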
@@ -87,15 +88,18 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
                       const RealType velocity = 1.0 );
 
       // FOR OPENMP WILL BE REMOVED
-      void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY );
+      void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY );
 
       template< int sizeSArray >
-      void updateBlocks( const InterfaceMapType& interfaceMap,
-                         MeshFunctionType& aux,
-                         MeshFunctionType& helpFunc,
-                         ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
-
-      void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY );
+      void updateBlocks( InterfaceMapType interfaceMap,
+                         MeshFunctionType aux,
+                         MeshFunctionType helpFunc,
+                         ArrayContainerView BlockIterHost, int numThreadsPerBlock );
+
+      protected:
+
+      __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[],
+                                              const RealType originalValue, const RealType v );
 };
 
 template< typename Real,
@@ -111,6 +115,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
       typedef Functions::MeshFunction< MeshType > MeshFunctionType;
       typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType;
       typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+      using ArrayContainerView = typename ArrayContainer::ViewType;
       typedef Containers::StaticVector< 3, Index > StaticVector;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
       using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
@@ -134,15 +139,15 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
                       const RealType velocity = 1.0 );
 
       // OPENMP WILL BE REMOVED
-      void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
+      void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
 
       template< int sizeSArray >
-      void updateBlocks( const InterfaceMapType& interfaceMap,
-                         const MeshFunctionType& aux,
+      void updateBlocks( const InterfaceMapType interfaceMap,
+                         const MeshFunctionType aux,
                          MeshFunctionType& helpFunc,
-                         ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
-
-      void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
+                         ArrayContainer BlockIterHost, int numThreadsPerBlock );
+
+      protected:
 
       __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[],
                                               const RealType originalValue, const RealType v );
@@ -180,17 +185,14 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
+                                      TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
                                       const Containers::StaticVector< 2, Index > vecLowerOverlaps,
                                       const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock =0);
 
 template < typename Index >
-__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-                                   TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks );
-
-template < typename Index >
-__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-                               TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
+__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
+                               TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY );
 
 // 3D
@@ -205,10 +207,11 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
                                       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
                                       Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-                                      TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice );
+                                      TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+                                      Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps );
 
 template < typename Index >
-__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
                                  TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
                                  int numBlockX, int numBlockY, int numBlockZ );
 #endif
...
@@ -80,40 +80,9 @@ solve( const MeshPointer& mesh,
    IndexType iteration( 0 );
    InterfaceMapType interfaceMap = *interfaceMapPtr;
    MeshFunctionType aux = *auxPtr;
-   aux.template synchronize< Communicator >();
-
-#ifdef HAVE_MPI
-   int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup );
-   //printf( "Hello world from rank: %d ", i );
-   //Communicators::MpiCommunicator::Request r = Communicators::MpiCommunicator::ISend( auxPtr, 0, 0, Communicators::MpiCommunicator::AllGroup );
-   if( i == 1 ) {
-      /*for( int k = 0; k < 16*16; k++ )
-         aux[ k ] = 10;*/
-      printf( "1: mesh x: %d\n", mesh->getDimensions().x() );
-      printf( "1: mesh y: %d\n", mesh->getDimensions().y() );
-      //aux.save("aux_proc1.tnl");
-   }
-   if( i == 0 ) {
-      printf( "0: mesh x: %d\n", mesh->getDimensions().x() );
-      printf( "0: mesh y: %d\n", mesh->getDimensions().y() );
-      //aux.save("aux_proc0.tnl");
-      /*for( int k = 0; k < mesh->getDimensions().x()*mesh->getDimensions().y(); k++ )
-         aux[ k ] = 10;
-      for( int k = 0; k < mesh->getDimensions().x(); k++ ){
-         for( int l = 0; l < mesh->getDimensions().y(); l++ )
-            printf("%f.2\t",aux[ k * 16 + l ] );
-         printf("\n");
-      }*/
-   }
-   /*bool a = Communicators::MpiCommunicator::IsInitialized();
-   if( a )
-      printf("Je Init\n");
-   else
-      printf("Neni Init\n");*/
-#endif
-   std::cout << "Calculating the values ..." << std::endl;
+   aux.template synchronize< Communicator >(); //synchronize initialized overlaps
+
    while( iteration < this->maxIterations )
    {
       // calculatedBefore indicates weather we calculated in the last passage of the while cycle
@@ -290,41 +259,8 @@ solve( const MeshPointer& mesh,
       // Need for calling functions from kernel
       BaseType ptr;
 
-      int BlockIterD = 1;
-      /*auxPtr = helpFunc;
-      CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-                                                           interfaceMapPtr.template getData< Device >(),
-                                                           auxPtr.template getData< Device>(),
-                                                           helpFunc.template modifyData< Device>(),
-                                                           BlockIterDevice,
-                                                           oddEvenBlock.getView() );
-      cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
-      auxPtr = helpFunc;
-
-      oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-      CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-                                                           interfaceMapPtr.template getData< Device >(),
-                                                           auxPtr.template getData< Device>(),
-                                                           helpFunc.template modifyData< Device>(),
-                                                           BlockIterDevice,
-                                                           oddEvenBlock.getView() );
-      cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
-      auxPtr = helpFunc;
-
-      oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-
-      CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) );
-      cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
-      CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
-      cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
-
-      BlockIterD = dBlock.getElement( 0 );*/
+      // True if we should calculate again.
+      int calculateCudaBlocksAgain = 1;
 
       // Array that identifies which blocks should be calculated.
       // All blocks should calculate in first passage ( setValue(1) )
@@ -343,16 +279,9 @@ solve( const MeshPointer& mesh,
       MeshFunctionPointer helpFunc( mesh );
       helpFunc.template modifyData() = auxPtr.template getData();
 
-      //int pocBloku = 0;
-      Devices::Cuda::synchronizeDevice();
-      CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-                                                           interfaceMapPtr.template getData< Device >(),
-                                                           auxPtr.template modifyData< Device>(),
-                                                           helpFunc.template modifyData< Device>(),
-                                                           BlockIterDevice.getView() );
-      cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
+      // number of iterations of while calculateCudaBlocksAgain
+      int numIter = 0;
 
       //int oddEvenBlock = 0;
       while( calculateCudaBlocksAgain )
       {
@@ -390,44 +319,16 @@ solve( const MeshPointer& mesh,
          Devices::Cuda::synchronizeDevice();
          CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(),
                                                               auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(),
-                                                              blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps );
+                                                              blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps );
          cudaDeviceSynchronize();
          TNL_CHECK_CUDA_DEVICE;
 
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         aux = *auxPtr;
-         interfaceMap = *interfaceMapPtr;
-#endif
-      }
-
-/**----------------------MPI-TO-DO---------------------------------------------**/
-#ifdef HAVE_MPI
-      //int i = MPI::GetRank( MPI::AllGroup );
-      //TNL::Meshes::DistributedMeshes::DistributedMesh< MeshType > Mesh;
-      int neighCount = 0; // should this thread calculate again?
-      int calculpom[4] = {0,0,0,0};
-
-      if( i == 0 ){
-         BlockIterPom1 = BlockIterDevice;
-         for( int i =0; i< numBlocksX; i++ ){
-            for( int j = 0; j < numBlocksY; j++ )
-            {
-               std::cout << BlockIterPom1[j*numBlocksX + i];
-            }
-            std::cout << std::endl;
-         }
-         std::cout << std::endl;
-      }
-#endif
+         // Switching helpFunc and auxPtr.
+         auxPtr.swap( helpFunc );
 
          // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now.
          Devices::Cuda::synchronizeDevice();
-         GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator, blockCalculationIndicatorHelp, numBlocksX, numBlocksY );
+         GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY );
          cudaDeviceSynchronize();
          TNL_CHECK_CUDA_DEVICE;
         blockCalculationIndicator = blockCalculationIndicatorHelp;
@@ -445,46 +346,24 @@ solve( const MeshPointer& mesh,
 /**-----------------------------------------------------------------------------------------------------------*/
          numIter ++;
       }
-      if( numIter%2 == 1 ){
-         auxPtr = helpFunc;
-      }
-      /*cudaFree( BlockIterDevice );
-      cudaFree( dBlock );
-      delete BlockIter;*/
-
-      if( neigh[1] != -1 )
-      {
-         req[neighCount] = MPI::ISend( &calculated, 1, neigh[1], 0, MPI::AllGroup );
-         neighCount++;
-
-         req[neighCount] = MPI::IRecv( &calculpom[1], 1, neigh[1], 0, MPI::AllGroup );
-         neighCount++;
-      }
-
-      if( neigh[2] != -1 )
-      {
-         req[neighCount] = MPI::ISend( &calculated, 1, neigh[2], 0, MPI::AllGroup );
-         neighCount++;
-
-         req[neighCount] = MPI::IRecv( &calculpom[2], 1, neigh[2], 0, MPI::AllGroup );
-         neighCount++;
-      }
-
-      if( neigh[5] != -1 )
-      {
-         req[neighCount] = MPI::ISend( &calculated, 1, neigh[5], 0, MPI::AllGroup );
-         neighCount++;
-
-         req[neighCount] = MPI::IRecv( &calculpom[3], 1, neigh[5], 0, MPI::AllGroup );
-         neighCount++;
-      }
-
-      MPI::WaitAll(req,neighCount);
-#if ForDebug
-      printf( "%d: Sending Calculated = %d.\n", i, calculated );
-#endif
-      MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR, MPI::AllGroup );
+      if( numIter%2 == 1 ) // Need to check parity for MPI overlaps to synchronize ( otherwise doesnt work )
+      {
+         helpFunc.swap( auxPtr );
+         Devices::Cuda::synchronizeDevice();
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+      }
+      aux = *auxPtr;
+      interfaceMap = *interfaceMapPtr;
+#endif
+   }
+
+/**----------------------MPI-TO-DO---------------------------------------------**/
+#ifdef HAVE_MPI
+   if( CommunicatorType::isDistributed() ){
+      getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh );
 
       aux.template synchronize< Communicator >();
    }
 #endif
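The rewritten loop double-buffers aux and helpFunc by swapping them after every pass; when the pass count is odd, one extra swap restores the original naming before the MPI overlap synchronization. A small self-contained illustration of that parity bookkeeping, using stand-in buffers rather than the solver's types:

```cpp
#include <utility>
#include <cassert>

int main()
{
   int bufferA = 0, bufferB = 0;        // stand-ins for the two mesh functions
   int* aux  = &bufferA;                // the handle the caller reads at the end
   int* help = &bufferB;

   const int numIter = 3;               // pretend the while loop ran three passes
   for( int pass = 0; pass < numIter; pass++ )
      std::swap( aux, help );           // one swap per pass, as in the solver

   if( numIter % 2 == 1 )
      std::swap( aux, help );           // undo the leftover swap

   assert( aux == &bufferA );           // "aux" names the original buffer again
   return 0;
}
```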
@@ -518,9 +397,16 @@ setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
 #endif
 }
 
-template < typename Index >
-__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-                               TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY )
+template< typename Real, typename Device, typename Index,
+          typename Communicator, typename Anisotropy >
+bool
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
+goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
+                MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
+                const AnisotropyPointer& anisotropy )
 {
    bool calculated = false;
    const MeshType& mesh = aux.getMesh();
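The inline MPI and kernel code removed above is replaced by two helpers: goThroughSweep, which performs one directional sweep of the host solver, and getInfoFromNeighbours (added further below). A hedged sketch of what such a sweep does, with hypothetical isInterface/updateCell callbacks standing in for the solver's eikonal update:

```cpp
#include <functional>

// Visit the cells between the given bounds in a fixed direction, skip interface
// cells, update the rest, and report whether anything changed.
bool goThroughSweepSketch( int fromX, int fromY, int toX, int toY,
                           const std::function< bool( int, int ) >& isInterface,
                           const std::function< bool( int, int ) >& updateCell )
{
   bool calculated = false;
   const int stepX = ( fromX <= toX ) ? 1 : -1;   // sweep direction in x
   const int stepY = ( fromY <= toY ) ? 1 : -1;   // sweep direction in y
   for( int j = fromY; j != toY; j += stepY )
      for( int i = fromX; i != toX; i += stepX )
         if( ! isInterface( i, j ) )
            calculated = updateCell( i, j ) || calculated;   // keep the "calculated" flag
   return calculated;
}
```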
@@ -548,97 +434,15 @@ __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, I
    return calculated;
 }
 
-template < typename Index >
-__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-                                   TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks )
-{
-   int i = threadIdx.x;
-   int blId = blockIdx.x;
-   int blockSize = blockDim.x;
-   /*if ( i == 0 && blId == 0 ){
-      printf( "nBlocks = %d\n", nBlocks );
-      for( int j = nBlocks-1; j > -1 ; j--){
-         printf( "%d: cislo = %d \n", j, BlockIterDevice[ j ] );
-      }
-   }*/
-   __shared__ int sArray[ 1024 ];
-   sArray[ i ] = 0;
-   if( blId * 1024 + i < nBlocks )
-      sArray[ i ] = BlockIterDevice[ blId * 1024 + i ];
-   __syncthreads();
-   /*if ( i == 0 && blId == 0 ){
-      printf( "nBlocks = %d\n", nBlocks );
-      for( int j = 4; j > -1 ; j--){
-         printf( "%d: cislo = %d \n", j, sArray[ j ] );
-      }
-   }*/
-   /*extern __shared__ volatile int sArray[];
-   unsigned int i = threadIdx.x;
-   unsigned int gid = blockIdx.x * blockSize * 2 + threadIdx.x;
-   unsigned int gridSize = blockSize * 2 * gridDim.x;
-   sArray[ i ] = 0;
-   while( gid < nBlocks )
-   {
-      sArray[ i ] += BlockIterDevice[ gid ] + BlockIterDevice[ gid + blockSize ];
-      gid += gridSize;
-   }
-   __syncthreads();*/
-   if ( blockSize == 1024) {
-      if (i < 512)
-         sArray[ i ] += sArray[ i + 512 ];
-   }
-   __syncthreads();
-   if (blockSize >= 512) {
-      if (i < 256) {
-         sArray[ i ] += sArray[ i + 256 ];
-      }
-   }
-   __syncthreads();
-   if (blockSize >= 256) {
-      if (i < 128) {
-         sArray[ i ] += sArray[ i + 128 ];
-      }
-   }
-   __syncthreads();
-   if (blockSize >= 128) {
-      if (i < 64) {
-         sArray[ i ] += sArray[ i + 64 ];
-      }
-   }
-   __syncthreads();
-   if (i < 32 )
-   {
-      if( blockSize >= 64 ){ sArray[ i ] += sArray[ i + 32 ];}
-      __syncthreads();
-      if( blockSize >= 32 ){ sArray[ i ] += sArray[ i + 16 ];}
-      __syncthreads();
-      if( blockSize >= 16 ){ sArray[ i ] += sArray[ i + 8 ];}
-      if( blockSize >= 8 ){ sArray[ i ] += sArray[ i + 4 ];}
-      __syncthreads();
-      if( blockSize >= 4 ){ sArray[ i ] += sArray[ i + 2 ];}
-      __syncthreads();
-      if( blockSize >= 2 ){ sArray[ i ] += sArray[ i + 1 ];}
-      __syncthreads();
-   }
-   __syncthreads();
-   if( i == 0 )
-      dBlock[ blId ] = sArray[ 0 ];
-}
-
-template < int sizeSArray, typename Real, typename Device, typename Index >
-__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-
-      CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) );
-      TNL_CHECK_CUDA_DEVICE;
-      CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
-      TNL_CHECK_CUDA_DEVICE;
+#ifdef HAVE_MPI
+template< typename Real, typename Device, typename Index,
+          typename Communicator, typename Anisotropy >
+void
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
+getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
 {
    Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
@@ -687,4 +491,3 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
                               calculateFromNeighbours[2] || calculateFromNeighbours[3];
 }
 #endif
-                                      TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock )
@@ -262,24 +262,86 @@ solve( const MeshPointer& mesh,
    // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2)
    const int cudaBlockSize( 8 );
 
-      CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
-                                                             interfaceMapPtr.template getData< Device >(),
-                                                             auxPtr.template getData< Device>(),
-                                                             helpFunc.template modifyData< Device>(),
-                                                             BlockIterDevice.getView() );
-      cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
-
-      GetNeighbours3D<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ );
-      cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
-      BlockIterDevice = BlockIterPom;
-
-      CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY * numBlocksZ ) );
-      cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
-
-      CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
+   // Getting the number of blocks in grid in each direction (without overlaps bcs we dont calculate on overlaps)
+   int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
+   int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
+   int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize );
+   if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
+      std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
+
+   // Making the variables for global function CudaUpdateCellCaller.
+   dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+   dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
+
+   BaseType ptr; // tnlDirectEikonalMethodBase type for calling of function inside CudaUpdateCellCaller
+
+   int BlockIterD = 1; //variable that tells us weather we should calculate the main cuda body again
+
+   // Array containing information about each block in grid, answering question (Have we calculated in this block?)
+   TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice( numBlocksX * numBlocksY * numBlocksZ );
+   BlockIterDevice.setValue( 1 ); // calculate all in the first passage
+
+   // Helping Array for GetNeighbours3D
+   TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom( numBlocksX * numBlocksY * numBlocksZ );
+   BlockIterPom.setValue( 0 ); //doesnt matter what number
+
+   // number of neighbours in one block (1024 threads) for GetNeighbours3D
+   int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0);
+
+   //MeshFunctionPointer helpFunc1( mesh );
+   MeshFunctionPointer helpFunc( mesh );
+   helpFunc.template modifyData() = auxPtr.template getData();
+   Devices::Cuda::synchronizeDevice();
+
+   int numIter = 0; // number of passages of following while cycle
+
+   while( BlockIterD ) //main body of cuda code
+   {
+      Devices::Cuda::synchronizeDevice();
+      // main function that calculates all values in each blocks
+      // calculated values are in helpFunc
+      CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
+                                                             interfaceMapPtr.template getData< Device >(),
+                                                             auxPtr.template getData< Device>(),
+                                                             helpFunc.template modifyData< Device>(),
+                                                             BlockIterDevice.getView(), vecLowerOverlaps, vecUpperOverlaps );
+      cudaDeviceSynchronize();
+      TNL_CHECK_CUDA_DEVICE;
+
+      // Switching pointers to helpFunc and auxPtr so real results are in memory of helpFunc but here under variable auxPtr
+      auxPtr.swap( helpFunc );
+      Devices::Cuda::synchronizeDevice();
+
+      // Neighbours of blocks that calculatedBefore in this passage should calculate in the next!
+      // BlockIterDevice contains blocks that calculatedBefore in this passage and BlockIterPom those that should calculate in next (are neighbours)
+      GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ );
+      cudaDeviceSynchronize();
+      TNL_CHECK_CUDA_DEVICE;
+      BlockIterDevice = BlockIterPom;
+      Devices::Cuda::synchronizeDevice();
+
+      // .containsValue(1) is actually parallel reduction implemented in TNL
+      BlockIterD = BlockIterDevice.containsValue(1);
+      cudaDeviceSynchronize();
+      TNL_CHECK_CUDA_DEVICE;
+      numIter++;
+
+      if( BlockIterD ){
+         // if we calculated in this passage, we should send the info via MPI so neighbours should calculate after synchronization
+         calculatedBefore = 1;
+      }
+   }
+   if( numIter%2 == 1 ){
+      // We need auxPtr to point on memory of original auxPtr (not to helpFunc)
+      // last passage of previous while cycle didnt calculate any number anyway so switching names doesnt effect values
+      auxPtr.swap( helpFunc );
+      Devices::Cuda::synchronizeDevice();
+   }
 
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    aux = *auxPtr;
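As the comment in the new loop notes, the hand-written CudaParallelReduc kernel (removed in this commit) is replaced by TNL's built-in reduction: BlockIterDevice.containsValue( 1 ) answers whether any block is still flagged for recomputation. A minimal sketch of that check, assuming TNL's Array::containsValue:

```cpp
#include <TNL/Containers/Array.h>
#include <TNL/Devices/Cuda.h>

// True as long as at least one grid block is marked for recomputation;
// the reduction runs on the device inside TNL.
bool anyBlockActive( const TNL::Containers::Array< int, TNL::Devices::Cuda, int >& blockFlags )
{
   return blockFlags.containsValue( 1 );
}
```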
@@ -375,10 +437,15 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
    return calculated;
 }
 
-template < typename Index >
-__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-                                 TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
-                                 int numBlockX, int numBlockY, int numBlockZ )
+#ifdef HAVE_MPI
+template< typename Real, typename Device, typename Index,
+          typename Communicator, typename Anisotropy >
+void
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
+getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
 {
    Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
@@ -397,22 +464,6 @@ __global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda,
       requestsInformation[neighCount++] =
          MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup );
    }
-}
-
-template < int sizeSArray, typename Real, typename Device, typename Index >
-__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-                                      TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice )
-{
-   int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
-   int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
-   int i = threadIdx.x + blockDim.x*blockIdx.x + vLower[0]; // WITH OVERLAPS!!! i,j,k aren't coordinates of all values
-   int j = blockDim.y*blockIdx.y + threadIdx.y + vLower[1];
-   int k = blockDim.z*blockIdx.z + threadIdx.z + vLower[2];
-   int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
-   const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
 
    if( neighbours[1] != -1 ) // EAST
    {
...