CUDA implemented for exact number of loops in 2D and 3D (5d1fab6e) · Commits · TNL / tnl-dev

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h

+12 −11

Original line number	Diff line number	Diff line
		@@ -68,8 +68,6 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
		__cuda_callable__ void updateCell( MeshFunctionType& u,
		const MeshEntity& cell,
		const RealType velocity = 1.0 );
		protected:

		};

		template< typename Real,
		@@ -78,7 +76,6 @@ template< typename Real,
		class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
		{
		public:

		typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
		typedef Real RealType;
		typedef Device DevcieType;
		@@ -96,11 +93,6 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
		__cuda_callable__ void updateCell( MeshFunctionType& u,
		const MeshEntity& cell,
		const RealType velocity = 1.0);

		/*Real sort( Real a, Real b, Real c,
		const RealType& ha,
		const RealType& hb,
		const RealType& hc ); */
		};

		template < typename T1, typename T2 >
		@@ -112,7 +104,8 @@ __cuda_callable__ void sortMinims( T1 pom[] );

		#ifdef HAVE_CUDA
		template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux );

		template < typename Real, typename Device, typename Index >
		@@ -120,7 +113,15 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap );

		//__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, double, TNL::Devices::Cuda, int > >& input );
		template < typename Real, typename Device, typename Index >
		__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );

		template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux );
		#endif

		#include "tnlDirectEikonalMethodsBase_impl.h"

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h

+280 −176

Original line number	Diff line number	Diff line
		@@ -11,6 +11,7 @@

		#include <iostream>
		#include "tnlFastSweepingMethod.h"

		template< typename Real,
		typename Device,
		typename Index >
		@@ -81,12 +82,6 @@ initInterface( const MeshFunctionPointer& _input,
		MeshFunctionPointer& _output,
		InterfaceMapPointer& _interfaceMap )
		{
		/*
		doplnit přepočty pro cudu:
		* overit is_same device
		* na kazdy bod jedno cuda vlakno
		*/


		if( std::is_same< Device, Devices::Cuda >::value )
		{
		@@ -102,7 +97,6 @@ initInterface( const MeshFunctionPointer& _input,
		CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
		_output.template modifyData< Device >(),
		_interfaceMap.template modifyData< Device >() );
		//CudaInitCaller<<< gridSize, blockSize >>>( input );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		#endif
		@@ -215,6 +209,7 @@ template< typename Real,
		typename Device,
		typename Index >
		template< typename MeshEntity >
		__cuda_callable__
		void
		tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
		updateCell( MeshFunctionType& u,
		@@ -300,6 +295,29 @@ tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
		initInterface( const MeshFunctionPointer& _input,
		MeshFunctionPointer& _output,
		InterfaceMapPointer& _interfaceMap )
		{
		if( std::is_same< Device, Devices::Cuda >::value )
		{
		#ifdef HAVE_CUDA
		const MeshType& mesh = _input->getMesh();

		const int cudaBlockSize( 8 );
		int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
		int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
		int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize );
		if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 \|\| numBlocksX > 1024 \|\| numBlocksY > 1024 \|\| numBlocksZ > 64 )
		std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
		dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
		dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
		Devices::Cuda::synchronizeDevice();
		CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
		_output.template modifyData< Device >(),
		_interfaceMap.template modifyData< Device >() );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		#endif
		}
		if( std::is_same< Device, Devices::Host >::value )
		{
		const MeshFunctionType& input = _input.getData();
		MeshFunctionType& output = _output.modifyData();
		@@ -452,11 +470,13 @@ initInterface( const MeshFunctionPointer& _input,
		interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245
		}
		}
		}

		template< typename Real,
		typename Device,
		typename Index >
		template< typename MeshEntity >
		__cuda_callable__
		void
		tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
		updateCell( MeshFunctionType& u,
		@@ -628,12 +648,13 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
		Cell cell( mesh );
		cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
		cell.refresh();
		const Index cind = cell.getIndex();


		output[ cell.getIndex() ] =
		output[ cind ] =
		input( cell ) >= 0 ? TypeInfo< Real >::getMaxValue() :
		- TypeInfo< Real >::getMaxValue();
		interfaceMap[ cell.getIndex() ] = false;
		interfaceMap[ cind ] = false;

		const Real& hx = mesh.getSpaceSteps().x();
		const Real& hy = mesh.getSpaceSteps().y();
		@@ -651,44 +672,127 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
		if( c * input[ n ] <= 0 )
		{
		pom = TNL::sign( c )( hy c )/( c - input[ n ]);
		if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
		output[ cell.getIndex() ] = pom;
		if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
		output[ cind ] = pom;

		interfaceMap[ cell.getIndex() ] = true;
		}
		if( c * input[ e ] <= 0 )
		{
		pom = TNL::sign( c )( hx c )/( c - input[ e ]);
		if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
		output[ cell.getIndex() ] = pom;
		if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
		output[ cind ] = pom;

		interfaceMap[ cell.getIndex() ] = true;
		interfaceMap[ cind ] = true;
		}
		if( c * input[ w ] <= 0 )
		{
		pom = TNL::sign( c )( hx c )/( c - input[ w ]);
		if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
		output[ cell.getIndex() ] = pom;
		if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
		output[ cind ] = pom;

		interfaceMap[ cell.getIndex() ] = true;
		interfaceMap[ cind ] = true;
		}
		if( c * input[ s ] <= 0 )
		{
		pom = TNL::sign( c )( hy c )/( c - input[ s ]);
		if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
		output[ cell.getIndex() ] = pom;
		if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
		output[ cind ] = pom;

		interfaceMap[ cell.getIndex() ] = true;
		interfaceMap[ cind ] = true;
		}
		}
		}
		}


		/*__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, double, TNL::Devices::Cuda, int > >& input )
		template < typename Real, typename Device, typename Index >
		__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap )
		{
		int i = threadIdx.x + blockDim.x*blockIdx.x;
		int j = blockDim.y*blockIdx.y + threadIdx.y;
		//const Meshes::Grid< 2, double, TNL::Devices::Cuda, int >& mesh = input.getMesh();
		int k = blockDim.z*blockIdx.z + threadIdx.z;
		const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();

		}*/
		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
		{
		typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
		Cell cell( mesh );
		cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
		cell.refresh();
		const Index cind = cell.getIndex();


		output[ cind ] =
		input( cell ) >= 0 ? TypeInfo< Real >::getMaxValue() :
		- TypeInfo< Real >::getMaxValue();
		interfaceMap[ cind ] = false;
		cell.refresh();

		const Real& hx = mesh.getSpaceSteps().x();
		const Real& hy = mesh.getSpaceSteps().y();
		const Real& hz = mesh.getSpaceSteps().z();
		const Real& c = input( cell );
		if( ! cell.isBoundaryEntity() )
		{
		auto neighbors = cell.getNeighborEntities();
		Real pom = 0;
		const Index e = neighbors.template getEntityIndex< 1, 0, 0 >();
		const Index w = neighbors.template getEntityIndex< -1, 0, 0 >();
		const Index n = neighbors.template getEntityIndex< 0, 1, 0 >();
		const Index s = neighbors.template getEntityIndex< 0, -1, 0 >();
		const Index t = neighbors.template getEntityIndex< 0, 0, 1 >();
		const Index b = neighbors.template getEntityIndex< 0, 0, -1 >();

		if( c * input[ n ] <= 0 )
		{
		pom = TNL::sign( c )( hy c )/( c - input[ n ]);
		if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
		output[ cind ] = pom;

		interfaceMap[ cind ] = true;
		}
		if( c * input[ e ] <= 0 )
		{
		pom = TNL::sign( c )( hx c )/( c - input[ e ]);
		if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
		output[ cind ] = pom;

		interfaceMap[ cind ] = true;
		}
		if( c * input[ w ] <= 0 )
		{
		pom = TNL::sign( c )( hx c )/( c - input[ w ]);
		if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
		output[ cind ] = pom;

		interfaceMap[ cind ] = true;
		}
		if( c * input[ s ] <= 0 )
		{
		pom = TNL::sign( c )( hy c )/( c - input[ s ]);
		if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
		output[ cind ] = pom;

		interfaceMap[ cind ] = true;
		}
		if( c * input[ b ] <= 0 )
		{
		pom = TNL::sign( c )( hz c )/( c - input[ b ]);
		if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
		output[ cind ] = pom;

		interfaceMap[ cind ] = true;
		}
		if( c * input[ t ] <= 0 )
		{
		pom = TNL::sign( c )( hz c )/( c - input[ t ]);
		if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
		output[ cind ] = pom;

		interfaceMap[ cind ] = true;
		}
		}
		}
		}

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h

+2 −0

Original line number	Diff line number	Diff line
		@@ -15,6 +15,7 @@
		#include <TNL/SharedPointer.h>
		#include "tnlDirectEikonalMethodsBase.h"


		template< typename Mesh,
		typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > >
		class FastSweepingMethod
		@@ -145,6 +146,7 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
		const IndexType maxIterations;
		};


		#include "tnlFastSweepingMethod1D_impl.h"
		#include "tnlFastSweepingMethod2D_impl.h"
		#include "tnlFastSweepingMethod3D_impl.h"

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h

+30 −19

Original line number	Diff line number	Diff line
		@@ -14,6 +14,9 @@
		#pragma once

		#include "tnlFastSweepingMethod.h"
		#include <TNL/TypeInfo.h>
		#include <TNL/Devices/Cuda.h>


		template< typename Real,
		typename Device,
		@@ -212,15 +215,23 @@ solve( const MeshPointer& mesh,
		{
		// TODO: CUDA code
		#ifdef HAVE_CUDA
		/*int numBlocks = 2;
		int threadsPerBlock;
		if( mesh->getDimensions().x() >= mesh->getDimensions().y() )
		threadsPerBlock = (int)( mesh->getDimensions().x() );
		else
		threadsPerBlock = (int)( mesh->getDimensions().y() );
		const int cudaBlockSize( 16 );
		int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
		int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
		dim3 blockSize( cudaBlockSize, cudaBlockSize );
		dim3 gridSize( numBlocksX, numBlocksY );
		Devices::Cuda::synchronizeDevice();
		int DIM = mesh->getDimensions().x();

		CudaUpdateCellCaller< Real, Device, Index ><<< numBlocks, threadsPerBlock >>>( interfaceMap, aux );
		cudaDeviceSynchronize(); //copak dela?*/
		tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
		for( int k = 0; k < numBlocksX; k++)
		CudaUpdateCellCaller< Real, Device, Index ><<< gridSize, blockSize >>>( ptr,
		interfaceMapPtr.template getData< Device >(),
		auxPtr.template modifyData< Device>() );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		aux = *auxPtr;
		interfaceMap = *interfaceMapPtr;
		#endif
		}
		iteration++;
		@@ -228,30 +239,30 @@ solve( const MeshPointer& mesh,
		aux.save("aux-final.tnl");
		}

		//#ifdef HAVE_CUDA
		template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux )
		{
		int i = threadIdx.x + blockDim.x*blockIdx.x;
		int j = threadIdx.y + blockDim.y*blockIdx.y;
		const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.getMesh();
		int j = blockDim.y*blockIdx.y + threadIdx.y;
		const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();

		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
		{
		//make cell of aux from index
		typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
		Cell cell( mesh );
		cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
		cell.refresh();

		//update cell value few times
		//for( int i = 0; i < mesh.getDimensions() ; i++ )
		//{
		cell.refresh();
		//tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
		for( int k = 0; k < 16; k++ )
		{
		if( ! interfaceMap( cell ) )
		{
		// tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::updateCell( aux, cell );
		ptr.updateCell( aux, cell );
		}
		}
		//}
		}
		}
		//#endif
		No newline at end of file

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h

+218 −162

Original line number	Diff line number	Diff line
		@@ -21,7 +21,7 @@ template< typename Real,
		typename Anisotropy >
		FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
		FastSweepingMethod()
		: maxIterations( 1 )
		: maxIterations( 2 )
		{

		}
		@@ -64,6 +64,7 @@ solve( const MeshPointer& mesh,
		interfaceMapPtr->setMesh( mesh );
		std::cout << "Initiating the interface cells ..." << std::endl;
		BaseType::initInterface( u, auxPtr, interfaceMapPtr );
		cudaDeviceSynchronize();
		auxPtr->save( "aux-ini.tnl" );

		typename MeshType::Cell cell( *mesh );
		@@ -72,6 +73,8 @@ solve( const MeshPointer& mesh,
		MeshFunctionType aux = *auxPtr;
		InterfaceMapType interfaceMap = * interfaceMapPtr;
		while( iteration < this->maxIterations )
		{
		if( std::is_same< DeviceType, Devices::Host >::value )
		{
		for( cell.getCoordinates().z() = 0;
		cell.getCoordinates().z() < mesh->getDimensions().z();
		@@ -237,6 +240,33 @@ solve( const MeshPointer& mesh,
		}
		}
		}
		}
		if( std::is_same< DeviceType, Devices::Cuda >::value )
		{
		// TODO: CUDA code
		#ifdef HAVE_CUDA
		const int cudaBlockSize( 8 );
		int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
		int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
		int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize );
		if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 \|\| numBlocksX > 1024 \|\| numBlocksY > 1024 \|\| numBlocksZ > 64 )
		std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
		dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
		dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
		Devices::Cuda::synchronizeDevice();

		tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
		for( int k = 0; k < numBlocksX; k++)
		CudaUpdateCellCaller< Real, Device, Index ><<< gridSize, blockSize >>>( ptr,
		interfaceMapPtr.template getData< Device >(),
		auxPtr.template modifyData< Device>() );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		aux = *auxPtr;
		interfaceMap = *interfaceMapPtr;
		#endif
		}

		//aux.save( "aux-8.tnl" );
		iteration++;

		@@ -244,3 +274,29 @@ solve( const MeshPointer& mesh,
		aux.save("aux-final.tnl");
		}

		template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux )
		{
		int i = threadIdx.x + blockDim.x*blockIdx.x;
		int j = blockDim.y*blockIdx.y + threadIdx.y;
		int k = blockDim.z*blockIdx.z + threadIdx.z;
		const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();

		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
		{
		typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
		Cell cell( mesh );
		cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
		cell.refresh();
		//tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
		for( int l = 0; l < 8; l++ )
		{
		if( ! interfaceMap( cell ) )
		{
		ptr.updateCell( aux, cell );
		}
		}
		}
		}
		No newline at end of file