Commit be8f2ac4 authored by Tomáš Oberhuber, committed by Jakub Klinkovský

GridTraverser_impl.h split into GridTraverser_1D.hpp, GridTraverser_2D.hpp and GridTraverser_3D.hpp.
parent b4a904e4
Merge request !20: Traversers optimizations
@@ -74,7 +74,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
 endif()
 # set Debug/Release options
-set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable" )
+set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -g" )
 set( CMAKE_CXX_FLAGS_DEBUG "-g" )
 set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" )
 #set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" )
@@ -229,7 +229,7 @@ if( ${WITH_CUDA} )
 endif()
 endif()
 endif()
-set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES )
+set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES --generate-line-info )
 # TODO: this is necessary only due to a bug in cmake
 set( CUDA_ADD_LIBRARY_OPTIONS -shared )
 endif()
/***************************************************************************
                          GridTraverser_1D.hpp  -  description
                             -------------------
    begin                : Jan 4, 2019
    copyright            : (C) 2019 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Tomas Oberhuber,
//                 Jakub Klinkovsky,
//                 Vit Hanousek
#pragma once
#include <TNL/Devices/MIC.h>
#include <TNL/Meshes/Grid.h>
#include <TNL/Pointers/SharedPointer.h>
#include <TNL/CudaStreamPool.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Meshes/GridDetails/GridTraverser.h>
namespace TNL {
namespace Meshes {
/****
* 1D traverser, host
*/
template< typename Real,
typename Index >
template<
typename GridEntity,
typename EntitiesProcessor,
typename UserData,
bool processOnlyBoundaryEntities >
void
GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >::
processEntities(
const GridPointer& gridPointer,
const CoordinatesType begin,
const CoordinatesType end,
UserData& userData,
const int& stream )
{
   if( processOnlyBoundaryEntities )
   {
      GridEntity entity( *gridPointer );
entity.getCoordinates() = begin;
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
entity.getCoordinates() = end;
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
else
{
#ifdef HAVE_OPENMP
if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 )
{
#pragma omp parallel firstprivate( begin, end )
{
GridEntity entity( *gridPointer );
#pragma omp for
            // TODO: g++ 5.5 crashes when this loop is coded without the auxiliary x as below
for( IndexType x = begin.x(); x <= end.x(); x++ )
{
entity.getCoordinates().x() = x;
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
}
}
else
{
GridEntity entity( *gridPointer );
for( entity.getCoordinates().x() = begin.x();
entity.getCoordinates().x() <= end.x();
entity.getCoordinates().x() ++ )
{
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
}
#else
GridEntity entity( *gridPointer );
for( entity.getCoordinates().x() = begin.x();
entity.getCoordinates().x() <= end.x();
entity.getCoordinates().x() ++ )
{
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
#endif
}
}
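/****
 * Usage sketch (illustration only, not part of TNL). The traverser above is
 * driven by an EntitiesProcessor type that provides a static
 * processEntity( mesh, userData, entity ) method matching the calls made in
 * processEntities(). MyProcessor and MyUserData below are hypothetical names
 * and the grid setup call is assumed; the exact Grid initialization API may differ.
 *
 *    using GridType = Meshes::Grid< 1, double, Devices::Host, int >;
 *    using GridPointer = Pointers::SharedPointer< GridType >;
 *
 *    struct MyUserData { double sum = 0.0; };
 *
 *    struct MyProcessor
 *    {
 *       template< typename GridEntity >
 *       static void processEntity( const GridType& grid, MyUserData& userData, const GridEntity& entity )
 *       {
 *          userData.sum += entity.getCoordinates().x();   // any per-entity work
 *       }
 *    };
 *
 *    GridPointer grid;
 *    grid->setDimensions( 100 );                          // assumed setup call
 *    MyUserData userData;
 *    GridTraverser< GridType >::processEntities< GridType::Cell, MyProcessor, MyUserData, false >(
 *       grid,
 *       GridType::CoordinatesType( 0 ),                   // begin (inclusive)
 *       GridType::CoordinatesType( 99 ),                  // end (inclusive)
 *       userData,
 *       0 );                                              // stream index (unused on host)
 */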
/****
* 1D traverser, CUDA
*/
#ifdef HAVE_CUDA
template< typename Real,
typename Index,
typename GridEntity,
typename UserData,
typename EntitiesProcessor >
__global__ void
GridTraverser1D(
const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
UserData userData,
const typename GridEntity::CoordinatesType begin,
const typename GridEntity::CoordinatesType end,
const Index gridIdx )
{
typedef Real RealType;
typedef Index IndexType;
typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
typename GridType::CoordinatesType coordinates;
coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
if( coordinates <= end )
{
GridEntity entity( *grid, coordinates );
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
}
template< typename Real,
typename Index,
typename GridEntity,
typename UserData,
typename EntitiesProcessor >
__global__ void
GridBoundaryTraverser1D(
const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
UserData userData,
const typename GridEntity::CoordinatesType begin,
const typename GridEntity::CoordinatesType end )
{
typedef Real RealType;
typedef Index IndexType;
typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
typename GridType::CoordinatesType coordinates;
if( threadIdx.x == 0 )
{
coordinates.x() = begin.x();
GridEntity entity( *grid, coordinates );
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
if( threadIdx.x == 1 )
{
coordinates.x() = end.x();
GridEntity entity( *grid, coordinates );
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
}
#endif
template< typename Real,
typename Index >
template<
typename GridEntity,
typename EntitiesProcessor,
typename UserData,
bool processOnlyBoundaryEntities >
void
GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >::
processEntities(
const GridPointer& gridPointer,
const CoordinatesType& begin,
const CoordinatesType& end,
UserData& userData,
const int& stream )
{
#ifdef HAVE_CUDA
auto& pool = CudaStreamPool::getInstance();
const cudaStream_t& s = pool.getStream( stream );
Devices::Cuda::synchronizeDevice();
if( processOnlyBoundaryEntities )
{
dim3 cudaBlockSize( 2 );
dim3 cudaBlocks( 1 );
GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
<<< cudaBlocks, cudaBlockSize, 0, s >>>
( &gridPointer.template getData< Devices::Cuda >(),
userData,
begin,
end );
}
else
{
dim3 cudaBlockSize( 256 );
dim3 cudaBlocks;
cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
<<< cudaBlocks, cudaBlockSize, 0, s >>>
( &gridPointer.template getData< Devices::Cuda >(),
userData,
begin,
end,
gridXIdx );
}
// only launches into the stream 0 are synchronized
/*if( stream == 0 )
{
cudaStreamSynchronize( s );
TNL_CHECK_CUDA_DEVICE;
}*/
#else
throw Exceptions::CudaSupportMissing();
#endif
}
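/****
 * Note on the launch decomposition above (explanatory remark, not part of the
 * original code): the traversed range may need more blocks than one CUDA grid
 * can hold, so the kernel is launched cudaXGrids times and each launch shifts
 * the coordinate by gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x.
 * For example, assuming a maximum grid size of 65535 blocks and the 256-thread
 * blocks used above, a range of 20 000 000 cells gives
 *
 *    cudaBlocks.x = ( 20000000 + 256 - 1 ) / 256  = 78125   // getNumberOfBlocks
 *    cudaXGrids   = ( 78125 + 65535 - 1 ) / 65535 = 2       // getNumberOfGrids
 *
 * i.e. two launches; the 'coordinates <= end' test inside GridTraverser1D
 * discards the threads of the last launch that fall beyond the range.
 */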
/****
* 1D traverser, MIC
*/
template< typename Real,
typename Index >
template<
typename GridEntity,
typename EntitiesProcessor,
typename UserData,
bool processOnlyBoundaryEntities >
void
GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >::
processEntities(
const GridPointer& gridPointer,
const CoordinatesType& begin,
const CoordinatesType& end,
UserData& userData,
const int& stream )
{
   std::cout << "GridTraverser< Grid< 1, Real, Devices::MIC > >::processEntities is not implemented yet." << std::endl;
/*
auto& pool = CudaStreamPool::getInstance();
const cudaStream_t& s = pool.getStream( stream );
Devices::Cuda::synchronizeDevice();
if( processOnlyBoundaryEntities )
{
dim3 cudaBlockSize( 2 );
dim3 cudaBlocks( 1 );
GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
<<< cudaBlocks, cudaBlockSize, 0, s >>>
( &gridPointer.template getData< Devices::Cuda >(),
userData,
begin,
end );
}
else
{
dim3 cudaBlockSize( 256 );
dim3 cudaBlocks;
cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
<<< cudaBlocks, cudaBlockSize, 0, s >>>
( &gridPointer.template getData< Devices::Cuda >(),
userData,
begin,
end,
gridXIdx );
}
// only launches into the stream 0 are synchronized
if( stream == 0 )
{
cudaStreamSynchronize( s );
TNL_CHECK_CUDA_DEVICE;
}
*/
}
} // namespace Meshes
} // namespace TNL