Commit 5b42f5bc authored by Vít Hanousek

Merge branch 'mpi' into anselm-special-log

parents 393c5ba5 6987efa0
@@ -409,10 +409,6 @@ endif()
# endif()
#endif()
if( OPTIMIZED_VECTOR_HOST_OPERATIONS STREQUAL "yes" )
AddCompilerFlag( "-DOPTIMIZED_VECTOR_HOST_OPERATIONS " )
endif()
CONFIGURE_FILE( "tnlConfig.h.in" "${PROJECT_BUILD_PATH}/TNL/tnlConfig.h" )
INSTALL( FILES ${PROJECT_BUILD_PATH}/TNL/tnlConfig.h DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY} )
@@ -34,7 +34,6 @@ INSTANTIATE_INT="yes"
INSTANTIATE_LONG_DOUBLE="no"
INSTANTIATE_DOUBLE="yes"
INSTANTIATE_FLOAT="no"
OPTIMIZED_VECTOR_HOST_OPERATIONS="no"
for option in "$@"
do
@@ -75,7 +74,6 @@ do
INSTANTIATE_DOUBLE="yes"
INSTANTIATE_FLOAT="no"
WITH_CUDA_ARCH="auto" ;;
--optimize-vector-host-operations=* ) OPTIMIZED_VECTOR_HOST_OPERATIONS="yes" ;;
* )
echo "Unknown option ${option}. Use --help for more information."
exit 1 ;;
@@ -175,7 +173,6 @@ cmake_command=(
-DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE}
-DINSTANTIATE_INT=${INSTANTIATE_INT}
-DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT}
-DOPTIMIZED_VECTOR_HOST_OPERATIONS=${OPTIMIZED_VECTOR_HOST_OPERATIONS}
)
# Skip running cmake if it was already run and the cmake command is the same.
@@ -87,23 +87,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
auto maxHost = [&]() {
resultHost = hostVector.max();
};
auto maxHostGeneral = [&]() {
Real result( 0 );
Containers::Algorithms::ParallelReductionMax< Real > operation;
Containers::Algorithms::Reduction< Devices::Host >::reduce(
operation,
hostVector.getSize(),
hostVector.getData(),
( Real* ) 0,
result );
return result;
};
auto maxCuda = [&]() {
resultDevice = deviceVector.max();
};
benchmark.setOperation( "max", datasetSize );
benchmark.time( reset1, "CPU", maxHost );
benchmark.time( reset1, "CPU (general)", maxHostGeneral );
#ifdef HAVE_CUDA
benchmark.time( reset1, "GPU", maxCuda );
#endif
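
Throughout this benchmark each measured variant is a capturing lambda handed to benchmark.time() together with a reset functor that restores the input data. Stripped of the Benchmark class's bookkeeping, the pattern reduces to roughly the following sketch (a hypothetical helper, not TNL's actual implementation, which also repeats the measurement and derives bandwidth figures):

#include <chrono>

template< typename Reset, typename Compute >
double timeOperation( Reset reset, Compute compute )
{
   reset();                                       // restore input state
   const auto t0 = std::chrono::high_resolution_clock::now();
   compute();                                     // e.g. resultHost = hostVector.max()
   const auto t1 = std::chrono::high_resolution_clock::now();
   return std::chrono::duration< double >( t1 - t0 ).count();   // seconds
}
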
@@ -112,23 +100,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
auto minHost = [&]() {
resultHost = hostVector.min();
};
auto minHostGeneral = [&]() {
Real result( 0 );
Containers::Algorithms::ParallelReductionMin< Real > operation;
Containers::Algorithms::Reduction< Devices::Host >::reduce(
operation,
hostVector.getSize(),
hostVector.getData(),
( Real* ) 0,
result );
return result;
};
auto minCuda = [&]() {
resultDevice = deviceVector.min();
};
benchmark.setOperation( "min", datasetSize );
benchmark.time( reset1, "CPU", minHost );
benchmark.time( reset1, "CPU (general)", minHostGeneral );
#ifdef HAVE_CUDA
benchmark.time( reset1, "GPU", minCuda );
#endif
@@ -137,17 +113,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
auto absMaxHost = [&]() {
resultHost = hostVector.absMax();
};
auto absMaxHostGeneral = [&]() {
Real result( 0 );
Containers::Algorithms::ParallelReductionAbsMax< Real > operation;
Containers::Algorithms::Reduction< Devices::Host >::reduce(
operation,
hostVector.getSize(),
hostVector.getData(),
( Real* ) 0,
result );
return result;
};
auto absMaxCuda = [&]() {
resultDevice = deviceVector.absMax();
};
@@ -162,7 +127,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
benchmark.setOperation( "absMax", datasetSize );
benchmark.time( reset1, "CPU", absMaxHost );
benchmark.time( reset1, "CPU (general)", absMaxHostGeneral );
#ifdef HAVE_CUDA
benchmark.time( reset1, "GPU", absMaxCuda );
benchmark.time( reset1, "cuBLAS", absMaxCublas );
@@ -172,17 +136,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
auto absMinHost = [&]() {
resultHost = hostVector.absMin();
};
auto absMinHostGeneral = [&]() {
Real result( 0 );
Containers::Algorithms::ParallelReductionAbsMin< Real > operation;
Containers::Algorithms::Reduction< Devices::Host >::reduce(
operation,
hostVector.getSize(),
hostVector.getData(),
( Real* ) 0,
result );
return result;
};
auto absMinCuda = [&]() {
resultDevice = deviceVector.absMin();
};
@@ -197,7 +150,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
benchmark.setOperation( "absMin", datasetSize );
benchmark.time( reset1, "CPU", absMinHost );
benchmark.time( reset1, "CPU (general)", absMinHostGeneral );
#ifdef HAVE_CUDA
benchmark.time( reset1, "GPU", absMinCuda );
benchmark.time( reset1, "cuBLAS", absMinCublas );
@@ -207,23 +159,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
auto sumHost = [&]() {
resultHost = hostVector.sum();
};
auto sumHostGeneral = [&]() {
Real result( 0 );
Containers::Algorithms::ParallelReductionSum< Real > operation;
Containers::Algorithms::Reduction< Devices::Host >::reduce(
operation,
hostVector.getSize(),
hostVector.getData(),
( Real* ) 0,
result );
return result;
};
auto sumCuda = [&]() {
resultDevice = deviceVector.sum();
};
benchmark.setOperation( "sum", datasetSize );
benchmark.time( reset1, "CPU", sumHost );
benchmark.time( reset1, "CPU (general)", sumHostGeneral );
#ifdef HAVE_CUDA
benchmark.time( reset1, "GPU", sumCuda );
#endif
@@ -232,17 +172,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
auto l1normHost = [&]() {
resultHost = hostVector.lpNorm( 1.0 );
};
auto l1normHostGeneral = [&]() {
Real result( 0 );
Containers::Algorithms::ParallelReductionAbsSum< Real > operation;
Containers::Algorithms::Reduction< Devices::Host >::reduce(
operation,
hostVector.getSize(),
hostVector.getData(),
( Real* ) 0,
result );
return result;
};
auto l1normCuda = [&]() {
resultDevice = deviceVector.lpNorm( 1.0 );
};
@@ -255,7 +184,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
benchmark.setOperation( "l1 norm", datasetSize );
benchmark.time( reset1, "CPU", l1normHost );
benchmark.time( reset1, "CPU (general)", l1normHostGeneral );
#ifdef HAVE_CUDA
benchmark.time( reset1, "GPU", l1normCuda );
benchmark.time( reset1, "cuBLAS", l1normCublas );
@@ -265,17 +193,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
auto l2normHost = [&]() {
resultHost = hostVector.lpNorm( 2.0 );
};
auto l2normHostGeneral = [&]() {
Real result( 0 );
Containers::Algorithms::ParallelReductionL2Norm< Real > operation;
Containers::Algorithms::Reduction< Devices::Host >::reduce(
operation,
hostVector.getSize(),
hostVector.getData(),
( Real* ) 0,
result );
return result;
};
auto l2normCuda = [&]() {
resultDevice = deviceVector.lpNorm( 2.0 );
};
@@ -288,7 +205,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
benchmark.setOperation( "l2 norm", datasetSize );
benchmark.time( reset1, "CPU", l2normHost );
benchmark.time( reset1, "CPU (general)", l2normHostGeneral );
#ifdef HAVE_CUDA
benchmark.time( reset1, "GPU", l2normCuda );
benchmark.time( reset1, "cuBLAS", l2normCublas );
@@ -298,24 +214,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
auto l3normHost = [&]() {
resultHost = hostVector.lpNorm( 3.0 );
};
auto l3normHostGeneral = [&]() {
Real result( 0 );
Containers::Algorithms::ParallelReductionLpNorm< Real > operation;
operation.setPower( 3.0 );
Containers::Algorithms::Reduction< Devices::Host >::reduce(
operation,
hostVector.getSize(),
hostVector.getData(),
( Real* ) 0,
result );
return result;
};
auto l3normCuda = [&]() {
resultDevice = deviceVector.lpNorm( 3.0 );
};
benchmark.setOperation( "l3 norm", datasetSize );
benchmark.time( reset1, "CPU", l3normHost );
benchmark.time( reset1, "CPU (general)", l3normHostGeneral );
#ifdef HAVE_CUDA
benchmark.time( reset1, "GPU", l3normCuda );
#endif
@@ -324,17 +227,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
auto scalarProductHost = [&]() {
resultHost = hostVector.scalarProduct( hostVector2 );
};
auto scalarProductHostGeneral = [&]() {
Real result( 0 );
Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation;
Containers::Algorithms::Reduction< Devices::Host >::reduce(
operation,
hostVector.getSize(),
hostVector.getData(),
hostVector2.getData(),
result );
return result;
};
auto scalarProductCuda = [&]() {
resultDevice = deviceVector.scalarProduct( deviceVector2 );
};
@@ -348,7 +240,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
benchmark.setOperation( "scalar product", 2 * datasetSize );
benchmark.time( reset1, "CPU", scalarProductHost );
benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral );
#ifdef HAVE_CUDA
benchmark.time( reset1, "GPU", scalarProductCuda );
benchmark.time( reset1, "cuBLAS", scalarProductCublas );
@@ -514,7 +514,7 @@ getExplicitUpdate( const RealType& time,
cell.getBasis(),
gridXIdx,
gridYIdx );
cudaThreadSynchronize();
cudaDeviceSynchronize();
TNL_CHECK_CUDA_DEVICE;
//std::cerr << "Computing the heat equation ..." << std::endl;
@@ -534,7 +534,7 @@ getExplicitUpdate( const RealType& time,
cell.getBasis(),
gridXIdx,
gridYIdx );
cudaThreadSynchronize();
cudaDeviceSynchronize();
TNL_CHECK_CUDA_DEVICE;
}
if( this->cudaKernelType == "templated" )
@@ -47,7 +47,7 @@ int main( int argc, char* argv[] )
while( iteration < 10000 )
{
testKernel< GridEntity ><<< cudaGridSize, cudaBlockSize >>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
iteration++;
}
auto t_stop = std::chrono::high_resolution_clock::now();
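
cudaThreadSynchronize() has been deprecated since CUDA 4.0; cudaDeviceSynchronize() is its drop-in replacement with the same blocking semantics, which is why the substitution here and in the surrounding files is purely mechanical. The barrier is also what makes the timing above meaningful, since kernel launches return to the host immediately. A minimal sketch with a hypothetical kernel:

#include <cuda_runtime.h>
#include <chrono>
#include <cstdio>

__global__ void dummyKernel() {}   // hypothetical stand-in for testKernel< GridEntity >

int main()
{
   const auto t_start = std::chrono::high_resolution_clock::now();
   for( int iteration = 0; iteration < 10000; iteration++ ) {
      dummyKernel<<< 64, 256 >>>();
      // without this call the loop would measure only launch overhead
      cudaDeviceSynchronize();
   }
   const auto t_stop = std::chrono::high_resolution_clock::now();
   std::printf( "%g s\n", std::chrono::duration< double >( t_stop - t_start ).count() );
   return 0;
}
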
@@ -379,7 +379,7 @@ bool solveHeatEquationCuda( const Config::ParameterContainer& parameters,
return false;
}
cudaThreadSynchronize();
cudaDeviceSynchronize();
cudaMemcpy( max_du, cuda_max_du, cudaUpdateBlocks.x * sizeof( Real ), cudaMemcpyDeviceToHost );
if( ( cudaErr = cudaGetLastError() ) != cudaSuccess )
{
@@ -250,7 +250,7 @@ double benchmarkMatrix( const Matrix& matrix,
matrix.vectorProduct( x, b );
#ifdef HAVE_CUDA
if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value )
cudaThreadSynchronize();
cudaDeviceSynchronize();
#endif
time = timer.getRealTime();
iterations++;
@@ -60,7 +60,7 @@ class CusparseCSRBase
void vectorProduct( const InVector& inVector,
OutVector& outVector ) const
{
TNL_ASSERT( matrix, );
TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
#ifdef HAVE_CUDA
cusparseDcsrmv( *( this->cusparseHandle ),
CUSPARSE_OPERATION_NON_TRANSPOSE,
@@ -103,7 +103,7 @@ class CusparseCSR< double > : public CusparseCSRBase< double >
void vectorProduct( const InVector& inVector,
OutVector& outVector ) const
{
TNL_ASSERT( matrix, "" );
TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
#ifdef HAVE_CUDA
double d = 1.0;
double* alpha = &d;
@@ -134,7 +134,7 @@ class CusparseCSR< float > : public CusparseCSRBase< float >
void vectorProduct( const InVector& inVector,
OutVector& outVector ) const
{
TNL_ASSERT( matrix, "" );
TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
#ifdef HAVE_CUDA
float d = 1.0;
float* alpha = &d;
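
The bare TNL_ASSERT( matrix, ) and TNL_ASSERT( matrix, "" ) calls are migrated to TNL_ASSERT_TRUE, which takes a mandatory diagnostic message. The shape of such a message-bearing assertion, sketched for illustration only (TNL's real macro additionally reports the source location and is typically compiled out of release builds):

#include <cstdio>
#include <cstdlib>

#define ASSERT_TRUE( condition, message )                      \
   do {                                                        \
      if( ! ( condition ) ) {                                  \
         std::fprintf( stderr, "Assertion '%s' failed: %s\n",  \
                       #condition, message );                  \
         std::abort();                                         \
      }                                                        \
   } while( false )

// usage mirroring the call sites above:
//    ASSERT_TRUE( matrix, "matrix was not initialized" );
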
@@ -51,6 +51,8 @@ void export_Matrix( py::module & m, const char* name )
using VectorType = TNL::Containers::Vector< typename Matrix::RealType, typename Matrix::DeviceType, typename Matrix::IndexType >;
void (Matrix::* _getCompressedRowLengths)(typename Matrix::CompressedRowLengthsVector&) const = &Matrix::getCompressedRowLengths;
auto matrix = py::class_< Matrix, TNL::Object >( m, name )
.def(py::init<>())
// overloads (defined in Object)
@@ -69,7 +71,7 @@ void export_Matrix( py::module & m, const char* name )
.def("setDimensions", &Matrix::setDimensions)
.def("setCompressedRowLengths", &Matrix::setCompressedRowLengths)
.def("getRowLength", &Matrix::getRowLength)
.def("getCompressedRowLengths", &Matrix::getCompressedRowLengths)
.def("getCompressedRowLengths", _getCompressedRowLengths)
// TODO: export for more types
.def("setLike", &Matrix::template setLike< typename Matrix::RealType, typename Matrix::DeviceType, typename Matrix::IndexType >)
.def("getNumberOfMatrixElements", &Matrix::getNumberOfMatrixElements)
@@ -14,8 +14,6 @@ ADD_SUBDIRECTORY( Pointers )
ADD_SUBDIRECTORY( Problems )
ADD_SUBDIRECTORY( Solvers )
ADD_SUBDIRECTORY( legacy )
SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL )
set( headers
@@ -57,8 +55,6 @@ set( tnl_SOURCES ${tnl_config_SOURCES}
${tnl_pointers_SOURCES}
${tnl_solvers_SOURCES}
${tnl_legacy_SOURCES}
${common_SOURCES} )
set( tnl_CUDA__SOURCES ${tnl_config_CUDA__SOURCES}
@@ -73,7 +69,6 @@ set( tnl_CUDA__SOURCES ${tnl_config_CUDA__SOURCES}
${tnl_problems_CUDA__SOURCES}
${tnl_solvers_CUDA__SOURCES}
${tnl_legacy_CUDA__SOURCES}
${common_SOURCES} )
@@ -314,12 +314,12 @@ class MpiCommunicator
}
template< typename T >
static void Bcast( T& data, int count, int root,CommunicationGroup group)
static void Bcast( T* data, int count, int root, CommunicationGroup group)
{
#ifdef HAVE_MPI
TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
TNL_ASSERT_NE(group, NullGroup, "BCast cannot be called with NullGroup");
MPI_Bcast((void*) &data, count, MPIDataType(data), root, group);
MPI_Bcast((void*) data, count, MPIDataType(data), root, group);
#else
throw Exceptions::MPISupportMissing();
#endif
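
With the old reference signature, broadcasting count > 1 elements relied on the reference aliasing the first element of a contiguous buffer, and the wrapper's internal &data hid that contract; the pointer signature makes the buffer explicit at the call site. Against plain MPI the pattern reads:

#include <mpi.h>
#include <vector>

int main( int argc, char* argv[] )
{
   MPI_Init( &argc, &argv );
   std::vector< double > buffer( 100 );   // rank 0 fills this before the call
   // rank 0's contents are replicated into every rank's buffer;
   // count > 1 requires a pointer to a contiguous block
   MPI_Bcast( buffer.data(), 100, MPI_DOUBLE, 0, MPI_COMM_WORLD );
   MPI_Finalize();
   return 0;
}
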
@@ -340,6 +340,21 @@ class MpiCommunicator
#endif
}
// in-place variant of Allreduce
template< typename T >
static void Allreduce( T* data,
int count,
const MPI_Op &op,
CommunicationGroup group)
{
#ifdef HAVE_MPI
TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup");
MPI_Allreduce( MPI_IN_PLACE, (void*) data,count,MPIDataType(data),op,group);
#else
throw Exceptions::MPISupportMissing();
#endif
}
template< typename T >
static void Reduce( const T* data,
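
Passing MPI_IN_PLACE as the send buffer tells MPI_Allreduce to take each rank's contribution from data and to overwrite data with the reduced result, so no separate receive buffer is needed. A self-contained usage sketch:

#include <mpi.h>
#include <vector>

int main( int argc, char* argv[] )
{
   MPI_Init( &argc, &argv );
   int rank;
   MPI_Comm_rank( MPI_COMM_WORLD, &rank );
   std::vector< double > data( 10, rank );   // each rank's local values
   // on return every rank holds the element-wise sum across all ranks,
   // written over its own input
   MPI_Allreduce( MPI_IN_PLACE, data.data(), 10, MPI_DOUBLE,
                  MPI_SUM, MPI_COMM_WORLD );
   MPI_Finalize();
   return 0;
}
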
@@ -93,8 +93,8 @@ class NoDistrCommunicator
{
}
template< typename T >
static void Bcast( T& data, int count, int root, CommunicationGroup group)
template< typename T >
static void Bcast( T* data, int count, int root, CommunicationGroup group)
{
}
@@ -108,6 +108,15 @@ class NoDistrCommunicator
memcpy( ( void* ) reduced_data, ( const void* ) data, count * sizeof( T ) );
}
// in-place variant of Allreduce
template< typename T >
static void Allreduce( T* data,
int count,
const MPI_Op &op,
CommunicationGroup group )
{
}
template< typename T >
static void Reduce( T* data,
T* reduced_data,
@@ -42,14 +42,14 @@ class ArrayOperations< Devices::Host >
static Element getMemoryElement( const Element* data );
template< typename Element, typename Index >
static bool setMemory( Element* data,
static void setMemory( Element* data,
const Element& value,
const Index size );
template< typename DestinationElement,
typename SourceElement,
typename Index >
static bool copyMemory( DestinationElement* destination,
static void copyMemory( DestinationElement* destination,
const SourceElement* source,
const Index size );
@@ -93,14 +93,14 @@ class ArrayOperations< Devices::Cuda >
static Element getMemoryElement( const Element* data );
template< typename Element, typename Index >
static bool setMemory( Element* data,
static void setMemory( Element* data,
const Element& value,
const Index size );
template< typename DestinationElement,
typename SourceElement,
typename Index >
static bool copyMemory( DestinationElement* destination,
static void copyMemory( DestinationElement* destination,
const SourceElement* source,
const Index size );
@@ -132,7 +132,7 @@ class ArrayOperations< Devices::Cuda, Devices::Host >
template< typename DestinationElement,
typename SourceElement,
typename Index >
static bool copyMemory( DestinationElement* destination,
static void copyMemory( DestinationElement* destination,
const SourceElement* source,
const Index size );
@@ -152,7 +152,7 @@ class ArrayOperations< Devices::Host, Devices::Cuda >
template< typename DestinationElement,
typename SourceElement,
typename Index >
static bool copyMemory( DestinationElement* destination,
static void copyMemory( DestinationElement* destination,
const SourceElement* source,
const Index size );
@@ -185,14 +185,14 @@ class ArrayOperations< Devices::MIC >
static Element getMemoryElement( const Element* data );
template< typename Element, typename Index >
static bool setMemory( Element* data,
static void setMemory( Element* data,
const Element& value,
const Index size );
template< typename DestinationElement,
typename SourceElement,
typename Index >
static bool copyMemory( DestinationElement* destination,
static void copyMemory( DestinationElement* destination,
const SourceElement* source,
const Index size );
@@ -224,7 +224,7 @@ class ArrayOperations< Devices::MIC, Devices::Host >
template< typename DestinationElement,
typename SourceElement,
typename Index >
static bool copyMemory( DestinationElement* destination,
static void copyMemory( DestinationElement* destination,
const SourceElement* source,
const Index size );
@@ -244,7 +244,7 @@ class ArrayOperations< Devices::Host, Devices::MIC >
template< typename DestinationElement,
typename SourceElement,
typename Index >
static bool copyMemory( DestinationElement* destination,
static void copyMemory( DestinationElement* destination,
const SourceElement* source,
const Index size );
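
The setMemory/copyMemory family stops returning bool: as the implementations below show, failure is signalled by throwing (e.g. Exceptions::MICSupportMissing), so the return value carried no usable information. At a call site the change reads roughly as follows (a sketch; the include path is assumed, not verified):

#include <TNL/Containers/Algorithms/ArrayOperations.h>
#include <iostream>

using namespace TNL;

int main()
{
   const int size = 5;
   double src[ size ] = { 1, 2, 3, 4, 5 };
   double dst[ size ];
   // before: bool ok = ...::copyMemory( dst, src, size ); if( ! ok ) ...
   // after: success needs no check, errors propagate as exceptions
   try {
      Containers::Algorithms::ArrayOperations< Devices::Host >::copyMemory( dst, src, size );
   }
   catch( const std::exception& e ) {
      std::cerr << e.what() << std::endl;   // e.g. missing device support
      return 1;
   }
   return 0;
}
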
@@ -78,7 +78,7 @@ getMemoryElement( const Element* data )
}
template< typename Element, typename Index >
bool
void
ArrayOperations< Devices::MIC >::
setMemory( Element* data,
const Element& value,
@@ -95,7 +95,6 @@ setMemory( Element* data,
for(int i=0;i<size;i++)
dst[i]=tmp;
}
return true;
#else
throw Exceptions::MICSupportMissing();
#endif
@@ -104,7 +103,7 @@ setMemory( Element* data,
template< typename DestinationElement,
typename SourceElement,
typename Index >
bool
void
ArrayOperations< Devices::MIC >::
copyMemory( DestinationElement* destination,
const SourceElement* source,
@@ -123,7 +122,6 @@ copyMemory( DestinationElement* destination,
{
memcpy(dst_ptr.pointer,src_ptr.pointer,size*sizeof(DestinationElement));
}
return true;
}
else
{
@@ -136,13 +134,10 @@ copyMemory( DestinationElement* destination,
for(int i=0;i<size;i++)
dst_ptr.pointer[i]=src_ptr.pointer[i];
}
return true;
}
#else
throw Exceptions::MICSupportMissing();
#endif
return false;
}
template< typename Element1,
@@ -242,7 +237,7 @@ containsOnlyValue( const Element* data,
template< typename DestinationElement,
typename SourceElement,
typename Index >
bool
void
ArrayOperations< Devices::Host, Devices::MIC >::
copyMemory( DestinationElement* destination,
const SourceElement* source,
@@ -267,7 +262,6 @@ copyMemory( DestinationElement* destination,
}
memcpy((void*)destination,(void*)&tmp,size*sizeof(SourceElement));
return true;
}
else
{
@@ -277,7 +271,6 @@ copyMemory( DestinationElement* destination,
{
memcpy((void*)tmp,src_ptr.pointer,size*sizeof(SourceElement));
}
return true;
}
}
else
@@ -297,7 +290,6 @@ copyMemory( DestinationElement* destination,
}
memcpy((void*)destination,(void*)&tmp,size*sizeof(DestinationElement));
return true;
}
else
{
@@ -309,10 +301,8 @@ copyMemory( DestinationElement* destination,
for(int i=0;i<size;i++)
dst[i]=src_ptr.pointer[i];
}
return true;
}
}
return false;
#else
throw Exceptions::MICSupportMissing();
#endif
@@ -368,7 +358,7 @@ compareMemory( const Element1* destination,
template< typename DestinationElement,
typename SourceElement,
typename Index >
bool
void
ArrayOperations< Devices::MIC, Devices::Host >::
copyMemory( DestinationElement* destination,
const SourceElement* source,
@@ -393,8 +383,6 @@ copyMemory( DestinationElement* destination,
{
memcpy(dst_ptr.pointer,(void*)&tmp,size*sizeof(SourceElement));
}
return true;
}
else
{
@@ -404,7 +392,6 @@ copyMemory( DestinationElement* destination,
{
memcpy(dst_ptr.pointer,(void*)tmp,size*sizeof(SourceElement));
}
return true;
}
}
else
@@ -423,7 +410,6 @@ copyMemory( DestinationElement* destination,
for(int i=0;i<size;i++)
dst_ptr.pointer[i]=src[i];
}
return true;
}
else
{
@@ -435,10 +421,8 @@ copyMemory( DestinationElement* destination,
for(int i=0;i<size;i++)
dst_ptr.pointer[i]=src[i];
}
return true;