Commit 399f9627 authored by Jakub Klinkovský

Moved algorithms from TNL/Containers/Algorithms/ to just TNL/Algorithms/

The usage of algorithms such as MemoryOperations or Reduction is not
bound to a particular container. On the other hand, ArrayIO,
ArrayAssignment, VectorAssignment and StaticArrayAssignment are
implementation details of the containers, so they were moved into
TNL/Containers/detail/

Also moved ParallelFor, StaticFor, StaticVectorFor, TemplateStaticFor
into TNL/Algorithms/
parent 57db358c
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Algorithms/Reduction.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;
using namespace TNL::Algorithms;
template< typename Device >
bool comparison( const Vector< double, Device >& u, const Vector< double, Device >& v )
......
......@@ -4,7 +4,7 @@
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;
using namespace TNL::Algorithms;
template< typename Device >
void scan( Vector< double, Device >& v )
......
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Algorithms/Reduction.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;
using namespace TNL::Algorithms;
template< typename Device >
double mapReduce( Vector< double, Device >& u )
......
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Algorithms/Reduction.h>
#include <TNL/Timer.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;
using namespace TNL::Algorithms;
template< typename Device >
double mapReduce( Vector< double, Device >& u )
......
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Algorithms/Reduction.h>
#include <TNL/Timer.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;
using namespace TNL::Algorithms;
template< typename Device >
double mapReduce( Vector< double, Device >& u )
......
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Algorithms/Reduction.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;
using namespace TNL::Algorithms;
template< typename Device >
double maximumNorm( const Vector< double, Device >& v )
......
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Algorithms/Reduction.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;
using namespace TNL::Algorithms;
template< typename Device >
double product( const Vector< double, Device >& v )
......
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Algorithms/Reduction.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;
using namespace TNL::Algorithms;
template< typename Device >
std::pair< int, double >
......
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Algorithms/Reduction.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;
using namespace TNL::Algorithms;
template< typename Device >
double scalarProduct( const Vector< double, Device >& u, const Vector< double, Device >& v )
......
......@@ -4,7 +4,7 @@
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;
using namespace TNL::Algorithms;
template< typename Device >
void scan( Vector< double, Device >& v )
......
......@@ -4,7 +4,7 @@
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;
using namespace TNL::Algorithms;
template< typename Device >
void segmentedScan( Vector< double, Device >& v, Vector< bool, Device >& flags )
......
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Algorithms/Reduction.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;
using namespace TNL::Algorithms;
template< typename Device >
double sum( const Vector< double, Device >& v )
......
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Algorithms/Reduction.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;
using namespace TNL::Algorithms;
template< typename Device >
double updateAndResidue( Vector< double, Device >& u, const Vector< double, Device >& delta_u, const double& tau )
......
......@@ -10,7 +10,7 @@
#pragma once
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Algorithms/Reduction.h>
#include "CommonVectorOperations.h"
namespace TNL {
......@@ -30,7 +30,7 @@ getVectorMax( const Vector& v )
const auto* data = v.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}
template< typename Device >
......@@ -47,7 +47,7 @@ getVectorMin( const Vector& v )
const auto* data = v.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; };
auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}
template< typename Device >
......@@ -64,7 +64,7 @@ getVectorAbsMax( const Vector& v )
const auto* data = v.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}
template< typename Device >
......@@ -81,7 +81,7 @@ getVectorAbsMin( const Vector& v )
const auto* data = v.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}
template< typename Device >
......@@ -97,7 +97,7 @@ getVectorL1Norm( const Vector& v )
const auto* data = v.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}
template< typename Device >
......@@ -113,7 +113,7 @@ getVectorL2Norm( const Vector& v )
const auto* data = v.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; };
return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
return std::sqrt( Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
}
template< typename Device >
......@@ -136,7 +136,7 @@ getVectorLpNorm( const Vector& v,
const auto* data = v.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); };
return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
return std::pow( Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
}
template< typename Device >
......@@ -155,7 +155,7 @@ getVectorSum( const Vector& v )
const auto* data = v.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}
template< typename Device >
......@@ -175,7 +175,7 @@ getVectorDifferenceMax( const Vector1& v1,
const auto* data2 = v2.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}
template< typename Device >
......@@ -195,7 +195,7 @@ getVectorDifferenceMin( const Vector1& v1,
const auto* data2 = v2.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}
template< typename Device >
......@@ -215,7 +215,7 @@ getVectorDifferenceAbsMax( const Vector1& v1,
const auto* data2 = v2.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}
template< typename Device >
......@@ -235,7 +235,7 @@ getVectorDifferenceAbsMin( const Vector1& v1,
const auto* data2 = v2.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}
template< typename Device >
......@@ -254,7 +254,7 @@ getVectorDifferenceL1Norm( const Vector1& v1,
const auto* data1 = v1.getData();
const auto* data2 = v2.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}
template< typename Device >
......@@ -276,7 +276,7 @@ getVectorDifferenceL2Norm( const Vector1& v1,
auto diff = data1[ i ] - data2[ i ];
return diff * diff;
};
return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
return std::sqrt( Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
}
template< typename Device >
......@@ -302,7 +302,7 @@ getVectorDifferenceLpNorm( const Vector1& v1,
const auto* data1 = v1.getData();
const auto* data2 = v2.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); };
return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
return std::pow( Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
}
template< typename Device >
......@@ -321,7 +321,7 @@ getVectorDifferenceSum( const Vector1& v1,
const auto* data1 = v1.getData();
const auto* data2 = v2.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}
template< typename Device >
......@@ -340,7 +340,7 @@ getScalarProduct( const Vector1& v1,
const auto* data1 = v1.getData();
const auto* data2 = v2.getData();
auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; };
return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}
} // namespace Benchmarks
......
......@@ -12,7 +12,7 @@
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/ParallelFor.h>
#include <TNL/Algorithms/ParallelFor.h>
namespace TNL {
namespace Benchmarks {
......@@ -104,9 +104,9 @@ struct VectorOperations< Devices::Cuda >
auto add2 = [=] __cuda_callable__ ( IndexType i ) { y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ]; };
if( thisMultiplicator == 1.0 )
ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _y.getSize(), add1 );
Algorithms::ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _y.getSize(), add1 );
else
ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _y.getSize(), add2 );
Algorithms::ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _y.getSize(), add2 );
}
template< typename Vector1, typename Vector2, typename Vector3, typename Scalar1, typename Scalar2, typename Scalar3 >
......@@ -131,9 +131,9 @@ struct VectorOperations< Devices::Cuda >
auto add2 = [=] __cuda_callable__ ( IndexType i ) { v[ i ] = thisMultiplicator * v[ i ] + multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ]; };
if( thisMultiplicator == 1.0 )
ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _v.getSize(), add1 );
Algorithms::ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _v.getSize(), add1 );
else
ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _v.getSize(), add2 );
Algorithms::ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _v.getSize(), add2 );
}
};
......
......@@ -73,7 +73,7 @@ benchmarkTriad( Benchmark & benchmark,
{
a_v[i] = b_v[i] + scalar * c_v[i];
};
ParallelFor< Devices::Cuda >::exec( (long) 0, size, kernel );
Algorithms::ParallelFor< Devices::Cuda >::exec( (long) 0, size, kernel );
a_h = a_d;
};
......@@ -117,7 +117,7 @@ benchmarkTriad( Benchmark & benchmark,
{
a_v[i] = b_v[i] + scalar * c_v[i];
};
ParallelFor< Devices::Cuda >::exec( (long) 0, size, kernel );
Algorithms::ParallelFor< Devices::Cuda >::exec( (long) 0, size, kernel );
a_h = a_d;
};
......@@ -150,7 +150,7 @@ benchmarkTriad( Benchmark & benchmark,
};
auto triad = [&]()
{
ParallelFor< Devices::Cuda >::exec( (long) 0, size, kernel );
Algorithms::ParallelFor< Devices::Cuda >::exec( (long) 0, size, kernel );
};
benchmark.time< Devices::Cuda >( reset, "zero-copy", triad );
......@@ -181,7 +181,7 @@ benchmarkTriad( Benchmark & benchmark,
};
auto triad = [&]()
{
ParallelFor< Devices::Cuda >::exec( (long) 0, size, kernel );
Algorithms::ParallelFor< Devices::Cuda >::exec( (long) 0, size, kernel );
};
benchmark.time< Devices::Cuda >( reset, "unified memory", triad );
......
......@@ -578,13 +578,13 @@ benchmarkVectorOperations( Benchmark & benchmark,
////
// Exclusive prefix sum
auto exclusivePrefixSumHost = [&]() {
hostVector.template prefixSum< Containers::Algorithms::ScanType::Exclusive >();
hostVector.template prefixSum< Algorithms::ScanType::Exclusive >();
};
benchmark.setOperation( "exclusive prefix sum", 2 * datasetSize );
benchmark.time< Devices::Host >( reset1, "CPU ET", exclusivePrefixSumHost );
#ifdef HAVE_CUDA
auto exclusivePrefixSumCuda = [&]() {
deviceVector.template prefixSum< Containers::Algorithms::ScanType::Exclusive >();
deviceVector.template prefixSum< Algorithms::ScanType::Exclusive >();
};
benchmark.time< Devices::Cuda >( reset1, "GPU ET", exclusivePrefixSumCuda );
#endif
......
......@@ -14,7 +14,7 @@
#include <TNL/Assert.h>
#include <TNL/Math.h>
#include <TNL/ParallelFor.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Containers/NDArray.h>
#include <TNL/Containers/ndarray/Operations.h>
......@@ -98,7 +98,7 @@ void benchmark_array( Benchmark& benchmark, index_type size = 500000000 )
};
auto f = [&]() {
TNL::ParallelFor< Device >::exec( 0, (int) size, kernel, a.getData(), b.getData() );
Algorithms::ParallelFor< Device >::exec( 0, (int) size, kernel, a.getData(), b.getData() );
};
// warm-up for all benchmarks
......
......@@ -13,7 +13,7 @@
#pragma once
#include <TNL/Devices/Host.h>
#include <TNL/ParallelFor.h>
#include <TNL/Algorithms/ParallelFor.h>
namespace TNL {
namespace Benchmarks {
......@@ -43,7 +43,7 @@ struct SimpleProblem
{
fu[ i ] = 1.0;
};
ParallelFor< DeviceType >::exec( ( IndexType ) 0, u.getSize(), computeF, u, fu );
Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, u.getSize(), computeF, u, fu );
}
template< typename Vector >
......
......@@ -12,7 +12,7 @@
#pragma once
#include <TNL/ParallelFor.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Containers/Vector.h>
......
......@@ -12,7 +12,7 @@
#pragma once
#include <TNL/ParallelFor.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Containers/Vector.h>
......
......@@ -12,7 +12,7 @@
#pragma once
#include <TNL/ParallelFor.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Containers/Vector.h>
......
......@@ -12,7 +12,7 @@
#pragma once
#include <TNL/ParallelFor.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Containers/Vector.h>
......
......@@ -19,7 +19,7 @@
#include <TNL/Config/ConfigDescription.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/ParallelFor.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Containers/List.h>
using namespace TNL;
......
......@@ -16,11 +16,10 @@
#include <TNL/Math.h>
#include <TNL/Cuda/DeviceInfo.h>
#include <TNL/Cuda/SharedMemory.h>
#include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
#include <TNL/Algorithms/CudaReductionBuffer.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
namespace TNL {
namespace Containers {
namespace Algorithms {
#ifdef HAVE_CUDA
......@@ -282,5 +281,4 @@ CudaMultireductionKernelLauncher( const Result zero,
}
} // namespace Algorithms
} // namespace Containers
} // namespace TNL
......@@ -19,7 +19,6 @@
#include <TNL/Exceptions/CudaSupportMissing.h>
namespace TNL {
namespace Containers {
namespace Algorithms {
class CudaReductionBuffer
......@@ -92,5 +91,4 @@ class CudaReductionBuffer
};
} // namespace Algorithms
} // namespace Containers
} // namespace TNL
......@@ -16,12 +16,11 @@
#include <TNL/Math.h>
#include <TNL/Cuda/DeviceInfo.h>
#include <TNL/Cuda/SharedMemory.h>
#include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
#include <TNL/Containers/Algorithms/MultiDeviceMemoryOperations.h>
#include <TNL/Algorithms/CudaReductionBuffer.h>
#include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
namespace TNL {
namespace Containers {
namespace Algorithms {
/****
......@@ -615,5 +614,4 @@ struct CudaReductionKernelLauncher
};
} // namespace Algorithms
} // namespace Containers
} // namespace TNL
......@@ -18,7 +18,6 @@
#include <TNL/Containers/Array.h>
namespace TNL {
namespace Containers {
namespace Algorithms {
#ifdef HAVE_CUDA
......@@ -249,7 +248,7 @@ struct CudaScanKernelLauncher
//std::cerr << "numberOfgrids = " << numberOfGrids << std::endl;
// allocate array for the block sums
Array< Real, Devices::Cuda > blockSums;
Containers::Array< Real, Devices::Cuda > blockSums;
blockSums.setSize( numberOfBlocks );
// loop over all grids
......@@ -388,5 +387,4 @@ struct CudaScanKernelLauncher
#endif
} // namespace Algorithms
} // namespace Containers
} // namespace TNL
......@@ -12,11 +12,10 @@
#pragma once
#include <TNL/Containers/Algorithms/Scan.h>
#include <TNL/Algorithms/Scan.h>
#include <TNL/Containers/Vector.h>
namespace TNL {
namespace Containers {
namespace Algorithms {
template< ScanType Type >
......@@ -51,7 +50,7 @@ struct DistributedScan
const int nproc = CommunicatorType::GetSize( group );
RealType dataForScatter[ nproc ];
for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum;
Vector< RealType, Devices::Host > rankSums( nproc );
Containers::Vector< RealType, Devices::Host > rankSums( nproc );
// NOTE: exchanging general data types does not work with MPI
CommunicatorType::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group );
......@@ -66,5 +65,4 @@ struct DistributedScan
};
} // namespace Algorithms
} // namespace Containers
} // namespace TNL
......@@ -15,7 +15,6 @@
#include <TNL/Cuda/CudaCallable.h>
namespace TNL {
namespace Containers {
namespace Algorithms {
template< typename DestinationExecution >
......@@ -180,9 +179,8 @@ struct MemoryOperations< Devices::Cuda >
};
} // namespace Algorithms
} // namespace Containers
} // namespace TNL
#include <TNL/Containers/Algorithms/MemoryOperationsSequential.hpp>
#include <TNL/Containers/Algorithms/MemoryOperationsHost.hpp>
#include <TNL/Containers/Algorithms/MemoryOperationsCuda.hpp>
#include <TNL/Algorithms/MemoryOperationsSequential.hpp>
#include <TNL/Algorithms/MemoryOperationsHost.hpp>
#include <TNL/Algorithms/MemoryOperationsCuda.hpp>
......@@ -14,14 +14,13 @@
#include <memory> // std::unique_ptr
#include <stdexcept>
#include <TNL/Containers/Algorithms/MemoryOperations.h>
#include <TNL/Containers/Algorithms/MultiDeviceMemoryOperations.h>
#include <TNL/ParallelFor.h>
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Algorithms/MemoryOperations.h>
#include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Algorithms/Reduction.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
namespace TNL {
namespace Containers {
namespace Algorithms {
template< typename Element >
......@@ -156,5 +155,4 @@ containsOnlyValue( const Element* data,
}
} // namespace Algorithms
} // namespace Containers