Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Commits on Source (10)
Showing
with 2817 additions and 54 deletions
......@@ -2,6 +2,7 @@
/Debug
/Release
/Testing
/Documentation
/CMakeLists.txt.user
/doc/_build
/Build
......
This diff is collapsed.
......@@ -223,6 +223,11 @@ else
make_target="all"
fi
# make expects VERBOSE=1, ninja expects -v
if [[ "$make" != "make" ]] && [[ "$VERBOSE" ]]; then
VERBOSE="-v"
fi
$make ${VERBOSE} $make_target
if [[ ${WITH_TESTS} == "yes" ]]; then
......
#!/bin/bash

# Runs doxygen in the current directory and, unless disabled, copies the
# contents of the Documentation/ directory into the installation prefix.
#
# Options:
#   --prefix=DIR      installation prefix (default: ${HOME}/.local); files are
#                     copied into DIR/share/doc
#   --install=VALUE   the copy step runs only when VALUE is "yes" (default: yes)
#
# Unrecognized options are ignored.

PREFIX=${HOME}/.local
INSTALL="yes"

for option in "$@"
do
    case $option in
        --prefix=*  ) PREFIX="${option#*=}" ;;
        --install=* ) INSTALL="${option#*=}" ;;
    esac
done

# Propagate a doxygen failure instead of going on to install stale or partial output.
doxygen || exit $?

if [[ "$INSTALL" == "yes" ]]; then
    # cp does not create the destination directory, so make sure it exists.
    mkdir -p "${PREFIX}/share/doc" || exit $?
    # The prefix is quoted so that a path containing whitespace stays one argument.
    cp -r Documentation/* "${PREFIX}/share/doc"
fi
......@@ -514,7 +514,7 @@ getExplicitUpdate( const RealType& time,
cell.getBasis(),
gridXIdx,
gridYIdx );
cudaThreadSynchronize();
cudaDeviceSynchronize();
TNL_CHECK_CUDA_DEVICE;
//std::cerr << "Computing the heat equation ..." << std::endl;
......@@ -534,7 +534,7 @@ getExplicitUpdate( const RealType& time,
cell.getBasis(),
gridXIdx,
gridYIdx );
cudaThreadSynchronize();
cudaDeviceSynchronize();
TNL_CHECK_CUDA_DEVICE;
}
if( this->cudaKernelType == "templated" )
......
......@@ -47,7 +47,7 @@ int main( int argc, char* argv[] )
while( iteration < 10000 )
{
testKernel< GridEntity ><<< cudaGridSize, cudaBlockSize >>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
iteration++;
}
auto t_stop = std::chrono::high_resolution_clock::now();
......
......@@ -379,7 +379,7 @@ bool solveHeatEquationCuda( const Config::ParameterContainer& parameters,
return false;
}
cudaThreadSynchronize();
cudaDeviceSynchronize();
cudaMemcpy( max_du, cuda_max_du, cudaUpdateBlocks.x * sizeof( Real ), cudaMemcpyDeviceToHost );
if( ( cudaErr = cudaGetLastError() ) != cudaSuccess )
{
......
......@@ -250,7 +250,7 @@ double benchmarkMatrix( const Matrix& matrix,
matrix.vectorProduct( x, b );
#ifdef HAVE_CUDA
if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value )
cudaThreadSynchronize();
cudaDeviceSynchronize();
#endif
time = timer.getRealTime();
iterations++;
......
# enable C++14 for pytnl (due to py::overload_cast)
set(PYBIND11_CPP_STANDARD -std=c++14)
set( sources
Grid1D.cpp
Grid2D.cpp
......
......@@ -179,7 +179,7 @@ reduce( Operation& operation,
typedef typename Operation::DataType2 DataType2;
typedef typename Operation::ResultType ResultType;
const int block_size = 128;
constexpr int block_size = 128;
const int blocks = size / block_size;
#ifdef HAVE_OPENMP
......@@ -194,17 +194,23 @@ reduce( Operation& operation,
}
// initialize array for thread-local results
ResultType r[ n ];
for( int k = 0; k < n; k++ )
// (it is accessed as a row-major matrix with n rows and 4 columns)
ResultType r[ n * 4 ];
for( int k = 0; k < n * 4; k++ )
r[ k ] = operation.initialValue();
#pragma omp for nowait
for( int b = 0; b < blocks; b++ ) {
const int offset = b * block_size;
const IndexType offset = b * block_size;
for( int k = 0; k < n; k++ ) {
const DataType1* _input1 = input1 + k * ldInput1;
for( IndexType i = 0; i < block_size; i++ )
operation.firstReduction( r[ k ], offset + i, _input1, input2 );
ResultType* _r = r + 4 * k;
for( int i = 0; i < block_size; i += 4 ) {
operation.firstReduction( _r[ 0 ], offset + i, _input1, input2 );
operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 );
operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 );
operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 );
}
}
}
......@@ -213,36 +219,88 @@ reduce( Operation& operation,
{
for( int k = 0; k < n; k++ ) {
const DataType1* _input1 = input1 + k * ldInput1;
ResultType* _r = r + 4 * k;
for( IndexType i = blocks * block_size; i < size; i++ )
operation.firstReduction( r[ k ], i, _input1, input2 );
operation.firstReduction( _r[ 0 ], i, _input1, input2 );
}
}
// local reduction of unrolled results
for( int k = 0; k < n; k++ ) {
ResultType* _r = r + 4 * k;
operation.commonReduction( _r[ 0 ], _r[ 1 ] );
operation.commonReduction( _r[ 0 ], _r[ 2 ] );
operation.commonReduction( _r[ 0 ], _r[ 3 ] );
}
// inter-thread reduction of local results
#pragma omp critical
{
for( int k = 0; k < n; k++ )
operation.commonReduction( result[ k ], r[ k ] );
operation.commonReduction( result[ k ], r[ 4 * k ] );
}
}
else {
#endif
for( int k = 0; k < n; k++ )
result[ k ] = operation.initialValue();
if( blocks > 1 ) {
// initialize array for unrolled results
// (it is accessed as a row-major matrix with n rows and 4 columns)
ResultType r[ n * 4 ];
for( int k = 0; k < n * 4; k++ )
r[ k ] = operation.initialValue();
// main reduction (explicitly unrolled loop)
for( int b = 0; b < blocks; b++ ) {
const IndexType offset = b * block_size;
for( int k = 0; k < n; k++ ) {
const DataType1* _input1 = input1 + k * ldInput1;
ResultType* _r = r + 4 * k;
for( int i = 0; i < block_size; i += 4 ) {
operation.firstReduction( _r[ 0 ], offset + i, _input1, input2 );
operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 );
operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 );
operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 );
}
}
}
for( int b = 0; b < blocks; b++ ) {
const int offset = b * block_size;
// reduction of the last, incomplete block (not unrolled)
for( int k = 0; k < n; k++ ) {
const DataType1* _input1 = input1 + k * ldInput1;
for( IndexType i = 0; i < block_size; i++ )
operation.firstReduction( result[ k ], offset + i, _input1, input2 );
ResultType* _r = r + 4 * k;
for( IndexType i = blocks * block_size; i < size; i++ )
operation.firstReduction( _r[ 0 ], i, _input1, input2 );
}
// reduction of unrolled results
for( int k = 0; k < n; k++ ) {
ResultType* _r = r + 4 * k;
operation.commonReduction( _r[ 0 ], _r[ 1 ] );
operation.commonReduction( _r[ 0 ], _r[ 2 ] );
operation.commonReduction( _r[ 0 ], _r[ 3 ] );
// copy the result into the output parameter
result[ k ] = _r[ 0 ];
}
}
else {
for( int k = 0; k < n; k++ )
result[ k ] = operation.initialValue();
for( int k = 0; k < n; k++ ) {
const DataType1* _input1 = input1 + k * ldInput1;
for( IndexType i = blocks * block_size; i < size; i++ )
operation.firstReduction( result[ k ], i, _input1, input2 );
for( int b = 0; b < blocks; b++ ) {
const IndexType offset = b * block_size;
for( int k = 0; k < n; k++ ) {
const DataType1* _input1 = input1 + k * ldInput1;
for( int i = 0; i < block_size; i++ )
operation.firstReduction( result[ k ], offset + i, _input1, input2 );
}
}
for( int k = 0; k < n; k++ ) {
const DataType1* _input1 = input1 + k * ldInput1;
for( IndexType i = blocks * block_size; i < size; i++ )
operation.firstReduction( result[ k ], i, _input1, input2 );
}
}
#ifdef HAVE_OPENMP
}
......
......@@ -179,22 +179,22 @@ reduce( Operation& operation,
typedef typename Operation::DataType2 DataType2;
typedef typename Operation::ResultType ResultType;
#ifdef HAVE_OPENMP
constexpr int block_size = 128;
const int blocks = size / block_size;
#ifdef HAVE_OPENMP
if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) {
// global result variable
ResultType result = operation.initialValue();
#pragma omp parallel
{
const int blocks = size / block_size;
// initialize array for thread-local results
ResultType r[ 4 ] = { operation.initialValue() };
#pragma omp for nowait
for( int b = 0; b < blocks; b++ ) {
const int offset = b * block_size;
for( IndexType i = 0; i < block_size; i += 4 ) {
const IndexType offset = b * block_size;
for( int i = 0; i < block_size; i += 4 ) {
operation.firstReduction( r[ 0 ], offset + i, input1, input2 );
operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 );
operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 );
......@@ -209,7 +209,7 @@ reduce( Operation& operation,
operation.firstReduction( r[ 0 ], i, input1, input2 );
}
// reduction of local results
// local reduction of unrolled results
operation.commonReduction( r[ 0 ], r[ 1 ] );
operation.commonReduction( r[ 0 ], r[ 2 ] );
operation.commonReduction( r[ 0 ], r[ 3 ] );
......@@ -224,10 +224,38 @@ reduce( Operation& operation,
}
else {
#endif
ResultType result = operation.initialValue();
for( IndexType i = 0; i < size; i++ )
operation.firstReduction( result, i, input1, input2 );
return result;
if( blocks > 1 ) {
// initialize array for unrolled results
ResultType r[ 4 ] = { operation.initialValue() };
// main reduction (explicitly unrolled loop)
for( int b = 0; b < blocks; b++ ) {
const IndexType offset = b * block_size;
for( int i = 0; i < block_size; i += 4 ) {
operation.firstReduction( r[ 0 ], offset + i, input1, input2 );
operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 );
operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 );
operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 );
}
}
// reduction of the last, incomplete block (not unrolled)
for( IndexType i = blocks * block_size; i < size; i++ )
operation.firstReduction( r[ 0 ], i, input1, input2 );
// reduction of unrolled results
operation.commonReduction( r[ 0 ], r[ 1 ] );
operation.commonReduction( r[ 0 ], r[ 2 ] );
operation.commonReduction( r[ 0 ], r[ 3 ] );
return r[ 0 ];
}
else {
ResultType result = operation.initialValue();
for( IndexType i = 0; i < size; i++ )
operation.firstReduction( result, i, input1, input2 );
return result;
}
#ifdef HAVE_OPENMP
}
#endif
......
......@@ -933,7 +933,7 @@ class EllpackDeviceDependentCode< Devices::Cuda >
//Devices::Cuda::freeFromDevice( kernel_inVector );
//Devices::Cuda::freeFromDevice( kernel_outVector );
TNL_CHECK_CUDA_DEVICE;
cudaThreadSynchronize();
cudaDeviceSynchronize();
#endif
}
......
......@@ -1051,7 +1051,7 @@ class SlicedEllpackDeviceDependentCode< Devices::Cuda >
//Devices::Cuda::freeFromDevice( kernel_inVector );
//Devices::Cuda::freeFromDevice( kernel_outVector );
TNL_CHECK_CUDA_DEVICE;
cudaThreadSynchronize();
cudaDeviceSynchronize();
#endif
}
......
......@@ -57,7 +57,8 @@ class DistributedGridIO<Functions::MeshFunction<MeshType>,LocalCopy,Device>
newMesh->setOrigin(origin+TNL::Containers::Scale(spaceSteps,localBegin));
File meshFile;
meshFile.open( fileName+String("-mesh-")+distrGrid->printProcessCoords()+String(".tnl"),IOMode::write);
bool ok=meshFile.open( fileName+String("-mesh-")+distrGrid->printProcessCoords()+String(".tnl"),IOMode::write);
TNL_ASSERT_TRUE(ok,"Not able to open mesh file to write");
newMesh->save( meshFile );
meshFile.close();
......@@ -72,7 +73,8 @@ class DistributedGridIO<Functions::MeshFunction<MeshType>,LocalCopy,Device>
CopyEntitiesHelper<MeshFunctionType>::Copy(meshFunction,newMeshFunction,localBegin,zeroCoord,localSize);
File file;
file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), IOMode::write );
ok=file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), IOMode::write );
TNL_ASSERT_TRUE(ok,"Not able to open file to write");
bool ret=newMeshFunction.save(file);
file.close();
......@@ -110,7 +112,8 @@ class DistributedGridIO<Functions::MeshFunction<MeshType>,LocalCopy,Device>
zeroCoord.setValue(0);
File file;
file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), IOMode::read );
bool ok=file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), IOMode::read );
TNL_ASSERT_TRUE(ok,"Not able to open file to read");
bool result=newMeshFunction.boundLoad(file);
file.close();
CopyEntitiesHelper<MeshFunctionType>::Copy(newMeshFunction,meshFunction,zeroCoord,localBegin,localSize);
......@@ -151,12 +154,13 @@ class DistributedGridIO_MPIIOBase
MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
MPI_File file;
MPI_File_open( group,
int ok=MPI_File_open( group,
const_cast< char* >( fileName.getString() ),
MPI_MODE_CREATE | MPI_MODE_WRONLY,
MPI_INFO_NULL,
&file);
TNL_ASSERT_EQ(ok,0,"Open file falied");
int written=save(file,meshFunction, data,0);
MPI_File_close(&file);
......@@ -322,13 +326,14 @@ class DistributedGridIO_MPIIOBase
MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
MPI_File file;
MPI_File_open( group,
int ok=MPI_File_open( group,
const_cast< char* >( fileName.getString() ),
MPI_MODE_RDONLY,
MPI_INFO_NULL,
&file );
TNL_ASSERT_EQ(ok,0,"Open file falied");
bool ret= load(file, meshFunction, data,0)>0;
bool ret= load(file, meshFunction, data,0)>0;
MPI_File_close(&file);
......
......@@ -204,7 +204,7 @@ void Euler< Problem > :: computeNewTimeLevel( DofVectorPointer& u,
&_u[ gridOffset ],
this->cudaBlockResidue.getData() );
localResidue += this->cudaBlockResidue.sum();
cudaThreadSynchronize();
cudaDeviceSynchronize();
TNL_CHECK_CUDA_DEVICE;
}
#endif
......
......@@ -305,7 +305,7 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
this->problem->getExplicitUpdate( time, tau, u, k1 );
cudaThreadSynchronize();
cudaDeviceSynchronize();
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
{
......@@ -313,10 +313,10 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
computeK2Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_kAux[ gridOffset ] );
}
cudaThreadSynchronize();
cudaDeviceSynchronize();
this->problem->applyBoundaryConditions( time + tau_3, kAux );
this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k2 );
cudaThreadSynchronize();
cudaDeviceSynchronize();
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
{
......@@ -324,10 +324,10 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
computeK3Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k2[ gridOffset ], &_kAux[ gridOffset ] );
}
cudaThreadSynchronize();
cudaDeviceSynchronize();
this->problem->applyBoundaryConditions( time + tau_3, kAux );
this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k3 );
cudaThreadSynchronize();
cudaDeviceSynchronize();
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
{
......@@ -335,10 +335,10 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
computeK4Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_kAux[ gridOffset ] );
}
cudaThreadSynchronize();
cudaDeviceSynchronize();
this->problem->applyBoundaryConditions( time + 0.5 * tau, kAux );
this->problem->getExplicitUpdate( time + 0.5 * tau, tau, kAux, k4 );
cudaThreadSynchronize();
cudaDeviceSynchronize();
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
{
......@@ -346,10 +346,10 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
computeK5Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_k4[ gridOffset ], &_kAux[ gridOffset ] );
}
cudaThreadSynchronize();
cudaDeviceSynchronize();
this->problem->applyBoundaryConditions( time + tau, kAux );
this->problem->getExplicitUpdate( time + tau, tau, kAux, k5 );
cudaThreadSynchronize();
cudaDeviceSynchronize();
#endif
}
}
......@@ -409,7 +409,7 @@ typename Problem :: RealType Merson< Problem > :: computeError( const RealType t
&_k4[ gridOffset ],
&_k5[ gridOffset ],
&_kAux[ gridOffset ] );
cudaThreadSynchronize();
cudaDeviceSynchronize();
eps = std::max( eps, kAux->max() );
}
#endif
......@@ -468,7 +468,7 @@ void Merson< Problem >::computeNewTimeLevel( const RealType time,
&_u[ gridOffset ],
this->cudaBlockResidue.getData() );
localResidue += this->cudaBlockResidue.sum();
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
this->problem->applyBoundaryConditions( time, u );
......
......@@ -66,6 +66,20 @@ ELSE( BUILD_CUDA )
tnl )
ENDIF( BUILD_CUDA )
# Build the MultireductionTest executable: from MultireductionTest.cu through
# CUDA_ADD_EXECUTABLE when BUILD_CUDA is set, otherwise from MultireductionTest.cpp.
# Both variants are compiled with CXX_TESTS_FLAGS and linked against GTest and tnl.
IF( BUILD_CUDA )
CUDA_ADD_EXECUTABLE( MultireductionTest MultireductionTest.cu
OPTIONS ${CXX_TESTS_FLAGS} )
TARGET_LINK_LIBRARIES( MultireductionTest
${GTEST_BOTH_LIBRARIES}
tnl )
ELSE( BUILD_CUDA )
ADD_EXECUTABLE( MultireductionTest MultireductionTest.cpp )
TARGET_COMPILE_OPTIONS( MultireductionTest PRIVATE ${CXX_TESTS_FLAGS} )
TARGET_LINK_LIBRARIES( MultireductionTest
${GTEST_BOTH_LIBRARIES}
tnl )
ENDIF( BUILD_CUDA )
ADD_EXECUTABLE( StaticVectorTest StaticVectorTest.cpp )
TARGET_COMPILE_OPTIONS( StaticVectorTest PRIVATE ${CXX_TESTS_FLAGS} )
TARGET_LINK_LIBRARIES( StaticVectorTest
......@@ -93,6 +107,7 @@ ADD_TEST( ArrayTest ${EXECUTABLE_OUTPUT_PATH}/ArrayTest${CMAKE_EXECUTABLE_SUFFIX
ADD_TEST( ArrayViewTest ${EXECUTABLE_OUTPUT_PATH}/ArrayViewTest${CMAKE_EXECUTABLE_SUFFIX} )
ADD_TEST( StaticArrayTest ${EXECUTABLE_OUTPUT_PATH}/StaticArrayTest${CMAKE_EXECUTABLE_SUFFIX} )
ADD_TEST( VectorTest ${EXECUTABLE_OUTPUT_PATH}/VectorTest${CMAKE_EXECUTABLE_SUFFIX} )
ADD_TEST( MultireductionTest ${EXECUTABLE_OUTPUT_PATH}/MultireductionTest${CMAKE_EXECUTABLE_SUFFIX} )
ADD_TEST( StaticVectorTest ${EXECUTABLE_OUTPUT_PATH}/StaticVectorTest${CMAKE_EXECUTABLE_SUFFIX} )
#ADD_TEST( MultiArrayTest ${EXECUTABLE_OUTPUT_PATH}/MultiArrayTest${CMAKE_EXECUTABLE_SUFFIX} )
......
#include "MultireductionTest.h"
#include "MultireductionTest.h"
/***************************************************************************
MultireductionTest.h - description
-------------------
begin : Oct 1, 2018
copyright : (C) 2018 by Tomas Oberhuber et al.
email : tomas.oberhuber@fjfi.cvut.cz
***************************************************************************/
/* See Copyright Notice in tnl/Copyright */
#pragma once
#ifdef HAVE_GTEST
#include "gtest/gtest.h"
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/VectorView.h>
#include <TNL/Containers/Algorithms/Multireduction.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;
/**
 * Fills deviceVector with the increasing sequence 0, 1, 2, ..., size - 1.
 *
 * The values are written into a temporary host vector that is sized like
 * deviceVector and then assigned to deviceVector as a whole.
 *
 * \param deviceVector  vector or view to overwrite; its size is not changed here
 */
template< typename View >
void setLinearSequence( View& deviceVector )
{
   using RealType = typename View::RealType;
   using IndexType = typename View::IndexType;
   using HostVector = Containers::Vector< RealType, Devices::Host, IndexType >;

   HostVector a;
   a.setLike( deviceVector );
   // Iterate with the vector's own index type: a plain int index would be
   // compared against a possibly wider or differently signed size.
   for( IndexType i = 0; i < a.getSize(); i++ )
      a[ i ] = static_cast< RealType >( i );
   deviceVector = a;
}
/**
 * Fills deviceVector with the decreasing sequence 0, -1, -2, ..., -(size - 1).
 *
 * The values are written into a temporary host vector that is sized like
 * deviceVector and then assigned to deviceVector as a whole.
 *
 * \param deviceVector  vector or view to overwrite; its size is not changed here
 */
template< typename View >
void setNegativeLinearSequence( View& deviceVector )
{
   using RealType = typename View::RealType;
   using IndexType = typename View::IndexType;
   using HostVector = Containers::Vector< RealType, Devices::Host, IndexType >;

   HostVector a;
   a.setLike( deviceVector );
   // Iterate with the vector's own index type: a plain int index would be
   // compared against a possibly wider or differently signed size.
   for( IndexType i = 0; i < a.getSize(); i++ )
      // Negate after converting to the element type so that the result does
      // not depend on the signedness of the index type.
      a[ i ] = -static_cast< RealType >( i );
   deviceVector = a;
}
/**
 * Test fixture for typed tests of the multireduction.
 *
 * The constructor prepares the input data:
 *  - V holds n segments of `size` elements stored one after another; segments
 *    with an even index contain 0, 1, 2, ... and segments with an odd index
 *    contain 0, -1, -2, ...
 *  - y holds `size` elements, all set to 1
 *  - result is a host vector with n elements for the output of the reduction
 */
template< typename Vector >
class MultireductionTest : public ::testing::Test
{
protected:
   using DeviceVector = Vector;
   using DeviceView = VectorView< typename Vector::RealType, typename Vector::DeviceType, typename Vector::IndexType >;
   using HostVector = typename DeviceVector::HostType;
   using HostView = typename DeviceView::HostType;

   // should be small enough to have fast tests, but larger than minGPUReductionDataSize
   // and large enough to require multiple CUDA blocks for reduction
   static constexpr int size = 5000;

   // number of vectors which are reduced together
   static constexpr int n = 4;

   DeviceVector V;
   DeviceVector y;
   HostVector result;

   MultireductionTest()
   {
      V.setSize( size * n );
      y.setSize( size );
      result.setSize( n );

      for( int i = 0; i < n; i++ ) {
         // Locate the i-th segment by pointer arithmetic on the raw data pointer
         // rather than through the element accessor: V may be allocated on
         // Devices::Cuda and this constructor runs on the host.
         DeviceView v( V.getData() + i * size, size );
         if( i % 2 == 0 )
            setLinearSequence( v );
         else
            setNegativeLinearSequence( v );
      }
      y.setValue( 1 );
   }
};
// types for which MultireductionTest is instantiated:
// int and float vectors on Devices::Host, and additionally the same two
// element types on Devices::Cuda when HAVE_CUDA is defined
using VectorTypes = ::testing::Types<
Vector< int, Devices::Host >,
Vector< float, Devices::Host >
#ifdef HAVE_CUDA
,
Vector< int, Devices::Cuda >,
Vector< float, Devices::Cuda >
#endif
>;
// bind the fixture to the type list above; the typed tests of this fixture
// are instantiated for every listed vector type
TYPED_TEST_CASE( MultireductionTest, VectorTypes );
// Reduces the n segments of V against y with the scalar-product operation in
// a single multireduction call. y contains only ones, so each result is the
// sum of one segment: size * (size - 1) / 2 for the even-indexed (increasing)
// segments and the negated value for the odd-indexed (decreasing) ones.
TYPED_TEST( MultireductionTest, scalarProduct )
{
   using TestedVector = typename TestFixture::DeviceVector;
   using RealType = typename TestedVector::RealType;
   using DeviceType = typename TestedVector::DeviceType;

   ParallelReductionScalarProduct< RealType, RealType > scalarProduct;

   // arguments: operation, number of segments, segment length, first input,
   // distance between consecutive segments of the first input, second input, output
   Multireduction< DeviceType >::reduce( scalarProduct,
                                         this->n,
                                         this->size,
                                         this->V.getData(),
                                         this->size,
                                         this->y.getData(),
                                         this->result.getData() );

   for( int i = 0; i < this->n; i++ ) {
      const bool decreasingSegment = ( i % 2 != 0 );
      if( decreasingSegment )
         EXPECT_EQ( this->result[ i ], - 0.5 * this->size * ( this->size - 1 ) );
      else
         EXPECT_EQ( this->result[ i ], 0.5 * this->size * ( this->size - 1 ) );
   }
}
#endif // HAVE_GTEST
#include "../GtestMissingError.h"
// Entry point of the test executable: runs all registered GoogleTest tests
// and returns their status, or throws GtestMissingError when the build was
// configured without HAVE_GTEST.
int main( int argc, char* argv[] )
{
#ifndef HAVE_GTEST
   throw GtestMissingError();
#else
   ::testing::InitGoogleTest( &argc, argv );
   const int status = RUN_ALL_TESTS();
   return status;
#endif
}