diff --git a/src/core/tnlCuda.h b/src/core/tnlCuda.h index 2429b9e98e1ddbc99dad6a8f5198329b9a0f1599..3e48f30d256730720eaea3924bc5199192e63a7b 100644 --- a/src/core/tnlCuda.h +++ b/src/core/tnlCuda.h @@ -27,57 +27,29 @@ class tnlCuda { public: - static tnlString getDeviceType() - { - return tnlString( "tnlCuda" ); - } + static tnlString getDeviceType(); - static tnlDeviceEnum getDevice() - { - return tnlCudaDevice; - }; + static tnlDeviceEnum getDevice(); template< typename Element, typename Index > - static void allocateMemory( Element*& data, const Index size ) - { - allocateMemoryCuda( data, size ); - } + static void allocateMemory( Element*& data, const Index size ); template< typename Element > - static void freeMemory( Element* data ) - { - freeMemoryCuda( data ); - } + static void freeMemory( Element* data ); template< typename Element > static void setMemoryElement( Element* data, - const Element& value ) - { - setMemoryCuda( data, value, 1 ); - } + const Element& value ); template< typename Element > - static Element getMemoryElement( const Element* data ) - { - Element result; - copyMemoryCudaToHost( &result, data, 1 ); - return result; - } + static Element getMemoryElement( const Element* data ); template< typename Element, typename Index > - static Element& getArrayElementReference( Element* data, const Index i ) - { - tnlAssert( false, ); - abort(); - } + static Element& getArrayElementReference( Element* data, const Index i ); template< typename Element, typename Index > - static const Element& getArrayElementReference(const Element* data, const Index i ) - { - tnlAssert( false, ); - abort(); - } + static const Element& getArrayElementReference(const Element* data, const Index i ); template< typename DestinationElement, typename SourceElement, @@ -85,76 +57,36 @@ class tnlCuda typename Device > static bool memcpy( DestinationElement* destination, const SourceElement* source, - const Index size ) - { - switch( Device :: getDevice() ) - { - case tnlHostDevice: - return copyMemoryHostToCuda( destination, source, size ); - case tnlCudaDevice: - return copyMemoryCudaToCuda( destination, source, size ); - } - return true; - } - + const Index size ); template< typename Element, typename Index, typename Device > static bool memcpy( Element* destination, const Element* source, - const Index size ) - { - return tnlCuda :: memcpy< Element, Element, Index, Device > - ( destination, - source, - size ); - } + const Index size ); template< typename Element, typename Index, typename Device > static bool memcmp( const Element* data1, const Element* data2, - const Index size ) - { - switch( Device :: getDevice() ) - { - case tnlHostDevice: - return compareMemoryHostCuda( data2, data1, size ); - case tnlCudaDevice: - return compareMemoryCuda( data1, data2, size ); - } - } + const Index size ); template< typename Element, typename Index > static bool memset( Element* destination, const Element& value, - const Index size ) - { - return setMemoryCuda( destination, value, size ); - } - - static int getMaxGridSize() - { - return maxGridSize; - } - - static void setMaxGridSize( int newMaxGridSize ) - { - maxGridSize = newMaxGridSize; - } - - static int getMaxBlockSize() - { - return maxBlockSize; - } - - static void setMaxBlockSize( int newMaxBlockSize ) - { - maxBlockSize = newMaxBlockSize; - } + const Index size ); + + static int getMaxGridSize(); + + static void setMaxGridSize( int newMaxGridSize ); + + static int getMaxBlockSize(); + + static void setMaxBlockSize( int newMaxBlockSize ); protected: static int maxGridSize, maxBlockSize; }; +#include <implementation/core/tnlCuda_impl.h> #endif /* TNLCUDA_H_ */ diff --git a/src/implementation/core/CMakeLists.txt b/src/implementation/core/CMakeLists.txt index 0ccd327a2cb0a291fd28cee923d7ce075677353a..9fa5e317c9298e97ab004aa3c2556530f1fcbd82 100755 --- a/src/implementation/core/CMakeLists.txt +++ b/src/implementation/core/CMakeLists.txt @@ -3,6 +3,7 @@ ADD_SUBDIRECTORY( cuda ) SET( headers vector-operations.h memory-operations.h tnlArray_impl.h + tnlCuda_impl.h tnlHost_impl.h tnlLogger_impl.h tnlMultiArray1D_impl.h diff --git a/src/implementation/core/memory-operations.h b/src/implementation/core/memory-operations.h index c5c14fcd7b61b54baee7b1c83b0367b9a6228ae6..3dfd2ee83de9d03561de7bd2b5f079c95d8cbfed 100644 --- a/src/implementation/core/memory-operations.h +++ b/src/implementation/core/memory-operations.h @@ -87,11 +87,11 @@ __global__ void setVectorValueCudaKernel( Element* data, const Element value ) { Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x; - const Index gridSize = blockDim. x * gridDim. x; + const Index maxGridSize = blockDim. x * gridDim. x; while( elementIdx < size ) { data[ elementIdx ] = value; - elementIdx += gridSize; + elementIdx += maxGridSize; } } #endif @@ -159,6 +159,42 @@ bool copyMemoryHostToCuda( Element* destination, #endif } +template< typename DestinationElement, + typename SourceElement, + typename Index > +bool copyMemoryHostToCuda( DestinationElement* destination, + const SourceElement* source, + const Index size ) +{ +#ifdef HAVE_CUDA + DestinationElement* buffer = new DestinationElement[ tnlGPUvsCPUTransferBufferSize ]; + if( ! buffer ) + { + cerr << "Unable to allocate supporting buffer to transfer data between the CUDA device and the host." << endl; + return false; + } + Index i( 0 ); + while( i < size ) + { + Index j( 0 ); + while( j < tnlGPUvsCPUTransferBufferSize && i + j < size ) + buffer[ j ] = source[ i + j++ ]; + if( ! copyMemoryHostTuCuda( buffer, + &destination[ i ], + j ) ) + { + delete[] buffer; + return false; + } + i += j; + } + delete[] buffer; + return true; +#else + cerr << "CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl; + return false; +#endif +} template< typename Element, typename Index > bool copyMemoryCudaToHost( Element* destination, @@ -189,22 +225,29 @@ bool copyMemoryCudaToHost( DestinationElement* destination, const SourceElement* source, const Index size ) { -#ifdef HAVE_CUDA - abort(); // TODO: fix this - cudaMemcpy( destination, - source, - size * sizeof( SourceElement ), - cudaMemcpyDeviceToHost ); - if( ! checkCudaDevice ) + SourceElement* buffer = new SourceElement[ tnlGPUvsCPUTransferBufferSize ]; + if( ! buffer ) { - cerr << "Transfer of data from CUDA device to host failed." << endl; + cerr << "Unable to allocate supporting buffer to transfer data between the CUDA device and the host." << endl; return false; } + Index i( 0 ); + while( i < size ) + { + if( ! copyMemoryCudaToHost( &source[ i ], + buffer, + Min( size - i, tnlGPUvsCPUTransferBufferSize ) ) ) + { + delete[] buffer; + return false; + } + Index j( 0 ); + while( j < tnlGPUvsCPUTransferBufferSize && i + j < size ) + destination[ i + j ] = buffer[ j++ ]; + i += j; + } + delete[] buffer; return true; -#else - cerr << "CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl; - return false; -#endif } @@ -363,19 +406,19 @@ extern template bool setMemoryHost( float* data, const float& value, const long extern template bool setMemoryHost( double* data, const double& value, const long int size ); extern template bool setMemoryHost( long double* data, const long double& value, const long int size ); -extern template bool setMemoryCuda( char* data, const char& value, const int size ); -extern template bool setMemoryCuda( int* data, const int& value, const int size ); -extern template bool setMemoryCuda( long int* data, const long int& value, const int size ); -extern template bool setMemoryCuda( float* data, const float& value, const int size ); -extern template bool setMemoryCuda( double* data, const double& value, const int size ); -extern template bool setMemoryCuda( long double* data, const long double& value, const int size ); - -extern template bool setMemoryCuda( char* data, const char& value, const long int size ); -extern template bool setMemoryCuda( int* data, const int& value, const long int size ); -extern template bool setMemoryCuda( long int* data, const long int& value, const long int size ); -extern template bool setMemoryCuda( float* data, const float& value, const long int size ); -extern template bool setMemoryCuda( double* data, const double& value, const long int size ); -extern template bool setMemoryCuda( long double* data, const long double& value, const long int size ); +extern template bool setMemoryCuda( char* data, const char& value, const int size, const int maxGridSize ); +extern template bool setMemoryCuda( int* data, const int& value, const int size, const int maxGridSize ); +extern template bool setMemoryCuda( long int* data, const long int& value, const int size, const int maxGridSize ); +extern template bool setMemoryCuda( float* data, const float& value, const int size, const int maxGridSize ); +extern template bool setMemoryCuda( double* data, const double& value, const int size, const int maxGridSize ); +extern template bool setMemoryCuda( long double* data, const long double& value, const int size, const int maxGridSize ); + +extern template bool setMemoryCuda( char* data, const char& value, const long int size, const int maxGridSize ); +extern template bool setMemoryCuda( int* data, const int& value, const long int size, const int maxGridSize ); +extern template bool setMemoryCuda( long int* data, const long int& value, const long int size, const int maxGridSize ); +extern template bool setMemoryCuda( float* data, const float& value, const long int size, const int maxGridSize ); +extern template bool setMemoryCuda( double* data, const double& value, const long int size, const int maxGridSize ); +extern template bool setMemoryCuda( long double* data, const long double& value, const long int size, const int maxGridSize ); extern template bool copyMemoryHostToHost( char* destination, const char* source, const int size ); extern template bool copyMemoryHostToHost( int* destination, const int* source, const int size ); diff --git a/src/implementation/core/memory-operations_impl.cu b/src/implementation/core/memory-operations_impl.cu index 17535c9a1788286d3ae9d927092bd81cd7e6593d..b493f1c0ad803cce93915d9b549f79f214198c75 100644 --- a/src/implementation/core/memory-operations_impl.cu +++ b/src/implementation/core/memory-operations_impl.cu @@ -75,19 +75,19 @@ template bool setMemoryHost( float* data, const float& value, const long int siz template bool setMemoryHost( double* data, const double& value, const long int size ); template bool setMemoryHost( long double* data, const long double& value, const long int size ); -template bool setMemoryCuda( char* data, const char& value, const int size ); -template bool setMemoryCuda( int* data, const int& value, const int size ); -template bool setMemoryCuda( long int* data, const long int& value, const int size ); -template bool setMemoryCuda( float* data, const float& value, const int size ); -template bool setMemoryCuda( double* data, const double& value, const int size ); -template bool setMemoryCuda( long double* data, const long double& value, const int size ); - -template bool setMemoryCuda( char* data, const char& value, const long int size ); -template bool setMemoryCuda( int* data, const int& value, const long int size ); -template bool setMemoryCuda( long int* data, const long int& value, const long int size ); -template bool setMemoryCuda( float* data, const float& value, const long int size ); -template bool setMemoryCuda( double* data, const double& value, const long int size ); -template bool setMemoryCuda( long double* data, const long double& value, const long int size ); +template bool setMemoryCuda( char* data, const char& value, const int size, const int maxGridSize ); +template bool setMemoryCuda( int* data, const int& value, const int size, const int maxGridSize ); +template bool setMemoryCuda( long int* data, const long int& value, const int size, const int maxGridSize ); +template bool setMemoryCuda( float* data, const float& value, const int size, const int maxGridSize ); +template bool setMemoryCuda( double* data, const double& value, const int size, const int maxGridSize ); +template bool setMemoryCuda( long double* data, const long double& value, const int size, const int maxGridSize ); + +template bool setMemoryCuda( char* data, const char& value, const long int size, const int maxGridSize ); +template bool setMemoryCuda( int* data, const int& value, const long int size, const int maxGridSize ); +template bool setMemoryCuda( long int* data, const long int& value, const long int size, const int maxGridSize ); +template bool setMemoryCuda( float* data, const float& value, const long int size, const int maxGridSize ); +template bool setMemoryCuda( double* data, const double& value, const long int size, const int maxGridSize ); +template bool setMemoryCuda( long double* data, const long double& value, const long int size, const int maxGridSize ); template bool copyMemoryHostToHost( char* destination, const char* source, const int size ); template bool copyMemoryHostToHost( int* destination, const int* source, const int size ); @@ -181,4 +181,4 @@ template bool compareMemoryCuda( const float* data1, const float* data2, const l template bool compareMemoryCuda( const double* data1, const double* data2, const long int size ); template bool compareMemoryCuda( const long double* data1, const long double* data2, const long int size ); -#endif \ No newline at end of file +#endif diff --git a/src/implementation/core/tnlCuda_impl.h b/src/implementation/core/tnlCuda_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..dcaa8df4984377a3f4c6d7a7e4a8b8fb06e24b41 --- /dev/null +++ b/src/implementation/core/tnlCuda_impl.h @@ -0,0 +1,146 @@ +/*************************************************************************** + tnlCuda_impl.h - description + ------------------- + begin : Jul 11, 2013 + copyright : (C) 2013 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ + +#ifndef TNLCUDA_IMPL_H_ +#define TNLCUDA_IMPL_H_ + +inline tnlString tnlCuda :: getDeviceType() +{ + return tnlString( "tnlCuda" ); +} + +inline tnlDeviceEnum tnlCuda :: getDevice() +{ + return tnlCudaDevice; +}; + +template< typename Element, typename Index > +void tnlCuda :: allocateMemory( Element*& data, const Index size ) +{ + allocateMemoryCuda( data, size ); +} + +template< typename Element > +void tnlCuda :: freeMemory( Element* data ) +{ + freeMemoryCuda( data ); +} + + +template< typename Element > +void tnlCuda :: setMemoryElement( Element* data, + const Element& value ) +{ + setMemoryCuda( data, value, 1, maxGridSize ); +} + +template< typename Element > +Element tnlCuda :: getMemoryElement( const Element* data ) +{ + Element result; + copyMemoryCudaToHost( &result, data, 1 ); + return result; +} + +template< typename Element, typename Index > +Element& tnlCuda :: getArrayElementReference( Element* data, const Index i ) +{ + tnlAssert( false, ); + abort(); +} + +template< typename Element, typename Index > +const Element& tnlCuda :: getArrayElementReference(const Element* data, const Index i ) +{ + tnlAssert( false, ); + abort(); +} + +template< typename DestinationElement, + typename SourceElement, + typename Index, + typename Device > +bool tnlCuda :: memcpy( DestinationElement* destination, + const SourceElement* source, + const Index size ) +{ + switch( Device :: getDevice() ) + { + case tnlHostDevice: + return copyMemoryHostToCuda( destination, source, size ); + case tnlCudaDevice: + return copyMemoryCudaToCuda( destination, source, size ); + } + return true; +} + + +template< typename Element, typename Index, typename Device > +bool tnlCuda :: memcpy( Element* destination, + const Element* source, + const Index size ) +{ + return tnlCuda :: memcpy< Element, Element, Index, Device > + ( destination, + source, + size ); +} + +template< typename Element, typename Index, typename Device > +bool tnlCuda :: memcmp( const Element* data1, + const Element* data2, + const Index size ) +{ + switch( Device :: getDevice() ) + { + case tnlHostDevice: + return compareMemoryHostCuda( data2, data1, size ); + case tnlCudaDevice: + return compareMemoryCuda( data1, data2, size ); + } +} + +template< typename Element, typename Index > +bool tnlCuda :: memset( Element* destination, + const Element& value, + const Index size ) +{ + return setMemoryCuda( destination, value, size, maxGridSize ); +} + +inline int tnlCuda :: getMaxGridSize() +{ + return maxGridSize; +} + +inline void tnlCuda :: setMaxGridSize( int newMaxGridSize ) +{ + maxGridSize = newMaxGridSize; +} + +inline int tnlCuda :: getMaxBlockSize() +{ + return maxBlockSize; +} + +inline void tnlCuda :: setMaxBlockSize( int newMaxBlockSize ) +{ + maxBlockSize = newMaxBlockSize; +} + + +#endif /* TNLCUDA_IMPL_H_ */ diff --git a/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h b/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h index 4ba5f58a38da9776709941fc9fff5b7208c537d1..6be0197eb1996c4f3475a65acac13f9539c4ac15 100644 --- a/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h +++ b/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h @@ -95,7 +95,7 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase allocateMemoryHost( hostData, size ); allocateMemoryCuda( deviceData, size ); setMemoryHost( hostData, 0, size ); - setMemoryCuda( deviceData, 13, size ); + setMemoryCuda( deviceData, 13, size, maxCudaGridSize ); CPPUNIT_ASSERT( checkCudaDevice ); copyMemoryCudaToHost( hostData, deviceData, size ); CPPUNIT_ASSERT( checkCudaDevice ); @@ -113,7 +113,7 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase allocateMemoryHost( hostData, size ); allocateMemoryCuda( deviceData, size ); setMemoryHost( hostData, 0, size ); - setMemoryCuda( deviceData, 13, size ); + setMemoryCuda( deviceData, 13, size, maxCudaGridSize ); CPPUNIT_ASSERT( checkCudaDevice ); copyMemoryCudaToHost( hostData, deviceData, size ); CPPUNIT_ASSERT( checkCudaDevice );