Loading src/TNL/Containers/Algorithms/CMakeLists.txt +2 −3 Original line number Diff line number Diff line Loading @@ -5,11 +5,10 @@ set( headers cuda-prefix-sum.h cuda-reduction.h cuda-reduction_impl.h reduction-operations.h CudaReduction.h CudaReduction_impl.h CudaReductionBuffer.h CublasWrapper.h CudaMultireductionKernel.h CudaReductionBuffer.h CudaReductionKernel.h Multireduction.h Multireduction_impl.h ) Loading src/TNL/Containers/Algorithms/CudaReduction.hdeleted 100644 → 0 +0 −45 Original line number Diff line number Diff line /*************************************************************************** CudaReduction.h - description ------------------- begin : Jun 17, 2015 copyright : (C) 2015 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once namespace TNL { namespace Containers { namespace Algorithms { #ifdef HAVE_CUDA template< typename Operation, int blockSize > class CudaReduction { public: typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; typedef typename Operation::ResultType ResultType; __device__ static void reduce( Operation& operation, const IndexType size, const RealType* input1, const RealType* input2, ResultType* output ); }; #endif } // namespace Algorithms } // namespace Containers } // namespace TNL #ifdef HAVE_CUDA #include <TNL/Containers/Algorithms/CudaReduction_impl.h> #endif src/TNL/Containers/Algorithms/CudaReduction_impl.h→src/TNL/Containers/Algorithms/CudaReductionKernel.h +90 −10 Original line number Diff line number Diff line /*************************************************************************** CudaReduction_impl.h - description CudaReductionKernel.h - description ------------------- begin : Jun 17, 2015 copyright : (C) 2015 by Tomas Oberhuber copyright : (C) 2015 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ Loading @@ -14,16 +14,20 @@ namespace TNL { namespace Containers { namespace Algorithms { #ifdef HAVE_CUDA template< typename Operation, int blockSize > __device__ __global__ void CudaReduction< Operation, blockSize >:: reduce( Operation& operation, const IndexType size, const RealType* input1, const RealType* input2, ResultType* output ) CudaReductionKernel( Operation& operation, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::RealType* input2, typename Operation::ResultType* output ) { typedef typename Operation::IndexType IndexType; typedef typename Operation::ResultType ResultType; extern __shared__ __align__ ( 8 ) char __sdata[]; ResultType* sdata = reinterpret_cast< ResultType* >( __sdata ); Loading Loading @@ -148,7 +152,83 @@ reduce( Operation& operation, } template< typename Operation > typename Operation::IndexType CudaReductionKernelLauncher( Operation& operation, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::RealType* input2, typename Operation::ResultType*& output ) { typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; typedef typename Operation::ResultType ResultType; // TODO: optimize similarly to multireduction const int minGPUReductionDataSize = 256; const IndexType desGridSize( minGPUReductionDataSize ); dim3 blockSize( 256 ), gridSize( 0 ); gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // create reference to the reduction buffer singleton and set default size CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance( 8 * minGPUReductionDataSize ); if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) ) return false; output = cudaReductionBuffer.template getData< ResultType >(); IndexType shmem = blockSize.x * sizeof( ResultType ); /*** * Depending on the blockSize we generate appropriate template instance. */ switch( blockSize.x ) { case 512: CudaReductionKernel< Operation, 512 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 256: CudaReductionKernel< Operation, 256 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 128: CudaReductionKernel< Operation, 128 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 64: CudaReductionKernel< Operation, 64 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 32: CudaReductionKernel< Operation, 32 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 16: CudaReductionKernel< Operation, 16 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 8: CudaReductionKernel< Operation, 8 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 4: CudaReductionKernel< Operation, 4 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 2: CudaReductionKernel< Operation, 2 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 1: Assert( false, std::cerr << "blockSize should not be 1." << std::endl ); default: Assert( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); } //checkCudaDevice; return gridSize. x; } #endif } // namespace Algorithms } // namespace Containers } // namespace TNL src/TNL/Containers/Algorithms/cuda-reduction_impl.h +6 −92 Original line number Diff line number Diff line Loading @@ -20,7 +20,7 @@ #include <TNL/Containers/ArrayOperations.h> #include <TNL/Math.h> #include <TNL/Containers/Algorithms/CudaReductionBuffer.h> #include <TNL/Containers/Algorithms/CudaReduction.h> #include <TNL/Containers/Algorithms/CudaReductionKernel.h> #ifdef CUDA_REDUCTION_PROFILING #include <iostream> Loading @@ -40,92 +40,6 @@ namespace Algorithms { const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256; #ifdef HAVE_CUDA template< typename Operation, int blockSize > __global__ void CudaReductionKernel( Operation operation, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::RealType* input2, typename Operation::ResultType* output ) { typedef CudaReduction< Operation, blockSize > Reduction; Reduction::reduce( operation, size, input1, input2, output ); }; template< typename Operation > typename Operation::IndexType reduceOnCudaDevice( Operation& operation, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::RealType* input2, typename Operation::ResultType*& output) { typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; typedef typename Operation::ResultType ResultType; const IndexType desGridSize( minGPUReductionDataSize ); dim3 blockSize( 256 ), gridSize( 0 ); gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // create reference to the reduction buffer singleton and set default size CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance( 8 * minGPUReductionDataSize ); if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) ) return false; output = cudaReductionBuffer.template getData< ResultType >(); IndexType shmem = blockSize.x * sizeof( ResultType ); /*** * Depending on the blockSize we generate appropriate template instance. */ switch( blockSize.x ) { case 512: CudaReductionKernel< Operation, 512 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 256: CudaReductionKernel< Operation, 256 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 128: CudaReductionKernel< Operation, 128 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 64: CudaReductionKernel< Operation, 64 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 32: CudaReductionKernel< Operation, 32 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 16: CudaReductionKernel< Operation, 16 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 8: CudaReductionKernel< Operation, 8 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 4: CudaReductionKernel< Operation, 4 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 2: CudaReductionKernel< Operation, 2 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 1: Assert( false, std::cerr << "blockSize should not be 1." << std::endl ); default: Assert( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); } //checkCudaDevice; return gridSize. x; } #endif template< typename Operation > Loading Loading @@ -172,7 +86,7 @@ reductionOnCudaDevice( Operation& operation, * Reduce the data on the CUDA device. */ ResultType* deviceAux1( 0 ); IndexType reducedSize = reduceOnCudaDevice( operation, IndexType reducedSize = CudaReductionKernelLauncher( operation, size, deviceInput1, deviceInput2, Loading Loading
src/TNL/Containers/Algorithms/CMakeLists.txt +2 −3 Original line number Diff line number Diff line Loading @@ -5,11 +5,10 @@ set( headers cuda-prefix-sum.h cuda-reduction.h cuda-reduction_impl.h reduction-operations.h CudaReduction.h CudaReduction_impl.h CudaReductionBuffer.h CublasWrapper.h CudaMultireductionKernel.h CudaReductionBuffer.h CudaReductionKernel.h Multireduction.h Multireduction_impl.h ) Loading
src/TNL/Containers/Algorithms/CudaReduction.hdeleted 100644 → 0 +0 −45 Original line number Diff line number Diff line /*************************************************************************** CudaReduction.h - description ------------------- begin : Jun 17, 2015 copyright : (C) 2015 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once namespace TNL { namespace Containers { namespace Algorithms { #ifdef HAVE_CUDA template< typename Operation, int blockSize > class CudaReduction { public: typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; typedef typename Operation::ResultType ResultType; __device__ static void reduce( Operation& operation, const IndexType size, const RealType* input1, const RealType* input2, ResultType* output ); }; #endif } // namespace Algorithms } // namespace Containers } // namespace TNL #ifdef HAVE_CUDA #include <TNL/Containers/Algorithms/CudaReduction_impl.h> #endif
src/TNL/Containers/Algorithms/CudaReduction_impl.h→src/TNL/Containers/Algorithms/CudaReductionKernel.h +90 −10 Original line number Diff line number Diff line /*************************************************************************** CudaReduction_impl.h - description CudaReductionKernel.h - description ------------------- begin : Jun 17, 2015 copyright : (C) 2015 by Tomas Oberhuber copyright : (C) 2015 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ Loading @@ -14,16 +14,20 @@ namespace TNL { namespace Containers { namespace Algorithms { #ifdef HAVE_CUDA template< typename Operation, int blockSize > __device__ __global__ void CudaReduction< Operation, blockSize >:: reduce( Operation& operation, const IndexType size, const RealType* input1, const RealType* input2, ResultType* output ) CudaReductionKernel( Operation& operation, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::RealType* input2, typename Operation::ResultType* output ) { typedef typename Operation::IndexType IndexType; typedef typename Operation::ResultType ResultType; extern __shared__ __align__ ( 8 ) char __sdata[]; ResultType* sdata = reinterpret_cast< ResultType* >( __sdata ); Loading Loading @@ -148,7 +152,83 @@ reduce( Operation& operation, } template< typename Operation > typename Operation::IndexType CudaReductionKernelLauncher( Operation& operation, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::RealType* input2, typename Operation::ResultType*& output ) { typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; typedef typename Operation::ResultType ResultType; // TODO: optimize similarly to multireduction const int minGPUReductionDataSize = 256; const IndexType desGridSize( minGPUReductionDataSize ); dim3 blockSize( 256 ), gridSize( 0 ); gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // create reference to the reduction buffer singleton and set default size CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance( 8 * minGPUReductionDataSize ); if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) ) return false; output = cudaReductionBuffer.template getData< ResultType >(); IndexType shmem = blockSize.x * sizeof( ResultType ); /*** * Depending on the blockSize we generate appropriate template instance. */ switch( blockSize.x ) { case 512: CudaReductionKernel< Operation, 512 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 256: CudaReductionKernel< Operation, 256 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 128: CudaReductionKernel< Operation, 128 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 64: CudaReductionKernel< Operation, 64 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 32: CudaReductionKernel< Operation, 32 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 16: CudaReductionKernel< Operation, 16 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 8: CudaReductionKernel< Operation, 8 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 4: CudaReductionKernel< Operation, 4 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 2: CudaReductionKernel< Operation, 2 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 1: Assert( false, std::cerr << "blockSize should not be 1." << std::endl ); default: Assert( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); } //checkCudaDevice; return gridSize. x; } #endif } // namespace Algorithms } // namespace Containers } // namespace TNL
src/TNL/Containers/Algorithms/cuda-reduction_impl.h +6 −92 Original line number Diff line number Diff line Loading @@ -20,7 +20,7 @@ #include <TNL/Containers/ArrayOperations.h> #include <TNL/Math.h> #include <TNL/Containers/Algorithms/CudaReductionBuffer.h> #include <TNL/Containers/Algorithms/CudaReduction.h> #include <TNL/Containers/Algorithms/CudaReductionKernel.h> #ifdef CUDA_REDUCTION_PROFILING #include <iostream> Loading @@ -40,92 +40,6 @@ namespace Algorithms { const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256; #ifdef HAVE_CUDA template< typename Operation, int blockSize > __global__ void CudaReductionKernel( Operation operation, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::RealType* input2, typename Operation::ResultType* output ) { typedef CudaReduction< Operation, blockSize > Reduction; Reduction::reduce( operation, size, input1, input2, output ); }; template< typename Operation > typename Operation::IndexType reduceOnCudaDevice( Operation& operation, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::RealType* input2, typename Operation::ResultType*& output) { typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; typedef typename Operation::ResultType ResultType; const IndexType desGridSize( minGPUReductionDataSize ); dim3 blockSize( 256 ), gridSize( 0 ); gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // create reference to the reduction buffer singleton and set default size CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance( 8 * minGPUReductionDataSize ); if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) ) return false; output = cudaReductionBuffer.template getData< ResultType >(); IndexType shmem = blockSize.x * sizeof( ResultType ); /*** * Depending on the blockSize we generate appropriate template instance. */ switch( blockSize.x ) { case 512: CudaReductionKernel< Operation, 512 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 256: CudaReductionKernel< Operation, 256 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 128: CudaReductionKernel< Operation, 128 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 64: CudaReductionKernel< Operation, 64 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 32: CudaReductionKernel< Operation, 32 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 16: CudaReductionKernel< Operation, 16 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 8: CudaReductionKernel< Operation, 8 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 4: CudaReductionKernel< Operation, 4 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 2: CudaReductionKernel< Operation, 2 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 1: Assert( false, std::cerr << "blockSize should not be 1." << std::endl ); default: Assert( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); } //checkCudaDevice; return gridSize. x; } #endif template< typename Operation > Loading Loading @@ -172,7 +86,7 @@ reductionOnCudaDevice( Operation& operation, * Reduce the data on the CUDA device. */ ResultType* deviceAux1( 0 ); IndexType reducedSize = reduceOnCudaDevice( operation, IndexType reducedSize = CudaReductionKernelLauncher( operation, size, deviceInput1, deviceInput2, Loading