Loading src/Benchmarks/scripts/run-tnl-benchmark-spmv +1 −1 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ BENCHMARK_DBG="tnl-benchmark-spmv-dbg --with-legacy-matrices no" export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf" PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl" source matrix-market M_MATRICES="" #MM_MATRICES="" source florida-matrix-market #FLORIDA_MM_MATRICES="" Loading src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py +1 −1 Original line number Diff line number Diff line Loading @@ -254,7 +254,7 @@ formats = get_formats( input_df ) multicolumns, df_data = get_multiindex( input_df, formats ) print( "Converting data..." ) result = convert_data_frame( input_df, multicolumns, df_data, 200 ) result = convert_data_frame( input_df, multicolumns, df_data, 20000 ) compute_speedup( result, formats ) print( "Writting to HTML file..." ) Loading src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp +8 −17 Original line number Diff line number Diff line Loading @@ -275,12 +275,8 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee #ifdef HAVE_CUDA const int threads = 128; Index blocks, groupSize; //if (KernelType == CSRLightWithoutAtomic) int neededThreads = threadsPerSegment * ( last - first ); //else // neededThreads = rows * (threadsPerSegment > 32 ? 32 : threadsPerSegment); /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if( TNL::Cuda::getMaxGridXSize() * threads >= neededThreads) Loading Loading @@ -334,30 +330,25 @@ void CSRLightKernel< Index, Device >:: init( const Offsets& offsets ) { //const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); //this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ) ); //TNL::Cuda::getWarpSize() ); const Index segmentsCount = offsets.getSize() - 1; //const Index threads = 128; // !!!!!!!!!!!!!!!!!!!!!! block size size_t neededThreads = segmentsCount * 32;//warpSize; Index blocks, threadsPerSegment; const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row if( elementsInSegment <= 2 ) threadsPerSegment = 2; this->threadsPerSegment = 2; else if( elementsInSegment <= 4 ) threadsPerSegment = 4; this->threadsPerSegment = 4; else if( elementsInSegment <= 8 ) threadsPerSegment = 8; this->threadsPerSegment = 8; else if( elementsInSegment <= 16 ) threadsPerSegment = 16; this->threadsPerSegment = 16; else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP) threadsPerSegment = 32; // CSR Vector this->threadsPerSegment = 32; // CSR Vector //else // threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector TNL_ASSERT_GE( threadsPerSegment, 0, "" ); TNL_ASSERT_LE( threadsPerSegment, 33, "" ); TNL_ASSERT_GE( this->threadsPerSegment, 0, "" ); TNL_ASSERT_LE( this->threadsPerSegment, 33, "" ); } Loading Loading
src/Benchmarks/scripts/run-tnl-benchmark-spmv +1 −1 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ BENCHMARK_DBG="tnl-benchmark-spmv-dbg --with-legacy-matrices no" export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf" PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl" source matrix-market M_MATRICES="" #MM_MATRICES="" source florida-matrix-market #FLORIDA_MM_MATRICES="" Loading
src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py +1 −1 Original line number Diff line number Diff line Loading @@ -254,7 +254,7 @@ formats = get_formats( input_df ) multicolumns, df_data = get_multiindex( input_df, formats ) print( "Converting data..." ) result = convert_data_frame( input_df, multicolumns, df_data, 200 ) result = convert_data_frame( input_df, multicolumns, df_data, 20000 ) compute_speedup( result, formats ) print( "Writting to HTML file..." ) Loading
src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp +8 −17 Original line number Diff line number Diff line Loading @@ -275,12 +275,8 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee #ifdef HAVE_CUDA const int threads = 128; Index blocks, groupSize; //if (KernelType == CSRLightWithoutAtomic) int neededThreads = threadsPerSegment * ( last - first ); //else // neededThreads = rows * (threadsPerSegment > 32 ? 32 : threadsPerSegment); /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if( TNL::Cuda::getMaxGridXSize() * threads >= neededThreads) Loading Loading @@ -334,30 +330,25 @@ void CSRLightKernel< Index, Device >:: init( const Offsets& offsets ) { //const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); //this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ) ); //TNL::Cuda::getWarpSize() ); const Index segmentsCount = offsets.getSize() - 1; //const Index threads = 128; // !!!!!!!!!!!!!!!!!!!!!! block size size_t neededThreads = segmentsCount * 32;//warpSize; Index blocks, threadsPerSegment; const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row if( elementsInSegment <= 2 ) threadsPerSegment = 2; this->threadsPerSegment = 2; else if( elementsInSegment <= 4 ) threadsPerSegment = 4; this->threadsPerSegment = 4; else if( elementsInSegment <= 8 ) threadsPerSegment = 8; this->threadsPerSegment = 8; else if( elementsInSegment <= 16 ) threadsPerSegment = 16; this->threadsPerSegment = 16; else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP) threadsPerSegment = 32; // CSR Vector this->threadsPerSegment = 32; // CSR Vector //else // threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector TNL_ASSERT_GE( threadsPerSegment, 0, "" ); TNL_ASSERT_LE( threadsPerSegment, 33, "" ); TNL_ASSERT_GE( this->threadsPerSegment, 0, "" ); TNL_ASSERT_LE( this->threadsPerSegment, 33, "" ); } Loading