Loading src/matrix/tnlAdaptiveRgCSRMatrix.h +106 −60 Original line number Diff line number Diff line Loading @@ -29,6 +29,14 @@ using namespace std; struct tnlARGCSRGroupProperties { int numRows; int numUsedThreads; int numRounds; int idxFirstRow; int idxFirstValue; }; //! Matrix storing the non-zero elements in the Row-grouped CSR (Compressed Sparse Row) format /*! Loading Loading @@ -113,19 +121,21 @@ class tnlAdaptiveRgCSRMatrix : public tnlMatrix< Real, Device, Index > */ bool insertBlock( ); tnlLongVector< Real, Device, Index > nonzero_elements; tnlLongVector< Real, Device, Index > nonzeroElements; tnlLongVector< Index, Device, Index > columns; tnlLongVector< Index, Device, Index > block_info; // size 4*number of groups; index of first row in group, nuber of rows in group, tnlLongVector< Index, Device, Index > threads; tnlLongVector< tnlARGCSRGroupProperties, Device, Index > blockInfo; // size 4*number of groups; index of first row in group, nuber of rows in group, // number of rounds, index of first nz element in group tnlLongVector< Index, Device, Index > threads_per_row; Index maxGroupSize, groupSizeStep; Index targetNonzeroesPerGroup; Index number_of_groups; Index numberOfGroups; Index cudaBlockSize; Loading Loading @@ -158,14 +168,14 @@ tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: tnlAdaptiveRgCSRMatrix( const t Index _groupSizeStep, Index _targetNonzeroesPerGroup ) : tnlMatrix< Real, Device, Index >( name ), nonzero_elements( "nonzero-elements" ), nonzeroElements( "nonzero-elements" ), columns( "columns" ), block_info( "block-info" ), threads_per_row( "threads-per-row" ), blockInfo( "block-info" ), threads( "threads-per-row" ), maxGroupSize( _maxGroupSize ), groupSizeStep(_groupSizeStep), targetNonzeroesPerGroup(_targetNonzeroesPerGroup), number_of_groups( 0 ), numberOfGroups( 0 ), cudaBlockSize( 0 ), artificial_zeros( 0 ), last_nonzero_element( 0 ) Loading Loading @@ -214,10 +224,10 @@ template< typename Real, tnlDevice Device, typename Index > bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: setSize( Index new_size ) { this -> size = new_size; if( ! block_info.setSize(this->size) || ! threads_per_row.setSize(this->size) ) if( ! blockInfo.setSize(this->size) || ! threads.setSize(this->size) ) return false; block_info.setValue( 0 ); threads_per_row.setValue( 0 ); blockInfo.setValue( 0 ); threads.setValue( 0 ); last_nonzero_element = 0; return true; }; Loading @@ -226,9 +236,9 @@ template< typename Real, tnlDevice Device, typename Index > bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: setNonzeroElements( Index elements ) { tnlAssert( elements !=0, ); if( ! nonzero_elements.setSize(elements) || ! columns.setSize(elements) ) if( ! nonzeroElements.setSize(elements) || ! columns.setSize(elements) ) return false; nonzero_elements.setValue( 0.0 ); nonzeroElements.setValue( 0.0 ); columns.setValue( -1 ); return true; }; Loading @@ -236,8 +246,8 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: setNonzeroElements( Index template< typename Real, tnlDevice Device, typename Index > Index tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: getNonzeroElements() const { tnlAssert( nonzero_elements. getSize() > artificial_zeros, ); return nonzero_elements. getSize() - artificial_zeros; tnlAssert( nonzeroElements. getSize() > artificial_zeros, ); return nonzeroElements. getSize() - artificial_zeros; } template< typename Real, tnlDevice Device, typename Index > Loading Loading @@ -283,8 +293,10 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlCSRMatr /**** * Now, compute the number of threads per each row */ block_info[4*blkIdx] = blkBegin; // group begining block_info[4*blkIdx+1] = rowsInBlk; // number of the rows in the group //blockInfo[4*blkIdx] = blkBegin; // group begining //blockInfo[4*blkIdx+1] = rowsInBlk; // number of the rows in the group blockInfo[ blkIdx ]. idxFirstValue = blkBegin; blockInfo[ blkIdx ]. numRows = rowsInBlk; uint freeThreads = cudaBlockSize - rowsInBlk; // T -r uint usedThreads = 0; Loading @@ -295,9 +307,9 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlCSRMatr for(uint i=0; i<threadsLeft; i++) { threadsPerRow[i]++; } threads_per_row[blkBegin]=0; threads[blkBegin]=0; for(uint i=blkBegin+1; i<blkEnd; i++) { threads_per_row[i] = threads_per_row[i-1] + threadsPerRow[i-blkBegin-1]; threads[i] = threads[i-1] + threadsPerRow[i-blkBegin-1]; } /**** Loading @@ -308,14 +320,16 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlCSRMatr rounds = ceil(((double) mat.row_offsets[i+1] - mat.row_offsets[i]) / threadsPerRow[i-blkBegin]); roundsFinal = (rounds>roundsFinal) ? rounds : roundsFinal; } block_info[4*blkIdx+2] = roundsFinal; block_info[4*blkIdx+3] = numStoredValues; /*blockInfo[4*blkIdx+2] = roundsFinal; blockInfo[4*blkIdx+3] = numStoredValues;*/ blockInfo[ blkIdx ]. numRounds = roundsFinal; //blockInfo[ blkIdx ]. ????????????????????????????????????? blkIdx++; numStoredValues += cudaBlockSize * roundsFinal; blkBegin = blkEnd; if(blkBegin == this->size) { number_of_groups = blkIdx; numberOfGroups = blkIdx; break; } } Loading @@ -336,27 +350,27 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlCSRMatr uint counters[128]; uint NZperRow[128]; uint index, baseRow; for(uint i=0; i<number_of_groups; i++) { baseRow = block_info[4*i]; index = block_info[4*i+3]; for(uint j=0; j<block_info[4*i+1]; j++) { NZperRow[j] = mat.row_offsets[block_info[4*i]+j+1] - mat.row_offsets[block_info[4*i]+j]; if(j<block_info[4*i+1]-1) { threadsPerRow[j] = threads_per_row[block_info[4*i]+j+1] - threads_per_row[block_info[4*i]+j]; } else threadsPerRow[j] = cudaBlockSize - threads_per_row[block_info[4*i]+j]; for(uint i=0; i<numberOfGroups; i++) { baseRow = blockInfo[4*i]; index = blockInfo[4*i+3]; for(uint j=0; j<blockInfo[4*i+1]; j++) { NZperRow[j] = mat.row_offsets[blockInfo[4*i]+j+1] - mat.row_offsets[blockInfo[4*i]+j]; if(j<blockInfo[4*i+1]-1) { threadsPerRow[j] = threads[blockInfo[4*i]+j+1] - threads[blockInfo[4*i]+j]; } else threadsPerRow[j] = cudaBlockSize - threads[blockInfo[4*i]+j]; counters[j] = 0; } for(uint k=0; k<block_info[4*i+2]; k++) { for(uint j=0; j<block_info[4*i+1]; j++) { for(uint k=0; k<blockInfo[4*i+2]; k++) { for(uint j=0; j<blockInfo[4*i+1]; j++) { for(uint l=0; l<threadsPerRow[j]; l++) { if(counters[j]<NZperRow[j]) { nonzero_elements[index] = mat.nonzero_elements[ mat.row_offsets[baseRow+j]+counters[j] ]; nonzeroElements[index] = mat.nonzeroElements[ mat.row_offsets[baseRow+j]+counters[j] ]; columns[index] = mat.columns[ mat.row_offsets[baseRow+j]+counters[j] ]; } else { columns[index] = -1; nonzero_elements[index] = 0.0; nonzeroElements[index] = 0.0; } counters[j]++; index++; Loading Loading @@ -385,7 +399,7 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlAdaptiv targetNonzeroesPerGroup = adaptiveRgCSRMatrix.targetNonzeroesPerGroup; cudaBlockSize = adaptiveRgCSRMatrix.cudaBlockSize; last_nonzero_element = adaptiveRgCSRMatrix.last_nonzero_element; number_of_groups = adaptiveRgCSRMatrix.number_of_groups; numberOfGroups = adaptiveRgCSRMatrix.numberOfGroups; if( ! this -> setSize( adaptiveRgCSRMatrix. getSize() ) ) Loading @@ -401,15 +415,14 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlAdaptiv return false; artificial_zeros = total_elements - adaptiveRgCSRMatrix. getNonzeroElements(); nonzero_elements = adaptiveRgCSRMatrix.nonzero_elements; nonzeroElements = adaptiveRgCSRMatrix.nonzeroElements; columns = adaptiveRgCSRMatrix.columns; block_info = adaptiveRgCSRMatrix.block_info; threads_per_row = adaptiveRgCSRMatrix.threads_per_row; blockInfo = adaptiveRgCSRMatrix.blockInfo; threads = adaptiveRgCSRMatrix.threads; return true; }; template< typename Real, tnlDevice Device, typename Index > void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLongVector< Real, Device, Index >& vec, tnlLongVector< Real, Device, Index >& result ) const Loading @@ -426,7 +439,40 @@ void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLo if( Device == tnlHost ) { abort(); int idx[TB_SIZE]; Real psum[TB_SIZE]; //partial sums for each thread int limits[MAX_ROWS + 1]; //indices of first threads for each row + index of first unused thread Real results[MAX_ROWS]; for(int group = 0; group < mat.numberOfGroups; group++) { //for each group of rows for(int thread = 0; thread < mat.blockInfo[group].numUsedThreads; thread++) { //for each used thread in group idx[thread] = mat.blockInfo[group].idxFirstValue + thread; psum[thread] = 0; for(int j = 0; j < mat.blockInfo[group].numRounds; j++) { psum[thread] += mat.nonzeroElements[idx[thread]] * in[mat.columns[idx[thread]]]; idx[j] += TB_SIZE; } } for(int thread = 0; thread < mat.blockInfo[group].numRows; thread++) { //for threads corresponding to rows in group limits[thread] = mat.threads[mat.blockInfo[group].idxFirstRow + thread]; //make a local copy of info about threads } limits[mat.blockInfo[group].numRows] = mat.blockInfo[group].numUsedThreads; //for convenience, add the index of first unused row //reduction of partial sums and writing to the output for(int thread = 0; thread < mat.blockInfo[group].numRows; thread++) { //for threads corresponding to rows in group results[thread] = 0; for(int j = limits[thread]; j < limits[thread+1]; j++) { //sum up partial sums belonging to that row results[thread] += psum[j]; } out[mat.blockInfo[group].idxFirstRow + thread] = results[thread]; } } } if( Device == tnlCuda ) { Loading @@ -435,7 +481,7 @@ void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLo const Index size = this -> getSize(); Index desGridSize; desGridSize = this->number_of_groups; desGridSize = this->numberOfGroups; desGridSize = (desGridSize < 16384) ? desGridSize : 16384; cudaThreadSynchronize(); Loading @@ -444,11 +490,11 @@ void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLo AdaptiveRgCSRMatrixVectorProductKernel< Real, Index, false ><<< gridDim, blockDim >>>( result. getVector(), vec. getVector(), nonzero_elements. getVector(), nonzeroElements. getVector(), columns. getVector(), block_info. getVector(), threads_per_row. getVector(), number_of_groups ); blockInfo. getVector(), threads. getVector(), numberOfGroups ); cudaThreadSynchronize(); CHECK_CUDA_ERROR; #else Loading @@ -465,7 +511,7 @@ void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLo // str << "Structure of tnlAdaptiveRgCSRMatrix" << endl; // str << "Matrix name:" << this -> getName() << endl; // str << "Matrix size:" << this -> getSize() << endl; // str << "Allocated elements:" << nonzero_elements. getSize() << endl; // str << "Allocated elements:" << nonzeroElements. getSize() << endl; // str << "Matrix blocks: " << block_offsets. getSize() << endl; // // Index print_lines = lines; Loading @@ -490,7 +536,7 @@ void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLo // k < block_offsets. getElement( i + 1 ); // k ++ ) // str << setprecision( 5 ) << setw( 8 ) // << nonzero_elements. getElement( k ) << " "; // << nonzeroElements. getElement( k ) << " "; // str << endl << "Columns: "; // for( Index k = block_offsets. getElement( i ); // k < block_offsets. getElement( i + 1 ); Loading Loading
src/matrix/tnlAdaptiveRgCSRMatrix.h +106 −60 Original line number Diff line number Diff line Loading @@ -29,6 +29,14 @@ using namespace std; struct tnlARGCSRGroupProperties { int numRows; int numUsedThreads; int numRounds; int idxFirstRow; int idxFirstValue; }; //! Matrix storing the non-zero elements in the Row-grouped CSR (Compressed Sparse Row) format /*! Loading Loading @@ -113,19 +121,21 @@ class tnlAdaptiveRgCSRMatrix : public tnlMatrix< Real, Device, Index > */ bool insertBlock( ); tnlLongVector< Real, Device, Index > nonzero_elements; tnlLongVector< Real, Device, Index > nonzeroElements; tnlLongVector< Index, Device, Index > columns; tnlLongVector< Index, Device, Index > block_info; // size 4*number of groups; index of first row in group, nuber of rows in group, tnlLongVector< Index, Device, Index > threads; tnlLongVector< tnlARGCSRGroupProperties, Device, Index > blockInfo; // size 4*number of groups; index of first row in group, nuber of rows in group, // number of rounds, index of first nz element in group tnlLongVector< Index, Device, Index > threads_per_row; Index maxGroupSize, groupSizeStep; Index targetNonzeroesPerGroup; Index number_of_groups; Index numberOfGroups; Index cudaBlockSize; Loading Loading @@ -158,14 +168,14 @@ tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: tnlAdaptiveRgCSRMatrix( const t Index _groupSizeStep, Index _targetNonzeroesPerGroup ) : tnlMatrix< Real, Device, Index >( name ), nonzero_elements( "nonzero-elements" ), nonzeroElements( "nonzero-elements" ), columns( "columns" ), block_info( "block-info" ), threads_per_row( "threads-per-row" ), blockInfo( "block-info" ), threads( "threads-per-row" ), maxGroupSize( _maxGroupSize ), groupSizeStep(_groupSizeStep), targetNonzeroesPerGroup(_targetNonzeroesPerGroup), number_of_groups( 0 ), numberOfGroups( 0 ), cudaBlockSize( 0 ), artificial_zeros( 0 ), last_nonzero_element( 0 ) Loading Loading @@ -214,10 +224,10 @@ template< typename Real, tnlDevice Device, typename Index > bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: setSize( Index new_size ) { this -> size = new_size; if( ! block_info.setSize(this->size) || ! threads_per_row.setSize(this->size) ) if( ! blockInfo.setSize(this->size) || ! threads.setSize(this->size) ) return false; block_info.setValue( 0 ); threads_per_row.setValue( 0 ); blockInfo.setValue( 0 ); threads.setValue( 0 ); last_nonzero_element = 0; return true; }; Loading @@ -226,9 +236,9 @@ template< typename Real, tnlDevice Device, typename Index > bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: setNonzeroElements( Index elements ) { tnlAssert( elements !=0, ); if( ! nonzero_elements.setSize(elements) || ! columns.setSize(elements) ) if( ! nonzeroElements.setSize(elements) || ! columns.setSize(elements) ) return false; nonzero_elements.setValue( 0.0 ); nonzeroElements.setValue( 0.0 ); columns.setValue( -1 ); return true; }; Loading @@ -236,8 +246,8 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: setNonzeroElements( Index template< typename Real, tnlDevice Device, typename Index > Index tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: getNonzeroElements() const { tnlAssert( nonzero_elements. getSize() > artificial_zeros, ); return nonzero_elements. getSize() - artificial_zeros; tnlAssert( nonzeroElements. getSize() > artificial_zeros, ); return nonzeroElements. getSize() - artificial_zeros; } template< typename Real, tnlDevice Device, typename Index > Loading Loading @@ -283,8 +293,10 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlCSRMatr /**** * Now, compute the number of threads per each row */ block_info[4*blkIdx] = blkBegin; // group begining block_info[4*blkIdx+1] = rowsInBlk; // number of the rows in the group //blockInfo[4*blkIdx] = blkBegin; // group begining //blockInfo[4*blkIdx+1] = rowsInBlk; // number of the rows in the group blockInfo[ blkIdx ]. idxFirstValue = blkBegin; blockInfo[ blkIdx ]. numRows = rowsInBlk; uint freeThreads = cudaBlockSize - rowsInBlk; // T -r uint usedThreads = 0; Loading @@ -295,9 +307,9 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlCSRMatr for(uint i=0; i<threadsLeft; i++) { threadsPerRow[i]++; } threads_per_row[blkBegin]=0; threads[blkBegin]=0; for(uint i=blkBegin+1; i<blkEnd; i++) { threads_per_row[i] = threads_per_row[i-1] + threadsPerRow[i-blkBegin-1]; threads[i] = threads[i-1] + threadsPerRow[i-blkBegin-1]; } /**** Loading @@ -308,14 +320,16 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlCSRMatr rounds = ceil(((double) mat.row_offsets[i+1] - mat.row_offsets[i]) / threadsPerRow[i-blkBegin]); roundsFinal = (rounds>roundsFinal) ? rounds : roundsFinal; } block_info[4*blkIdx+2] = roundsFinal; block_info[4*blkIdx+3] = numStoredValues; /*blockInfo[4*blkIdx+2] = roundsFinal; blockInfo[4*blkIdx+3] = numStoredValues;*/ blockInfo[ blkIdx ]. numRounds = roundsFinal; //blockInfo[ blkIdx ]. ????????????????????????????????????? blkIdx++; numStoredValues += cudaBlockSize * roundsFinal; blkBegin = blkEnd; if(blkBegin == this->size) { number_of_groups = blkIdx; numberOfGroups = blkIdx; break; } } Loading @@ -336,27 +350,27 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlCSRMatr uint counters[128]; uint NZperRow[128]; uint index, baseRow; for(uint i=0; i<number_of_groups; i++) { baseRow = block_info[4*i]; index = block_info[4*i+3]; for(uint j=0; j<block_info[4*i+1]; j++) { NZperRow[j] = mat.row_offsets[block_info[4*i]+j+1] - mat.row_offsets[block_info[4*i]+j]; if(j<block_info[4*i+1]-1) { threadsPerRow[j] = threads_per_row[block_info[4*i]+j+1] - threads_per_row[block_info[4*i]+j]; } else threadsPerRow[j] = cudaBlockSize - threads_per_row[block_info[4*i]+j]; for(uint i=0; i<numberOfGroups; i++) { baseRow = blockInfo[4*i]; index = blockInfo[4*i+3]; for(uint j=0; j<blockInfo[4*i+1]; j++) { NZperRow[j] = mat.row_offsets[blockInfo[4*i]+j+1] - mat.row_offsets[blockInfo[4*i]+j]; if(j<blockInfo[4*i+1]-1) { threadsPerRow[j] = threads[blockInfo[4*i]+j+1] - threads[blockInfo[4*i]+j]; } else threadsPerRow[j] = cudaBlockSize - threads[blockInfo[4*i]+j]; counters[j] = 0; } for(uint k=0; k<block_info[4*i+2]; k++) { for(uint j=0; j<block_info[4*i+1]; j++) { for(uint k=0; k<blockInfo[4*i+2]; k++) { for(uint j=0; j<blockInfo[4*i+1]; j++) { for(uint l=0; l<threadsPerRow[j]; l++) { if(counters[j]<NZperRow[j]) { nonzero_elements[index] = mat.nonzero_elements[ mat.row_offsets[baseRow+j]+counters[j] ]; nonzeroElements[index] = mat.nonzeroElements[ mat.row_offsets[baseRow+j]+counters[j] ]; columns[index] = mat.columns[ mat.row_offsets[baseRow+j]+counters[j] ]; } else { columns[index] = -1; nonzero_elements[index] = 0.0; nonzeroElements[index] = 0.0; } counters[j]++; index++; Loading Loading @@ -385,7 +399,7 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlAdaptiv targetNonzeroesPerGroup = adaptiveRgCSRMatrix.targetNonzeroesPerGroup; cudaBlockSize = adaptiveRgCSRMatrix.cudaBlockSize; last_nonzero_element = adaptiveRgCSRMatrix.last_nonzero_element; number_of_groups = adaptiveRgCSRMatrix.number_of_groups; numberOfGroups = adaptiveRgCSRMatrix.numberOfGroups; if( ! this -> setSize( adaptiveRgCSRMatrix. getSize() ) ) Loading @@ -401,15 +415,14 @@ bool tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: copyFrom( const tnlAdaptiv return false; artificial_zeros = total_elements - adaptiveRgCSRMatrix. getNonzeroElements(); nonzero_elements = adaptiveRgCSRMatrix.nonzero_elements; nonzeroElements = adaptiveRgCSRMatrix.nonzeroElements; columns = adaptiveRgCSRMatrix.columns; block_info = adaptiveRgCSRMatrix.block_info; threads_per_row = adaptiveRgCSRMatrix.threads_per_row; blockInfo = adaptiveRgCSRMatrix.blockInfo; threads = adaptiveRgCSRMatrix.threads; return true; }; template< typename Real, tnlDevice Device, typename Index > void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLongVector< Real, Device, Index >& vec, tnlLongVector< Real, Device, Index >& result ) const Loading @@ -426,7 +439,40 @@ void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLo if( Device == tnlHost ) { abort(); int idx[TB_SIZE]; Real psum[TB_SIZE]; //partial sums for each thread int limits[MAX_ROWS + 1]; //indices of first threads for each row + index of first unused thread Real results[MAX_ROWS]; for(int group = 0; group < mat.numberOfGroups; group++) { //for each group of rows for(int thread = 0; thread < mat.blockInfo[group].numUsedThreads; thread++) { //for each used thread in group idx[thread] = mat.blockInfo[group].idxFirstValue + thread; psum[thread] = 0; for(int j = 0; j < mat.blockInfo[group].numRounds; j++) { psum[thread] += mat.nonzeroElements[idx[thread]] * in[mat.columns[idx[thread]]]; idx[j] += TB_SIZE; } } for(int thread = 0; thread < mat.blockInfo[group].numRows; thread++) { //for threads corresponding to rows in group limits[thread] = mat.threads[mat.blockInfo[group].idxFirstRow + thread]; //make a local copy of info about threads } limits[mat.blockInfo[group].numRows] = mat.blockInfo[group].numUsedThreads; //for convenience, add the index of first unused row //reduction of partial sums and writing to the output for(int thread = 0; thread < mat.blockInfo[group].numRows; thread++) { //for threads corresponding to rows in group results[thread] = 0; for(int j = limits[thread]; j < limits[thread+1]; j++) { //sum up partial sums belonging to that row results[thread] += psum[j]; } out[mat.blockInfo[group].idxFirstRow + thread] = results[thread]; } } } if( Device == tnlCuda ) { Loading @@ -435,7 +481,7 @@ void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLo const Index size = this -> getSize(); Index desGridSize; desGridSize = this->number_of_groups; desGridSize = this->numberOfGroups; desGridSize = (desGridSize < 16384) ? desGridSize : 16384; cudaThreadSynchronize(); Loading @@ -444,11 +490,11 @@ void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLo AdaptiveRgCSRMatrixVectorProductKernel< Real, Index, false ><<< gridDim, blockDim >>>( result. getVector(), vec. getVector(), nonzero_elements. getVector(), nonzeroElements. getVector(), columns. getVector(), block_info. getVector(), threads_per_row. getVector(), number_of_groups ); blockInfo. getVector(), threads. getVector(), numberOfGroups ); cudaThreadSynchronize(); CHECK_CUDA_ERROR; #else Loading @@ -465,7 +511,7 @@ void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLo // str << "Structure of tnlAdaptiveRgCSRMatrix" << endl; // str << "Matrix name:" << this -> getName() << endl; // str << "Matrix size:" << this -> getSize() << endl; // str << "Allocated elements:" << nonzero_elements. getSize() << endl; // str << "Allocated elements:" << nonzeroElements. getSize() << endl; // str << "Matrix blocks: " << block_offsets. getSize() << endl; // // Index print_lines = lines; Loading @@ -490,7 +536,7 @@ void tnlAdaptiveRgCSRMatrix< Real, Device, Index > :: vectorProduct( const tnlLo // k < block_offsets. getElement( i + 1 ); // k ++ ) // str << setprecision( 5 ) << setw( 8 ) // << nonzero_elements. getElement( k ) << " "; // << nonzeroElements. getElement( k ) << " "; // str << endl << "Columns: "; // for( Index k = block_offsets. getElement( i ); // k < block_offsets. getElement( i + 1 ); Loading