Loading src/TNL/Matrices/AdEllpack_impl.h +37 −32 Original line number Diff line number Diff line Loading @@ -1131,9 +1131,10 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector, if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) ) { IndexType end = ( warpIdx + 1 ) << 5; IndexType elementPtr = threadIdx.x + 1; globalIdx++; while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && while( globalIdx < end && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] ) { temp[ threadIdx.x ] += temp[ elementPtr ]; Loading @@ -1158,12 +1159,10 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector, IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) { return; } const int blockSize = 128; Real* temp = Cuda::getSharedMemory< Real >(); Real* temp = Devices::Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; Loading @@ -1178,7 +1177,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector, if( warpLoad < 4 ) { // While the helpful index of the warp localLoad is less than localLoad and the element index isn't // out of the matrix (would return the number of cols of the matrix) // out of the matrix (would return the number of columns of the matrix) while( i < warpLoad && this->columnIndexes[ elementPtr ] < this->getColumns() ) { Loading @@ -1191,25 +1190,20 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector, else // If the localLoad of the warp is unrollable. { // Is the warpLoad divisible by 4 (4 - 1 for binary AND). // This will return how far it is from being divisble: // This will return how far it is from being divisible: // For 0 & 3 = 0; 1 & 3 = 1; 2 & 3 = 2; 3 & 3 = 3; 4 & 3 = 0, etc. IndexType alignUnroll = warpLoad & 3; // While the result of divisibility by 4 has not reached the closest point where it is divisble by 4. // While the result of divisibility by 4 has not reached the point where it is divisble by 4. while( alignUnroll != 0 && alignUnroll != 4 && this->columnIndexes[ elementPtr ] < this->getColumns() ) { temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; elementPtr += this->warpSize; i++; // If alignUnroll is smaller than or equal to 2, decrement, else increment. // alignUnroll will be from 0, 1, 2, 3, 4 // 0 and 4 means that it is divisible by 4. // That leaves 1, 2, 3: we will decide to go down for alignUnroll <= 2 and up for = 3. // This will ensure that we will get to the closest possible index that is divisible by 4, // since the i index is always incremented, i.e. moved to the correct position for the unroll. alignUnroll <= 2 ? alignUnroll-- : alignUnroll++; // If alignUnroll not 0 (i.e. no. of NNZ elements is not divisible by 4), decrement alignUnroll until it is. // This will ensure that the i starting index with be incremented to the correct starting position for the unroll. alignUnroll--; } } Loading @@ -1231,9 +1225,10 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector, // What is going on here? DOCUMENT if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) ) { IndexType end = ( warpIdx + 1 ) << 5; IndexType elementPtr = threadIdx.x + 1; globalIdx++; while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && while( globalIdx < end && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] ) { temp[ threadIdx.x ] += temp[ elementPtr ]; Loading Loading @@ -1261,7 +1256,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector, return; const int blockSize = 128; Real* temp = Cuda::getSharedMemory< Real >(); Real* temp = Devices::Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; Loading @@ -1270,6 +1265,15 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector, IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx; const IndexType warpLoad = this->localLoad[ warpIdx ]; // for( IndexType i = 0; i < warpLoad; i++ ) // { // if( this->columnIndexes[ elementPtr ] < this->getColumns() ) // { // temp[ threadIdx.x ] += this->values[ elementPtr] * inVector[ this->columnIndexes[ elementPtr ] ]; // elementPtr += this->warpSize; // } // } if( warpLoad < 8 ) { while( i < warpLoad && Loading @@ -1285,13 +1289,12 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector, IndexType alignUnroll = warpLoad & 7; while( alignUnroll != 0 && alignUnroll != 8 && this->columnIndexes[ elementPtr ] < this->getColumns() ) { temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; elementPtr += this->warpSize; i++; alignUnroll <= 4 ? alignUnroll-- : alignUnroll++; alignUnroll--; } } Loading @@ -1310,9 +1313,10 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector, if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) ) { IndexType end = ( warpIdx + 1 ) << 5; IndexType elementPtr = threadIdx.x + 1; globalIdx++; while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && while( globalIdx < end && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] ) { temp[ threadIdx.x ] += temp[ elementPtr ]; Loading Loading @@ -1340,7 +1344,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector, return; const int blockSize = 96; Real* temp = Cuda::getSharedMemory< Real >(); Real* temp = Devices::Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; Loading @@ -1362,14 +1366,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector, else { IndexType alignUnroll = warpLoad & 15; while( alignUnroll != 0 && alignUnroll != 16 && this->columnIndexes[ elementPtr ] < this->getColumns() ) { temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; elementPtr += this->warpSize; i++; alignUnroll <= 8 ? alignUnroll-- : alignUnroll++; alignUnroll--; } } Loading @@ -1388,9 +1392,10 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector, if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) ) { IndexType end = ( warpIdx + 1 ) << 5; IndexType elementPtr = threadIdx.x + 1; globalIdx++; while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && while( globalIdx < end && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] ) { temp[ threadIdx.x ] += temp[ elementPtr ]; Loading Loading
src/TNL/Matrices/AdEllpack_impl.h +37 −32 Original line number Diff line number Diff line Loading @@ -1131,9 +1131,10 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector, if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) ) { IndexType end = ( warpIdx + 1 ) << 5; IndexType elementPtr = threadIdx.x + 1; globalIdx++; while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && while( globalIdx < end && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] ) { temp[ threadIdx.x ] += temp[ elementPtr ]; Loading @@ -1158,12 +1159,10 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector, IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) { return; } const int blockSize = 128; Real* temp = Cuda::getSharedMemory< Real >(); Real* temp = Devices::Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; Loading @@ -1178,7 +1177,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector, if( warpLoad < 4 ) { // While the helpful index of the warp localLoad is less than localLoad and the element index isn't // out of the matrix (would return the number of cols of the matrix) // out of the matrix (would return the number of columns of the matrix) while( i < warpLoad && this->columnIndexes[ elementPtr ] < this->getColumns() ) { Loading @@ -1191,25 +1190,20 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector, else // If the localLoad of the warp is unrollable. { // Is the warpLoad divisible by 4 (4 - 1 for binary AND). // This will return how far it is from being divisble: // This will return how far it is from being divisible: // For 0 & 3 = 0; 1 & 3 = 1; 2 & 3 = 2; 3 & 3 = 3; 4 & 3 = 0, etc. IndexType alignUnroll = warpLoad & 3; // While the result of divisibility by 4 has not reached the closest point where it is divisble by 4. // While the result of divisibility by 4 has not reached the point where it is divisble by 4. while( alignUnroll != 0 && alignUnroll != 4 && this->columnIndexes[ elementPtr ] < this->getColumns() ) { temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; elementPtr += this->warpSize; i++; // If alignUnroll is smaller than or equal to 2, decrement, else increment. // alignUnroll will be from 0, 1, 2, 3, 4 // 0 and 4 means that it is divisible by 4. // That leaves 1, 2, 3: we will decide to go down for alignUnroll <= 2 and up for = 3. // This will ensure that we will get to the closest possible index that is divisible by 4, // since the i index is always incremented, i.e. moved to the correct position for the unroll. alignUnroll <= 2 ? alignUnroll-- : alignUnroll++; // If alignUnroll not 0 (i.e. no. of NNZ elements is not divisible by 4), decrement alignUnroll until it is. // This will ensure that the i starting index with be incremented to the correct starting position for the unroll. alignUnroll--; } } Loading @@ -1231,9 +1225,10 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector, // What is going on here? DOCUMENT if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) ) { IndexType end = ( warpIdx + 1 ) << 5; IndexType elementPtr = threadIdx.x + 1; globalIdx++; while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && while( globalIdx < end && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] ) { temp[ threadIdx.x ] += temp[ elementPtr ]; Loading Loading @@ -1261,7 +1256,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector, return; const int blockSize = 128; Real* temp = Cuda::getSharedMemory< Real >(); Real* temp = Devices::Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; Loading @@ -1270,6 +1265,15 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector, IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx; const IndexType warpLoad = this->localLoad[ warpIdx ]; // for( IndexType i = 0; i < warpLoad; i++ ) // { // if( this->columnIndexes[ elementPtr ] < this->getColumns() ) // { // temp[ threadIdx.x ] += this->values[ elementPtr] * inVector[ this->columnIndexes[ elementPtr ] ]; // elementPtr += this->warpSize; // } // } if( warpLoad < 8 ) { while( i < warpLoad && Loading @@ -1285,13 +1289,12 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector, IndexType alignUnroll = warpLoad & 7; while( alignUnroll != 0 && alignUnroll != 8 && this->columnIndexes[ elementPtr ] < this->getColumns() ) { temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; elementPtr += this->warpSize; i++; alignUnroll <= 4 ? alignUnroll-- : alignUnroll++; alignUnroll--; } } Loading @@ -1310,9 +1313,10 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector, if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) ) { IndexType end = ( warpIdx + 1 ) << 5; IndexType elementPtr = threadIdx.x + 1; globalIdx++; while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && while( globalIdx < end && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] ) { temp[ threadIdx.x ] += temp[ elementPtr ]; Loading Loading @@ -1340,7 +1344,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector, return; const int blockSize = 96; Real* temp = Cuda::getSharedMemory< Real >(); Real* temp = Devices::Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; Loading @@ -1362,14 +1366,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector, else { IndexType alignUnroll = warpLoad & 15; while( alignUnroll != 0 && alignUnroll != 16 && this->columnIndexes[ elementPtr ] < this->getColumns() ) { temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; elementPtr += this->warpSize; i++; alignUnroll <= 8 ? alignUnroll-- : alignUnroll++; alignUnroll--; } } Loading @@ -1388,9 +1392,10 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector, if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) ) { IndexType end = ( warpIdx + 1 ) << 5; IndexType elementPtr = threadIdx.x + 1; globalIdx++; while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && while( globalIdx < end && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] ) { temp[ threadIdx.x ] += temp[ elementPtr ]; Loading