Commit 7f24c2ed authored by Lukas Cejka
Browse files

Changed implementation of method. Committing for backup purposes.

parent 119f487f
Loading
Loading
Loading
Loading
+4 −12
Original line number Diff line number Diff line
@@ -137,8 +137,7 @@ template< typename Real,
Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) const
{    
    ConstMatrixRow matrixRow = this->getRow( row );
    IndexType count = matrixRow.getNonZeroElementsCount( TNL::String( Device::getDeviceType() ) );
//    return count;
    return matrixRow.getNonZeroElementsCount( TNL::String( Device::getDeviceType() ) );
    // getRow() was throwing segmentation faults.
    // FOR THIS TO WORK, I had to change getRow() from [ rowIndex ] to .getElement( rowIndex ).
    
@@ -459,19 +458,12 @@ typename CSR< Real, Device, Index >::ConstMatrixRow
CSR< Real, Device, Index >::
getRow( const IndexType rowIndex ) const
{
    const IndexType rowOffset = this->rowPointers.getElement( rowIndex );
    const IndexType rowLength = this->rowPointers.getElement( rowIndex + 1 ) - rowOffset;
    const IndexType rowOffset = this->rowPointers[ rowIndex ];
    const IndexType rowLength = this->rowPointers[ rowIndex + 1 ] - rowOffset;
    return ConstMatrixRow( &this->columnIndexes[ rowOffset ],
                           &this->values[ rowOffset ],
                           rowLength,
                           1 );
    
//   const IndexType rowOffset = this->rowPointers[ rowIndex ];
//   const IndexType rowLength = this->rowPointers[ rowIndex + 1 ] - rowOffset;
//   return ConstMatrixRow( &this->columnIndexes[ rowOffset ],
//                          &this->values[ rowOffset ],
//                          rowLength,
//                          1 );
}

template< typename Real,
+36 −33
Original line number Diff line number Diff line
@@ -118,9 +118,6 @@ Index
SparseRow< Real, Index >::
getNonZeroElementsCount( TNL::String deviceType ) const
{
    using CudaType = typename TNL::Devices::Cuda;
    using HostType = typename TNL::Devices::Host;
    
    using NonConstIndex = typename std::remove_const< Index >::type;
    
    // If this is static, it will trigger an illegal memory address
@@ -128,41 +125,47 @@ getNonZeroElementsCount( TNL::String deviceType ) const
    NonConstIndex elementCount ( 0 );
    
    
    // elementCount = 0; // Only if it is static. Make sure it is reset. Without this seemingly useless step, it returned incorrect values.
    
    // PROBLEM: Lambda function with __cuda_callable__ CANNOT pass values by reference!!
    // PROBLEM: Lambda function which takes in anything via capture list, cannot return anything. (Maybe dont capture anything? pass this->values by parameter and return count?)
    auto computeNonZeros = [=] __cuda_callable__ ( NonConstIndex i ) mutable
    {
        //std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0/n";
        if( this->values[ i * step ] != 0.0 )
            elementCount++;
        
        //std::cout << "End of lambda elementCount = " << elementCount << "/n";
    };
//    using CudaType = typename TNL::Devices::Cuda;
//    using HostType = typename TNL::Devices::Host;
//    
//    
//    // elementCount = 0; // Only if it is static. Make sure it is reset. Without this seemingly useless step, it returned incorrect values.
//    
//    // PROBLEM: Lambda function with __cuda_callable__ CANNOT pass values by reference!!
//    // INCORRECT ASSUMPTION!! PROBLEM: Lambda function which takes in anything via capture list, cannot return anything. (Maybe dont capture anything? pass this->values by parameter and return count?)
//        // WRONG: https://stackoverflow.com/questions/38835154/lambda-function-capture-a-variable-vs-return-value?fbclid=IwAR0ybDD83LRWxkJsrcoSmGW2mbsMfhywmdZQkleqyjU-NOIwqkz8woihfXs
//    auto computeNonZeros = [=] __cuda_callable__ ( NonConstIndex i /*, NonConstIndex *elementCount*/ ) mutable
//    {
//        //std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0/n";
//        if( this->values[ i * step ] != 0.0 )
//            elementCount++;//*elementCount++;
//        
//        //std::cout << "End of lambda elementCount = " << elementCount << "/n";
//        //return elementCount;
//    };
//    
//    
//    // Decide which ParallelFor will be executed, either Host or Cuda.
//    if( deviceType == TNL::String( "Devices::Host" ) )
//    {
//        ParallelFor< HostType >::exec( ( NonConstIndex ) 0, length, computeNonZeros /*, &elementCount*/ );
//    }
//    
//    else if( deviceType == TNL::String( "Cuda" ) )
//    {
//        ParallelFor< CudaType >::exec( ( NonConstIndex ) 0, length, computeNonZeros /*, &elementCount*/ );
//    }
   
    
    // Decide which ParallelFor will be executed, either Host or Cuda.
    if( deviceType == TNL::String( "Devices::Host" ) )
    {
        ParallelFor< HostType >::exec( ( NonConstIndex ) 0, length, computeNonZeros );
    }
    
    else if( deviceType == TNL::String( "Cuda" ) )
//    // THE FOLLOWING doesn't work on GPU
    for( NonConstIndex i = 0; i < length; i++ )
    {
        ParallelFor< CudaType >::exec( ( NonConstIndex ) 0, length, computeNonZeros );
        std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0" << std::endl;
        if( this->values[ i * step ] != 0.0 ) // Returns the same amount of elements in a row as does getRowLength() in ChunkedEllpack. WHY?
            elementCount++;
    }
    
    
//    // THE FOLLOWING doesn't work on GPU
//    for( NonConstIndex i = 0; i < length; i++ )
//    {
//        std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0" << std::endl;
//        if( this->values[ i * step ] != 0.0 ) // Returns the same amount of elements in a row as does getRowLength() in ChunkedEllpack. WHY?
//            elementCount++;
//    }
    
    // std::cout << "Element Count = " << elementCount << "\n";
     std::cout << "Element Count = " << elementCount << "\n";
    
    return elementCount;
}