From 7f24c2edffe7ba7c98fc7090f511f48d3bccf10c Mon Sep 17 00:00:00 2001 From: Lukas Cejka <lukas.ostatek@gmail.com> Date: Mon, 3 Dec 2018 23:53:30 +0100 Subject: [PATCH] Changed implementation of method. Committing for backup purposes. --- src/TNL/Matrices/CSR_impl.h | 16 ++----- src/TNL/Matrices/SparseRow_impl.h | 69 ++++++++++++++++--------------- 2 files changed, 40 insertions(+), 45 deletions(-) diff --git a/src/TNL/Matrices/CSR_impl.h b/src/TNL/Matrices/CSR_impl.h index 82cc1e82ee..de95c0d78d 100644 --- a/src/TNL/Matrices/CSR_impl.h +++ b/src/TNL/Matrices/CSR_impl.h @@ -135,10 +135,9 @@ template< typename Real, typename Device, typename Index > Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) const -{ +{ ConstMatrixRow matrixRow = this->getRow( row ); - IndexType count = matrixRow.getNonZeroElementsCount( TNL::String( Device::getDeviceType() ) ); -// return count; + return matrixRow.getNonZeroElementsCount( TNL::String( Device::getDeviceType() ) ); // getRow() was throwing segmentation faults. // FOR THIS TO WORK, I had to change getRow() from [ rowIndex ] to .getElement( rowIndex ). 
@@ -459,19 +458,12 @@ typename CSR< Real, Device, Index >::ConstMatrixRow CSR< Real, Device, Index >:: getRow( const IndexType rowIndex ) const { - const IndexType rowOffset = this->rowPointers.getElement( rowIndex ); - const IndexType rowLength = this->rowPointers.getElement( rowIndex + 1 ) - rowOffset; + const IndexType rowOffset = this->rowPointers[ rowIndex ]; + const IndexType rowLength = this->rowPointers[ rowIndex + 1 ] - rowOffset; return ConstMatrixRow( &this->columnIndexes[ rowOffset ], &this->values[ rowOffset ], rowLength, 1 ); - -// const IndexType rowOffset = this->rowPointers[ rowIndex ]; -// const IndexType rowLength = this->rowPointers[ rowIndex + 1 ] - rowOffset; -// return ConstMatrixRow( &this->columnIndexes[ rowOffset ], -// &this->values[ rowOffset ], -// rowLength, -// 1 ); } template< typename Real, diff --git a/src/TNL/Matrices/SparseRow_impl.h b/src/TNL/Matrices/SparseRow_impl.h index f87194df08..14888669dd 100644 --- a/src/TNL/Matrices/SparseRow_impl.h +++ b/src/TNL/Matrices/SparseRow_impl.h @@ -118,9 +118,6 @@ Index SparseRow< Real, Index >:: getNonZeroElementsCount( TNL::String deviceType ) const { - using CudaType = typename TNL::Devices::Cuda; - using HostType = typename TNL::Devices::Host; - using NonConstIndex = typename std::remove_const< Index >::type; // If this is static, it will trigger a illegal memory address @@ -128,41 +125,47 @@ getNonZeroElementsCount( TNL::String deviceType ) const NonConstIndex elementCount ( 0 ); - // elementCount = 0; // Only if it is static. Make sure it is reset. Without this seemingly useless step, it returned incorrect values. +// using CudaType = typename TNL::Devices::Cuda; +// using HostType = typename TNL::Devices::Host; +// +// +// // elementCount = 0; // Only if it is static. Make sure it is reset. Without this seemingly useless step, it returned incorrect values. +// +// // PROBLEM: Lambda function with __cuda_callable__ CANNOT pass values by reference!! +// // INCORRECT ASSUMPTION!! 
PROBLEM: Lambda function which takes in anything via capture list, cannot return anything. (Maybe dont capture anything? pass this->values by parameter and return count?) +// // WRONG: https://stackoverflow.com/questions/38835154/lambda-function-capture-a-variable-vs-return-value?fbclid=IwAR0ybDD83LRWxkJsrcoSmGW2mbsMfhywmdZQkleqyjU-NOIwqkz8woihfXs +// auto computeNonZeros = [=] __cuda_callable__ ( NonConstIndex i /*, NonConstIndex *elementCount*/ ) mutable +// { +// //std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0/n"; +// if( this->values[ i * step ] != 0.0 ) +// elementCount++;//*elementCount++; +// +// //std::cout << "End of lambda elementCount = " << elementCount << "/n"; +// //return elementCount; +// }; +// +// +// // Decide which ParallelFor will be executed, either Host or Cuda. +// if( deviceType == TNL::String( "Devices::Host" ) ) +// { +// ParallelFor< HostType >::exec( ( NonConstIndex ) 0, length, computeNonZeros /*, &elementCount*/ ); +// } +// +// else if( deviceType == TNL::String( "Cuda" ) ) +// { +// ParallelFor< CudaType >::exec( ( NonConstIndex ) 0, length, computeNonZeros /*, &elementCount*/ ); +// } + - // PROBLEM: Lambda function with __cuda_callable__ CANNOT pass values by reference!! - // PROBLEM: Lambda function which takes in anything via capture list, cannot return anything. (Maybe dont capture anything? pass this->values by parameter and return count?) - auto computeNonZeros = [=] __cuda_callable__ ( NonConstIndex i ) mutable +// // THE FOLLOWING doesn't work on GPU + for( NonConstIndex i = 0; i < length; i++ ) { - //std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0/n"; - if( this->values[ i * step ] != 0.0 ) + std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0" << std::endl; + if( this->values[ i * step ] != 0.0 ) // Returns the same amount of elements in a row as does getRowLength() in ChunkedEllpack. WHY? 
elementCount++; - - //std::cout << "End of lambda elementCount = " << elementCount << "/n"; - }; - - - // Decide which ParallelFor will be executed, either Host or Cuda. - if( deviceType == TNL::String( "Devices::Host" ) ) - { - ParallelFor< HostType >::exec( ( NonConstIndex ) 0, length, computeNonZeros ); } - else if( deviceType == TNL::String( "Cuda" ) ) - { - ParallelFor< CudaType >::exec( ( NonConstIndex ) 0, length, computeNonZeros ); - } - - -// // THE FOLLOWING doesn't work on GPU -// for( NonConstIndex i = 0; i < length; i++ ) -// { -// std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0" << std::endl; -// if( this->values[ i * step ] != 0.0 ) // Returns the same amount of elements in a row as does getRowLength() in ChunkedEllpack. WHY? -// elementCount++; -// } - - // std::cout << "Element Count = " << elementCount << "\n"; + std::cout << "Element Count = " << elementCount << "\n"; return elementCount; } -- GitLab