From 119f487f71b588cebb8953d9194190dd624e2cd6 Mon Sep 17 00:00:00 2001 From: Lukas Cejka <lukas.ostatek@gmail.com> Date: Sun, 2 Dec 2018 12:22:46 +0100 Subject: [PATCH] Changed implementation of getting Non-Zero elements. Added problem to comments. Committing for backup purposes. --- src/TNL/Matrices/SparseRow_impl.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/TNL/Matrices/SparseRow_impl.h b/src/TNL/Matrices/SparseRow_impl.h index 2729e52c81..f87194df08 100644 --- a/src/TNL/Matrices/SparseRow_impl.h +++ b/src/TNL/Matrices/SparseRow_impl.h @@ -122,12 +122,16 @@ getNonZeroElementsCount( TNL::String deviceType ) const using HostType = typename TNL::Devices::Host; using NonConstIndex = typename std::remove_const< Index >::type; -// using DeviceType = typename TNL::Matrices::Matrix::DeviceType; - static NonConstIndex elementCount ( 0 ); + // If this is static, it will trigger an illegal memory address + // How to get it into the lambda function? + NonConstIndex elementCount ( 0 ); - elementCount = 0; // Make sure it is reset. Without this seemingly useless step, it returned incorrect values. + // elementCount = 0; // Only if it is static. Make sure it is reset. Without this seemingly useless step, it returned incorrect values. + + // PROBLEM: Lambda function with __cuda_callable__ CANNOT pass values by reference!! + // PROBLEM: Lambda function which takes in anything via capture list, cannot return anything. (Maybe don't capture anything? pass this->values by parameter and return count?) auto computeNonZeros = [=] __cuda_callable__ ( NonConstIndex i ) mutable { //std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0/n"; @@ -137,9 +141,10 @@ getNonZeroElementsCount( TNL::String deviceType ) const //std::cout << "End of lambda elementCount = " << elementCount << "/n"; }; + + // Decide which ParallelFor will be executed, either Host or Cuda. 
if( deviceType == TNL::String( "Devices::Host" ) ) { - // Where to end the loop? the variable "length" seems to lead to illegal memory access. ??Because length is the length of the entire row, we want just the length of values.?? ParallelFor< HostType >::exec( ( NonConstIndex ) 0, length, computeNonZeros ); } @@ -147,8 +152,6 @@ getNonZeroElementsCount( TNL::String deviceType ) const { ParallelFor< CudaType >::exec( ( NonConstIndex ) 0, length, computeNonZeros ); } - - // The ParallelFor::exec() function needs a < DeviceType >, how to get this into SparseRow? // // THE FOLLOWING doesn't work on GPU @@ -160,6 +163,7 @@ getNonZeroElementsCount( TNL::String deviceType ) const // } // std::cout << "Element Count = " << elementCount << "\n"; + return elementCount; } -- GitLab