diff --git a/src/TNL/Matrices/CSR.h b/src/TNL/Matrices/CSR.h index 348f01592cf13a7061bc23b70d62e0a9405bfb07..c8f87553ab7bec948186ce4fd6f7ba5c1b9ef9d8 100644 --- a/src/TNL/Matrices/CSR.h +++ b/src/TNL/Matrices/CSR.h @@ -45,17 +45,17 @@ private: public: using RealType = Real; - //typedef Real RealType; - typedef Device DeviceType; - typedef Index IndexType; + using DeviceType = Device; + using IndexType = Index; typedef typename Sparse< RealType, DeviceType, IndexType >:: CompressedRowLengthsVector CompressedRowLengthsVector; typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView; typedef CSR< Real, Device, Index > ThisType; typedef CSR< Real, Devices::Host, Index > HostType; typedef CSR< Real, Devices::Cuda, Index > CudaType; typedef Sparse< Real, Device, Index > BaseType; - typedef typename BaseType::MatrixRow MatrixRow; + //typedef typename BaseType::MatrixRow MatrixRow; + using MatrixRow = typename BaseType::MatrixRow; using ConstMatrixRow = typename BaseType::ConstMatrixRow; //using typename BaseType::ConstMatrixRow; //typedef SparseRow< const RealType, const IndexType > ConstMatrixRow; diff --git a/src/TNL/Matrices/CSR_impl.h b/src/TNL/Matrices/CSR_impl.h index e8324de778f67ca6ceb2786c7643d1a8cf6d4687..ad24fc699b1030a8b60892ac1941bcd4e19d04b0 100644 --- a/src/TNL/Matrices/CSR_impl.h +++ b/src/TNL/Matrices/CSR_impl.h @@ -133,16 +133,17 @@ Index CSR< Real, Device, Index >::getRowLengthFast( const IndexType row ) const #ifdef HAVE_CUDA // TODO: move to SparseRow -template< typename MatrixRow > +template< typename MatrixRow, typename Index > __global__ -void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result ) +void getNonZeroRowLengthCudaKernel( const MatrixRow row, Index* result ) { int threadId = blockIdx.x * blockDim.x + threadIdx.x; if( threadId == 0 ) { - result = row->getNonZeroElementsCount(); + *result = row.getNonZeroElementsCount(); } } +#endif template< typename Real, typename Device, @@ -156,34 +157,31 @@ Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) con } if( std::is_same< DeviceType, Devices::Cuda >::value ) { - ConstMatrixRow matrixRow = this->getRow( row ); - IndexType resultHost; + IndexType *cols = new IndexType[4]; + std::cout << "crash1" << std::endl; + RealType *vals = new RealType[4]; + std::cout << "crash2" << std::endl; + for( int i = 0; i < 4; i++ ) + { + cols[i] = i; + vals[i] = 1.0; + } + std::cout << "crash3" << std::endl; + ConstMatrixRow matrixRow(cols, vals, 4, 1); // = this->getRow( row ); // If the program even compiles, this line fails because a segfault is thrown on the first line of getRow() + std::cout << "crash4" << std::endl; + IndexType resultHost ( 0 ); IndexType* resultCuda = Devices::Cuda::passToDevice( resultHost ); - getNonZeroRowLengthCudaKernel<<< 1, 1 >>>( row, &resultCuda ); - resultHost = Devices::Cuda::passFromDevice( resultCuda ); + std::cout << "resultCuda = " << resultCuda << std::endl; + // PROBLEM: If thee second parameter of getNonZeroRowLengthCudaKernel is '&resultCuda', the following issue is thrown: + // 'error: no instance of function template "TNL::Matrices::getNonZeroRowLengthCudaKernel" matches the argument list' + /*TNL::Matrices::*/getNonZeroRowLengthCudaKernel< ConstMatrixRow, IndexType ><<< 1, 1 >>>( matrixRow, resultCuda ); // matrixRow works fine, tested them both separately + std::cout << "resultCuda = " << resultCuda << std::endl; + std::cout << "crash5" << std::endl; + resultHost = Devices::Cuda::passFromDevice( resultCuda ); // This causes a crash: Illegal memory address. + std::cout << "crash6" << std::endl; Devices::Cuda::freeFromDevice( resultCuda ); return resultHost; } - - // getRow() was throwing segmentation faults. - // FOR THIS TO WORK, I had to change getRow() from [ rowIndex ] to .getElement( rowIndex ). - - - // THE FOLLOWING throws: /home/lukas/tnl-dev/src/TNL/ParallelFor.h(92): error: identifier "" is undefined in device code -// static IndexType elementCount ( 0 ); -// ConstMatrixRow matrixRow = this->getRow( row ); -// -// elementCount = 0; // Make sure it is reset. Without this seemingly useless step, it returned incorrect values. -// -// auto computeNonZeros = [matrixRow] __cuda_callable__ ( IndexType i ) mutable -// { -// if( matrixRow.getElementValue( i ) != 0.0 ) -// elementCount++; -// }; -// -// ParallelFor< DeviceType >::exec( (IndexType) 0, matrixRow.getLength(), computeNonZeros ); -// -// return elementCount; } template< typename Real, @@ -195,7 +193,6 @@ Index CSR< Real, Device, Index >::getNonZeroRowLengthFast( const IndexType row ) ConstMatrixRow matrixRow = this->getRow( row ); return matrixRow.getNonZeroElementsCount(); } -#endif template< typename Real, typename Device, diff --git a/src/TNL/Matrices/SparseRow.h b/src/TNL/Matrices/SparseRow.h index 6407d4a526303c70124dd03be48d72e9eab0e565..fac855eae71a26cdb4dbf62f927f12d0f24b5af1 100644 --- a/src/TNL/Matrices/SparseRow.h +++ b/src/TNL/Matrices/SparseRow.h @@ -55,9 +55,6 @@ class SparseRow __cuda_callable__ Index getLength() const; -// __global__ -// void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result ); - __cuda_callable__ Index getNonZeroElementsCount() const; diff --git a/src/TNL/Matrices/SparseRow_impl.h b/src/TNL/Matrices/SparseRow_impl.h index 31d133c618a94a9804c79ff45849e46e9c95c77c..3157b6c963515434d1cb1ed478b6125236e2600a 100644 --- a/src/TNL/Matrices/SparseRow_impl.h +++ b/src/TNL/Matrices/SparseRow_impl.h @@ -113,7 +113,8 @@ getLength() const } //template< typename MatrixRow > -//__global__ void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result ) +//__global__ +//void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result ) //{ // int threadId = blockIdx.x * blockDim.x + threadIdx.x; // if( threadId == 0 ) @@ -130,52 +131,16 @@ getNonZeroElementsCount() const { using NonConstIndex = typename std::remove_const< Index >::type; - // If this is static, it will trigger a illegal memory address - // How to get it into the lambda function? NonConstIndex elementCount ( 0 ); - - -// using CudaType = typename TNL::Devices::Cuda; -// using HostType = typename TNL::Devices::Host; -// -// -// // elementCount = 0; // Only if it is static. Make sure it is reset. Without this seemingly useless step, it returned incorrect values. -// -// // PROBLEM: Lambda function with __cuda_callable__ CANNOT pass values by reference!! -// // INCORRECT ASSUMPTION!! PROBLEM: Lambda function which takes in anything via capture list, cannot return anything. (Maybe dont capture anything? pass this->values by parameter and return count?) -// // WRONG: https://stackoverflow.com/questions/38835154/lambda-function-capture-a-variable-vs-return-value?fbclid=IwAR0ybDD83LRWxkJsrcoSmGW2mbsMfhywmdZQkleqyjU-NOIwqkz8woihfXs -// auto computeNonZeros = [=] __cuda_callable__ ( NonConstIndex i /*, NonConstIndex *elementCount*/ ) mutable -// { -// //std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0/n"; -// if( this->values[ i * step ] != 0.0 ) -// elementCount++;//*elementCount++; -// -// //std::cout << "End of lambda elementCount = " << elementCount << "/n"; -// //return elementCount; -// }; -// -// -// // Decide which ParallelFor will be executed, either Host or Cuda. -// if( deviceType == TNL::String( "Devices::Host" ) ) -// { -// ParallelFor< HostType >::exec( ( NonConstIndex ) 0, length, computeNonZeros /*, &elementCount*/ ); -// } -// -// else if( deviceType == TNL::String( "Cuda" ) ) -// { -// ParallelFor< CudaType >::exec( ( NonConstIndex ) 0, length, computeNonZeros /*, &elementCount*/ ); -// } - -// // THE FOLLOWING doesn't work on GPU for( NonConstIndex i = 0; i < length; i++ ) { - std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0" << std::endl; +// std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0" << std::endl; if( this->values[ i * step ] != 0.0 ) // Returns the same amount of elements in a row as does getRowLength() in ChunkedEllpack. WHY? elementCount++; } - std::cout << "Element Count = " << elementCount << "\n"; +// std::cout << "Element Count = " << elementCount << "\n"; return elementCount; }