Commit bd294176 authored by Lukas Cejka's avatar Lukas Cejka
Browse files

Deleted useless code, reformatted present code. Still have issues with...

Deleted useless code, reformatted present code. Still have issues with getNonZeroRowLength. getRow() throws SegFault and so does resultCuda. Commiting for backup purposes.
parent 46652a29
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -45,17 +45,17 @@ private:
public:

   using RealType = Real;
   //typedef Real RealType;
   typedef Device DeviceType;
   typedef Index IndexType;
   using DeviceType = Device;
   using IndexType = Index;
   typedef typename Sparse< RealType, DeviceType, IndexType >:: CompressedRowLengthsVector CompressedRowLengthsVector;
   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
   typedef CSR< Real, Device, Index > ThisType;
   typedef CSR< Real, Devices::Host, Index > HostType;
   typedef CSR< Real, Devices::Cuda, Index > CudaType;
   typedef Sparse< Real, Device, Index > BaseType;
   typedef typename BaseType::MatrixRow MatrixRow;
   //typedef typename BaseType::MatrixRow MatrixRow;
   
   using MatrixRow = typename BaseType::MatrixRow;
   using ConstMatrixRow = typename BaseType::ConstMatrixRow;
   //using typename BaseType::ConstMatrixRow;
   //typedef SparseRow< const RealType, const IndexType > ConstMatrixRow;
+25 −28
Original line number Diff line number Diff line
@@ -133,16 +133,17 @@ Index CSR< Real, Device, Index >::getRowLengthFast( const IndexType row ) const

#ifdef HAVE_CUDA
// TODO: move to SparseRow
template< typename MatrixRow >
template< typename MatrixRow, typename Index >
__global__
void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
void getNonZeroRowLengthCudaKernel( const MatrixRow row, Index* result )
{
   int threadId = blockIdx.x * blockDim.x + threadIdx.x;
   if( threadId == 0 )
   {
      result = row->getNonZeroElementsCount();
      *result = row.getNonZeroElementsCount();
   }
}
#endif

template< typename Real,
          typename Device,
@@ -156,34 +157,31 @@ Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) con
   }
   if( std::is_same< DeviceType, Devices::Cuda >::value )
   {
      ConstMatrixRow matrixRow = this->getRow( row );
      IndexType resultHost;
      IndexType *cols = new IndexType[4];
      std::cout << "crash1" << std::endl;
      RealType *vals = new RealType[4];
      std::cout << "crash2" << std::endl;
      for( int i = 0; i < 4; i++ )
      {
          cols[i] = i;
          vals[i] = 1.0;
      }
      std::cout << "crash3" << std::endl;
      ConstMatrixRow matrixRow(cols, vals, 4, 1); // = this->getRow( row ); // If the program even compiles, this line fails because a segfault is thrown on the first line of getRow()
      std::cout << "crash4" << std::endl;
      IndexType resultHost ( 0 );
      IndexType* resultCuda = Devices::Cuda::passToDevice( resultHost );
      getNonZeroRowLengthCudaKernel<<< 1, 1 >>>( row, &resultCuda );
      resultHost = Devices::Cuda::passFromDevice( resultCuda );
      std::cout << "resultCuda = " << resultCuda << std::endl;
      // PROBLEM: If thee second parameter of getNonZeroRowLengthCudaKernel is '&resultCuda', the following issue is thrown:
      //          'error: no instance of function template "TNL::Matrices::getNonZeroRowLengthCudaKernel" matches the argument list'
      /*TNL::Matrices::*/getNonZeroRowLengthCudaKernel< ConstMatrixRow, IndexType ><<< 1, 1 >>>( matrixRow, resultCuda ); // matrixRow works fine, tested them both separately
      std::cout << "resultCuda = " << resultCuda << std::endl;
      std::cout << "crash5" << std::endl;
      resultHost = Devices::Cuda::passFromDevice( resultCuda ); // This causes a crash: Illegal memory address.
      std::cout << "crash6" << std::endl;
      Devices::Cuda::freeFromDevice( resultCuda );
      return resultHost;
   }
   
    // getRow() was throwing segmentation faults.
    // FOR THIS TO WORK, I had to change getRow() from [ rowIndex ] to .getElement( rowIndex ).
    
    
    // THE FOLLOWING throws: /home/lukas/tnl-dev/src/TNL/ParallelFor.h(92): error: identifier "" is undefined in device code
//    static IndexType elementCount ( 0 );
//    ConstMatrixRow matrixRow = this->getRow( row );
//    
//    elementCount = 0; // Make sure it is reset. Without this seemingly useless step, it returned incorrect values.
//    
//    auto computeNonZeros = [matrixRow] __cuda_callable__ ( IndexType i ) mutable
//    {
//        if( matrixRow.getElementValue( i ) != 0.0 )
//            elementCount++;
//    };
//    
//    ParallelFor< DeviceType >::exec( (IndexType) 0, matrixRow.getLength(), computeNonZeros );
//    
//    return elementCount;
}

template< typename Real,
@@ -195,7 +193,6 @@ Index CSR< Real, Device, Index >::getNonZeroRowLengthFast( const IndexType row )
   ConstMatrixRow matrixRow = this->getRow( row );
   return matrixRow.getNonZeroElementsCount();
}
#endif

template< typename Real,
          typename Device,
+0 −3
Original line number Diff line number Diff line
@@ -55,9 +55,6 @@ class SparseRow
      __cuda_callable__
      Index getLength() const;
      
//      __global__ 
//      void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result );
      
      __cuda_callable__
      Index getNonZeroElementsCount() const;

+4 −39
Original line number Diff line number Diff line
@@ -113,7 +113,8 @@ getLength() const
}

//template< typename MatrixRow >
//__global__ void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
//__global__ 
//void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
//{
//   int threadId = blockIdx.x * blockDim.x + threadIdx.x;
//   if( threadId == 0 )
@@ -130,52 +131,16 @@ getNonZeroElementsCount() const
{
    using NonConstIndex = typename std::remove_const< Index >::type;
    
    // If this is static, it will trigger a illegal memory address
    // How to get it into the lambda function?
    NonConstIndex elementCount ( 0 );
   
    
//    using CudaType = typename TNL::Devices::Cuda;
//    using HostType = typename TNL::Devices::Host;
//    
//    
//    // elementCount = 0; // Only if it is static. Make sure it is reset. Without this seemingly useless step, it returned incorrect values.
//    
//    // PROBLEM: Lambda function with __cuda_callable__ CANNOT pass values by reference!!
//    // INCORRECT ASSUMPTION!! PROBLEM: Lambda function which takes in anything via capture list, cannot return anything. (Maybe dont capture anything? pass this->values by parameter and return count?)
//        // WRONG: https://stackoverflow.com/questions/38835154/lambda-function-capture-a-variable-vs-return-value?fbclid=IwAR0ybDD83LRWxkJsrcoSmGW2mbsMfhywmdZQkleqyjU-NOIwqkz8woihfXs
//    auto computeNonZeros = [=] __cuda_callable__ ( NonConstIndex i /*, NonConstIndex *elementCount*/ ) mutable
//    {
//        //std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0/n";
//        if( this->values[ i * step ] != 0.0 )
//            elementCount++;//*elementCount++;
//        
//        //std::cout << "End of lambda elementCount = " << elementCount << "/n";
//        //return elementCount;
//    };
//    
//    
//    // Decide which ParallelFor will be executed, either Host or Cuda.
//    if( deviceType == TNL::String( "Devices::Host" ) )
//    {
//        ParallelFor< HostType >::exec( ( NonConstIndex ) 0, length, computeNonZeros /*, &elementCount*/ );
//    }
//    
//    else if( deviceType == TNL::String( "Cuda" ) )
//    {
//        ParallelFor< CudaType >::exec( ( NonConstIndex ) 0, length, computeNonZeros /*, &elementCount*/ );
//    }
   
    
//    // THE FOLLOWING doesn't work on GPU
    for( NonConstIndex i = 0; i < length; i++ )
    {
        std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0" << std::endl;
//        std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0" << std::endl;
        if( this->values[ i * step ] != 0.0 ) // Returns the same amount of elements in a row as does getRowLength() in ChunkedEllpack. WHY?
            elementCount++;
    }
    
     std::cout << "Element Count = " << elementCount << "\n";
//     std::cout << "Element Count = " << elementCount << "\n";
    
    return elementCount;
}