Commit fd737b29 authored by Matouš Fencl's avatar Matouš Fencl Committed by Jakub Klinkovský
Browse files

Table of calculation added, print and verbose done.

parent c4ea70a8
Loading
Loading
Loading
Loading
+12 −0
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@ class Matrix
    void setDimensions( const Index rows, const Index columns );
    
    void setCompressedRowLengths( CompressedRowLengthsVector& rowLengths ){ rowLengths.setValue(this->getNumberOfMatrixElements());}
    
    IndexType getNumberOfMatrixElements(){return this->getNumRows() * this->getNumColumns();}
    /**
    * Classic setElement method.
@@ -85,6 +86,17 @@ class Matrix
    __cuda_callable__
    Index getNumColumns() const{return this->columns;}
   
    Index getNumNonzeros() { 
      Index nonzeros = 0;
      Matrix< Real, TNL::Devices::Host, Index> m;
      m = *this;
      for( int i = 0; i < m.getNumRows(); i++ )
        for( int j = 0; j < m.getNumColumns(); j++ ) 
          if( m.getElement(i,j) != 0 )
            nonzeros++;
      return nonzeros;
    }
    
    /**
    * Function for switching 2 rows in matrix from column further. E.g. in matrix 5 x 5
    * switch rows row1 = 1 and row2 = 2 from column = 1 to columns = 5. 
+44 −68
Original line number Diff line number Diff line
@@ -2,7 +2,7 @@
#include "Matrix/Matrix.h"
#include "gem/GEM.h"
#include "TNL/tnl-dev/src/TNL/Math.h"
//#include "gem/GEMdevice.h"
#include <typeinfo> // type printf
#include <TNL/Matrices/MatrixReader.h>

#include "TNL/tnl-dev/src/TNL/Cuda/CheckDevice.h"
@@ -31,7 +31,7 @@ void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String
template < typename Real, typename Index, typename Device >
void readMatrixVector( Matrix< Real, Device, Index>& matrix, 
        Vector< Real, Device, Index >& vector,
        const String& matrixName,  const String& vectorName );
        const String& matrixName,  const String& vectorName, const int verbose );


template < typename Real,
@@ -46,19 +46,20 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
  MatrixType matrix;
  VectorType vector;
  
  readMatrixVector( matrix, vector, matrixName, vectorName );
  readMatrixVector( matrix, vector, matrixName, vectorName, verbose );
  VectorType vectorResult( matrix.getNumRows() );
  
  // Computation
  double* time;
  time = new double[ loops ];
  double error = -1;
  
  std::cout << "Starting computation on " << device << endl;
  if( verbose > 1 )
    cout << "Starting computation on " << device << endl;
  for( int i = 0; i < loops; i++ )
  {
    MatrixType matrixComp = matrix;
    VectorType vectorComp( vector );
    vectorComp.setValue( 0 );
    vectorResult.setValue( 0 );
    
    GEM< Real, Device, Index > gem( matrixComp, vectorComp );
@@ -68,85 +69,56 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve

    start = std::clock();
    
    if( verbose > 1 )
      cout << "starting computation number " << i+1 << endl;
    gem.solve( vectorResult, pivoting, 0 );
    gem.solve( vectorResult, pivoting, verbose );

    duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
    time[i] = duration;
      
    if( vectorName == "none" )
    {
      double l2norm = 0;
    for( int j = 0; j < matrix.getNumRows(); j++ )
      l2norm += (vectorResult.getElement( i ) - 1)*(vectorResult.getElement( i ) - 1);
      for( int j = 0; j < vectorResult.getSize(); j++ )
        l2norm += (vectorResult.getElement( j ) - 1)*(vectorResult.getElement( j ) - 1);
      l2norm = std::sqrt(l2norm);
    printf( " %.4f ", l2norm);
      error = l2norm;

      if( verbose > 1 )
        printf( "Norm in %d calculation is %.4f \n", i+1, l2norm);
    }
  }
  if( verbose > 1 )
    printf("\n ... done!\n");
  
  delete []time;
  
  return vectorResult;
  
 /*
//#ifdef HAVE_CUDA
  
  
  
  
#ifdef COMPARE_RESULTS
  double error = 0.0;
  //std::cout << "Results:\nrow:"<< setw(20) <<  "Host" << setw(20) <<  "Device" << setw(20) << "Error" << std::endl;
  for( int i = 0; i < matrix.getNumRows(); i++ )
  {
    double errorPom = ( result_vector.getElement(i) - result_vector_dev.getElement(i) ) *
            ( result_vector.getElement(i) - result_vector_dev.getElement(i) );
    //std::cout << i << ": " << setw(20) << result_vector.getElement(i) << setw(20) << result_vector_dev.getElement(i) << setw(20) << errorPom << std::endl;
    error += errorPom;
  }
  double timeMean = 0;
  for( int i = 0; i < loops; i++ )
    timeMean += time[i];
  timeMean /= loops;
    
  error = std::sqrt(error);
  printf("Difference in L2 norm from Device and Host is %.8f\n", error );
   printf("%20s %15d %15d %20s %15s %15s %10d %15.5f %15.5f\n", matrixName.c_str(),
          matrix.getNumRows(), matrix.getNumNonzeros(), vectorName == "none" ? "-":vectorName.c_str(),
          device.c_str(), typeid(Real).name() == (string)"f" ? "float":"double", loops, timeMean, error);
  
  double CPUmean(0), GPUmean(0);
  cout << "Timers: " << endl << "CPU: [ ";
  for( int i = 0; i < loops; i++ )
  {
    CPUmean += timeCPU[i];
    GPUmean += timeGPU[i];
    cout << timeCPU[i] << " ";
  }
  CPUmean = CPUmean/loops;
  GPUmean = GPUmean/loops;
  delete []time;
  
  cout << "]" << endl;
  cout << "GPU: [ ";
  for( int i = 0; i < loops; i++ )
    cout << timeGPU[i] << " ";
  cout << "]" << endl;
  cout << "CPU mean time: " <<  CPUmean << endl;
  cout << "GPU mean time: " <<  GPUmean << endl;
#endif
  
#endif
  
  delete []timeCPU;
#ifdef HAVE_CUDA
  delete []timeGPU;
#endif
  return;*/
  return vectorResult;
}

template < typename Real, typename Index, typename Device >
void readMatrixVector( Matrix< Real, Device, Index>& matrix, 
        Vector< Real, Device, Index >& vector,
        const String& matrixName,  const String& vectorName )
        const String& matrixName,  const String& vectorName, const int verbose )
{
  typedef Matrix< Real, Devices::Host, Index> MatrixHost;
  MatrixHost matrixHost;
  // Get matrix
  Matrices::MatrixReader< MatrixHost > m;
  m.readMtxFile( "./test-matrices/" + matrixName, matrixHost );
  if( verbose > 1 )
    cout << "reading matrix " << matrixName << endl;
  //matrixHost.showMatrix();
  if( verbose > 2 )
    matrixHost.showMatrix();
  matrix = matrixHost;
  
  // Get vector
@@ -155,8 +127,13 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix,
  if( vectorName == "none" )
    calculHostVecOne( matrixHost, vectorHost, vectorName );
  else
  {
    readVector( vectorHost, vectorName );
    if( verbose > 1 )
      cout << "reading vector " << vectorName << endl;
  }
  if( verbose > 2 )
    cout << vectorHost << endl;
  vector = vectorHost;
  
}
@@ -174,9 +151,8 @@ void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix,
    }
    vector[i] = pom;
  }
  cout << endl;
  
  ofstream outdata; // outdata is like cin
  /*ofstream outdata; // outdata is like cin
  
  outdata.open("./test-matrices/" + vectorName ); // opens the file
  if( !outdata ) { // file couldn't be opened
@@ -189,7 +165,7 @@ void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix,
    outdata << vector[i] << endl;
  }
  outdata.close();
  cout << endl;
  cout << endl;*/
}

template < typename Real, typename Index >
+13 −12
Original line number Diff line number Diff line
@@ -29,7 +29,6 @@ template< typename Real,
          typename Index >
bool GEM< Real, Device, Index >::solve( Array& x, const TNL::String& pivoting, int verbose )
{
  printf("in solve\n");
  return DeviceDependentCode::SolveGEM( *this, x, pivoting, verbose  );
}

@@ -43,7 +42,7 @@ bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose )
   
   const int n = this->A.getNumRows();

   if( verbose > 1 )
   if( verbose > 2 )
      this->print();

   for( int k = 0; k < n; k++ )
@@ -65,7 +64,7 @@ bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose )
         this->A.setElement( k, j, this->A.getElement( k, j )/pivot );
      this->A.setElement( k, k, 1.0 );
      
      if( verbose > 1 )
      if( verbose > 2 )
      {
         std::cout << "Dividing by the pivot ... " << std::endl;
         this->print();
@@ -82,7 +81,7 @@ bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose )
         this->A.setElement( i, k, 0.0 );
      }

      if( verbose > 1 )
      if( verbose > 2 )
      {
         std::cout << "Subtracting the " << k << "-th row from the rows bellow ... " << std::endl;
         this->print();
@@ -112,13 +111,13 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose )
   
   const int n = this->A.getNumRows();

   if( verbose > 1 )
   if( verbose > 2 )
      this->print();

   for( int k = 0; k < n; k++ )
   {
      if( verbose > 1 )
        std::cout << "Step " << k << "/" << n << "....\n"; //"\r";
        std::cout << "Step " << k << "/" << n << "....\n"; 
      /****
       * Find the pivot - the largest in k-th row
       */
@@ -142,11 +141,11 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose )
         b[ pivotPosition ] = pom;
      }
      
      if( verbose > 0 )
      if( verbose > 1 )
      {
         std::cout << std::endl;
         std::cout << "Choosing element at " << pivotPosition << "-th row as pivot..." << std::endl;
         std::cout << "Swaping " << k << "-th and " << pivotPosition <<  "-th rows ... " << std::endl;
         std::cout << "Swapping " << k << "-th and " << pivotPosition <<  "-th rows ... " << std::endl;
      }
            
      /****
@@ -163,7 +162,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose )
         this->A.setElement( k, j, this->A.getElement( k, j)/pivot );
      this->A.setElement( k, k, 1.0 );
      
      if( verbose > 1 )
      if( verbose > 2 )
      {
         std::cout << "Dividing by the pivot ... " << std::endl;
         this->print();
@@ -180,7 +179,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose )
         this->A.setElement( i, k , 0.0 );
      }

      if( verbose > 1 )
      if( verbose > 2 )
      {
         std::cout << "Subtracting the " << k << "-th row from the rows bellow ... " << std::endl;
         this->print();
@@ -278,7 +277,8 @@ class GEMDeviceDependentCode< TNL::Devices::Host >
    static bool SolveGEM( GEM< Real, DeviceType, Index>& gem,
                   TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose )
    {
      printf("starting computation on CPU\n");
      if( verbose > 1 )
        printf("Starting the computation SolveGEM.\n");
      return pivoting == "yes" ? gem.solveWithPivoting( x, verbose ) : gem.solveWithoutPivoting(x, verbose );
    }
};
@@ -295,7 +295,8 @@ class GEMDeviceDependentCode< TNL::Devices::Cuda >
                   TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose )
    {
#ifdef HAVE_CUDA
      printf("starting computation on GPU\n");
      if( verbose > 1 )
        printf("Starting the computation SolveGEM.\n");
      gem.GEMdevice( x, pivoting, verbose );
#endif
      return true;
+50 −56
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@

template < typename Real,
        typename Index >
void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
                TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, 
                TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& x )
{ 
@@ -30,20 +30,32 @@ void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
template < typename Real,
        typename Device,
        typename Index >
bool GEM<Real, Device, Index >::GEMdevice( /*Matrix< Real, TNL::Devices::Cuda, Index >& this->A, 
                TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, */
                Array& x, const TNL::String& pivoting, int verbose )
bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting, int verbose )
{
  Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( this->A );
  TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector( this->b );
  
  // FOR PIVOTING SET VARIABLES ON DEVICE
  size_t size = sizeof(int);
  int* pivot; cudaMalloc(&pivot, size);
  int* pom = (int*)malloc(size); *pom = -1;
  
  if( verbose > 2 )
  {
    showMatrix<<< 1, 1 >>>( this->A );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    printf("\n");
  }
  
  
  for( int colPointer = 0; colPointer < this->A.getNumColumns(); colPointer++ )
  {
    if( verbose > 1 )
      printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() );
    
    if( pivoting == "yes" )
    {
      // PIVOTING
      int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 512 ? 512 : 
        TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 );  
@@ -61,59 +73,41 @@ bool GEM<Real, Device, Index >::GEMdevice( /*Matrix< Real, TNL::Devices::Cuda, I
      findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot );
      cudaDeviceSynchronize();
      TNL_CHECK_CUDA_DEVICE;
    
    int* pom = (int*)malloc(size); *pom = 0;
      *pom = 0;
      cudaMemcpy( pom, pivot, size, cudaMemcpyDeviceToHost);
    //printf("%d: position of pivot: %d\n", colPointer, *pom);
    }
    
    
    int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
    int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
    int numOfBlocks =  this->A.getNumRows() * numBlocksOnRow;
    //printf( "%d number of threads, %d number of blocks\n", blockSize, numOfBlocks);
    
    if( *pom != -1 && *pom != colPointer )
    
    if( pivoting == "yes" && *pom != -1 && *pom != colPointer )
    {
      if( verbose > 1 )
      {
         std::cout << std::endl;
         std::cout << "Choosing element at " << *pom << "-th row as pivot..." << std::endl;
         std::cout << "Swapping " << colPointer << "-th and " << *pom <<  "-th rows ... " << std::endl;
      }
      swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot );
      cudaDeviceSynchronize();
      TNL_CHECK_CUDA_DEVICE;
    } 
    
    /*printf("\n");
    showMatrix<<< 1, 1 >>>( this->A );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    printf("\n");*/
    
    GEMColumnUnderDiag<<< numOfBlocks, blockSize >>>( devMat, 
    GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, 
                                                  device_vector.getView(), 
                                                  colPointer, 
                                                  numBlocksOnRow );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    
    
    
    /*int blockSize1 = this->A.getNumRows() > 1024 ? 1024 : this->A.getNumRows();
    int gridSize1 = TNL::roundUpDivision( this->A.getNumRows(), blockSize1 );
    GEMZeroingMainColumn<<< gridSize1, blockSize1 >>>(devMat, 
                                                      device_vector.getView(), 
                                                      colPointer );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    
    printf("\n");
    showMatrix<<< 1, 1 >>>( this->A );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    printf("\n");
    std::cout << device_vector << std::endl;*/
  }
  
  cudaFree(pivot);
  //std::cout << device_vector << std::endl;
  free(pom);
  
  calculateResultSeqCPU( this->A, device_vector, x );
  calculResultVector( this->A, device_vector, x );
  
  return true;
}
+1 −23
Original line number Diff line number Diff line
@@ -6,22 +6,18 @@

template <typename Real >
__inline__ __device__ void warpReduceArgMax(Real& val, int& index) {
  //printf( "index = %d\n", index );
  __syncthreads();
  for (int offset = 32/2; offset > 0; offset /= 2) 
  { 
    Real val1 = __shfl_down_sync( 0xffffffff, val, offset, 32);
    int index1 = __shfl_down_sync( 0xffffffff, index, offset, 32);
    __syncthreads();
    //printf("%d: firstElementInRow = %.4f, %d: val1 = %.4f\n", index, val, index1, val1 );
    if( TNL::abs( val1 )  - TNL::abs( val ) > 0 )
    {
      val = val1;
      index = index1;
      //printf("%d: %.4f\n", index, firstElementInRow  );
    }
    __syncthreads();
    //printf("%d: firstElementInRow = %.4f\n", index, val );
  } 
}

@@ -150,32 +146,15 @@ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A,
  }
}

/*template <typename Real >
__global__ 
void GEMZeroingMainColumn( Matrix< Real, TNL::Devices::Cuda, int >* A,
        TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, 
        int colPointerMain )
{
  //int rowPointer = threadIdx.x + blockDim.x * blockIdx.x;
  //if( rowPointer < A->getNumRows() && rowPointer != colPointerMain )
  //  A->setElement( rowPointer, colPointerMain, 0.0 );
    A->setElement( colPointerMain, colPointer, A->getElement( colPointerMain, colPointer ) / A->getElement( colPointerMain, colPointerMain ) );
  if( colPointer == colPointerMain+1 )
    b[ colPointerMain ] /=  A->getElement( colPointerMain, colPointerMain );
}*/



template <typename Real >
__global__ 
void GEMColumnUnderDiag( Matrix< Real, TNL::Devices::Cuda, int >* A,
void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A,
        TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, 
        int colPointerMain, int numBlocksOnRow )
{
  int rowPointer = blockIdx.x / numBlocksOnRow;
  int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
  /*if( rowPointer == colPointerMain && colPointer == colPointerMain )
    A->setElement( colPointerMain, colPointerMain, 1 );*/
  if( colPointer > colPointerMain && colPointer < A->getNumColumns() && rowPointer != colPointerMain && rowPointer < A->getNumRows() )
  {
    if( A->getElement( colPointerMain, colPointerMain ) != 0 )
@@ -192,7 +171,6 @@ void GEMColumnUnderDiag( Matrix< Real, TNL::Devices::Cuda, int >* A,
  if( rowPointer != colPointerMain && threadIdx.x == 0 && blockIdx.x % numBlocksOnRow == 0 && A->getElement( colPointerMain, colPointerMain ) != 0 && A->getElement( rowPointer, colPointerMain ) != 0  )
  {
    b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * b[ colPointerMain ] / A->getElement( colPointerMain, colPointerMain );
    //A->setElement( rowPointer, colPointerMain, 0.0 );
  }
}

Loading