Commit c4ea70a8 authored by Matouš Fencl's avatar Matouš Fencl Committed by Jakub Klinkovský
Browse files

Added matrices for tests. Remade calling of computation as DDC. Made calling from terminal.

parent 23792f0f
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -161,7 +161,7 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix
  matrix.getData( pom );
  if( std::is_same< Device, Device2 >::value )
  {
    printf("copy host to host or device to device\n");
    //printf("copy host to host or device to device\n");
    this->data = pom; 
  }
#ifdef HAVE_CUDA
+212 −0
Original line number Diff line number Diff line

#include "Matrix/Matrix.h"
#include "gem/GEM.h"
#include "TNL/tnl-dev/src/TNL/Math.h"
//#include "gem/GEMdevice.h"
#include <TNL/Matrices/MatrixReader.h>

#include "TNL/tnl-dev/src/TNL/Cuda/CheckDevice.h"
#include "TNL/tnl-dev/src/TNL/Devices/Cuda.h"
#include "TNL/tnl-dev/src/TNL/Containers/Vector.h"
#include "TNL/tnl-dev/src/TNL/Cuda/MemoryHelpers.h"
#include "TNL/tnl-dev/src/TNL/Devices/Host.h"

#define COMPARE_RESULTS true;

// Prihodit GEMdeice do GEM a dělat dělení na device až tam -> přidat parametr pivoting

using namespace TNL;
using namespace TNL::Containers;
using namespace std;


template < typename Real, typename Index >
void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix,
        Vector< Real, Devices::Host, Index >& vector, const String& vectorName );

template < typename Real, typename Index >
void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName );


template < typename Real, typename Index, typename Device >
void readMatrixVector( Matrix< Real, Device, Index>& matrix, 
        Vector< Real, Device, Index >& vector,
        const String& matrixName,  const String& vectorName );


template < typename Real,
        typename Index,
        typename Device >
Vector< Real, Device, Index > runGEM( const String& matrixName, const String& vectorName, const int loops,
        const int verbose, const String& device, const String& pivoting )
{  
  typedef Matrix< Real, Device, Index > MatrixType;
  typedef Vector< Real, Device, Index > VectorType;
  
  MatrixType matrix;
  VectorType vector;
  
  readMatrixVector( matrix, vector, matrixName, vectorName );
  VectorType vectorResult( matrix.getNumRows() );
  
  // Computation
  double* time;
  time = new double[ loops ];
  
  std::cout << "Starting computation on " << device << endl;
  for( int i = 0; i < loops; i++ )
  {
    MatrixType matrixComp = matrix;
    VectorType vectorComp( vector );
    vectorComp.setValue( 0 );
    vectorResult.setValue( 0 );
    
    GEM< Real, Device, Index > gem( matrixComp, vectorComp );

    std::clock_t start;
    double duration;

    start = std::clock();
    
    cout << "starting computation number " << i+1 << endl;
    gem.solve( vectorResult, pivoting, 0 );

    duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
    time[i] = duration;
      
    double l2norm = 0;
    for( int j = 0; j < matrix.getNumRows(); j++ )
      l2norm += (vectorResult.getElement( i ) - 1)*(vectorResult.getElement( i ) - 1);
    l2norm = std::sqrt(l2norm);
    printf( " %.4f ", l2norm);
  }
  printf("\n ... done!\n");
  
  delete []time;
  
  return vectorResult;
  
 /*
//#ifdef HAVE_CUDA
  
  
  
  
#ifdef COMPARE_RESULTS
  double error = 0.0;
  //std::cout << "Results:\nrow:"<< setw(20) <<  "Host" << setw(20) <<  "Device" << setw(20) << "Error" << std::endl;
  for( int i = 0; i < matrix.getNumRows(); i++ )
  {
    double errorPom = ( result_vector.getElement(i) - result_vector_dev.getElement(i) ) *
            ( result_vector.getElement(i) - result_vector_dev.getElement(i) );
    //std::cout << i << ": " << setw(20) << result_vector.getElement(i) << setw(20) << result_vector_dev.getElement(i) << setw(20) << errorPom << std::endl;
    error += errorPom;
  }
  
  error = std::sqrt(error);
  printf("Difference in L2 norm from Device and Host is %.8f\n", error );
  
  double CPUmean(0), GPUmean(0);
  cout << "Timers: " << endl << "CPU: [ ";
  for( int i = 0; i < loops; i++ )
  {
    CPUmean += timeCPU[i];
    GPUmean += timeGPU[i];
    cout << timeCPU[i] << " ";
  }
  CPUmean = CPUmean/loops;
  GPUmean = GPUmean/loops;
  
  cout << "]" << endl;
  cout << "GPU: [ ";
  for( int i = 0; i < loops; i++ )
    cout << timeGPU[i] << " ";
  cout << "]" << endl;
  cout << "CPU mean time: " <<  CPUmean << endl;
  cout << "GPU mean time: " <<  GPUmean << endl;
#endif
  
#endif
  
  delete []timeCPU;
#ifdef HAVE_CUDA
  delete []timeGPU;
#endif
  return;*/
}

template < typename Real, typename Index, typename Device >
void readMatrixVector( Matrix< Real, Device, Index>& matrix, 
        Vector< Real, Device, Index >& vector,
        const String& matrixName,  const String& vectorName )
{
  typedef Matrix< Real, Devices::Host, Index> MatrixHost;
  MatrixHost matrixHost;
  // Get matrix
  Matrices::MatrixReader< MatrixHost > m;
  m.readMtxFile( "./test-matrices/" + matrixName, matrixHost );
  cout << "reading matrix " << matrixName << endl;
  //matrixHost.showMatrix();
  matrix = matrixHost;
  
  // Get vector
  Vector< Real, Devices::Host, Index > vectorHost( matrixHost.getNumRows() );
  
  if( vectorName == "none" )
    calculHostVecOne( matrixHost, vectorHost, vectorName );
  else
    readVector( vectorHost, vectorName );
  cout << "reading vector " << vectorName << endl;
  vector = vectorHost;
  
}

template < typename Real, typename Index >
void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix,
        Vector< Real, Devices::Host, Index >& vector, const String& vectorName )
{
  for( int i = 0; i < matrix.getNumRows(); i++ )
  {
    Real pom = 0;
    for( int j = 0; j < matrix.getNumColumns(); j++ )
    {
      pom += matrix.getElement(i,j);
    }
    vector[i] = pom;
  }
  cout << endl;
  
  ofstream outdata; // outdata is like cin
  
  outdata.open("./test-matrices/" + vectorName ); // opens the file
  if( !outdata ) { // file couldn't be opened
    cerr << "Error: file could not be opened" << endl;
    exit(1);
  }
  
  for( int i = 0; i < vector.getSize(); i++ )
  {
    outdata << vector[i] << endl;
  }
  outdata.close();
  cout << endl;
}

template < typename Real, typename Index >
void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName )
{
  ifstream inFile;
  Real x;
  inFile.open("./test-matrices/" + vectorName );
  if (!inFile) {
      cout << "Unable to open file" << endl;
      return;
  }

  int i = 0;
  while ( inFile >> x ) {
    vector_host[i] = x;
    i++;
  }
  inFile.close();
}
 No newline at end of file
+52 −0
Original line number Diff line number Diff line
@@ -9,26 +9,33 @@

#include <ostream>

template < typename Real = double,
           typename Device = TNL::Devices::Host,
           typename Index = int >
template < typename Device >
class GEMDeviceDependentCode;

template < typename Real,
           typename Device,
           typename Index >
class GEM
{
  public:
    
    typedef Device DeviceType;
    typedef Matrix< Real, Device, Index > MatrixGEM;
    typedef TNL::Containers::Array< Real, Device, Index > Array;
    typedef TNL::Containers::Vector< Real, Device, Index > Array;
    
    
    GEM( MatrixGEM& A,
         Array& b );
    
    bool solve( Array& x, int verbose = 0 );
    bool solve( Array& x, const TNL::String& pivoting, int verbose = 0 );
    
    bool solveWithoutPivoting( Array& x, int verbose = 0 );
    
    bool solveWithPivoting( Array& x, int verbose = 0 );
    
    bool computeLUDecomposition( int verbose = 0 );
    
    bool GEMdevice( Array& x, const TNL::String& pivoting, int verbose );
    
  protected:

    void print( std::ostream& str = std::cout ) const;
@@ -36,6 +43,9 @@ class GEM
    MatrixGEM A;

    Array b;
    
    typedef GEMDeviceDependentCode< DeviceType > DeviceDependentCode;
    friend class GEMDeviceDependentCode< DeviceType >;
};

#include "GEM_impl.h"
+44 −29
Original line number Diff line number Diff line
@@ -13,6 +13,8 @@
#pragma once

#include <TNL/Assert.h>
#include "GEMdevice.h"
#include "GEM.h"
#include <fstream>
  
template< typename Real,
@@ -22,11 +24,20 @@ GEM< Real, Device, Index >::GEM( MatrixGEM& A, Array& b )
: A(A), b(b)
{}

template< typename Real,
          typename Device,
          typename Index >
bool GEM< Real, Device, Index >::solve( Array& x, const TNL::String& pivoting, int verbose )
{
  printf("in solve\n");
  return DeviceDependentCode::SolveGEM( *this, x, pivoting, verbose  );
}


template< typename Real,
          typename Device,
          typename Index >
bool GEM< Real, Device, Index >::solve( Array& x, int verbose )
bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose )
{
   assert( b.getSize() == x.getSize() );
   
@@ -40,6 +51,7 @@ bool GEM< Real, Device, Index >::solve( Array& x, int verbose )
      /****
       * Divide the k-th row by pivot
       */
      if( verbose > 1 )
        if( k % 10 == 0 )
          std::cout << "Elimination: " << k << "/" << n << std::endl;
      const Real& pivot = this->A.getElement( k, k );
@@ -106,7 +118,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose )
   for( int k = 0; k < n; k++ )
   {
      if( verbose > 1 )
        std::cout << "Step " << k << "/" << n << ".... \r";
        std::cout << "Step " << k << "/" << n << "....\n"; //"\r";
      /****
       * Find the pivot - the largest in k-th row
       */
@@ -255,35 +267,38 @@ void GEM< Real, Device, Index >::print( std::ostream& str ) const
      str << " | " << std::setprecision( precision ) << std::setw( precision + 6 ) << b[ row ] << " |" << std::endl;
   }
}
#ifdef HAVE_CUDA
/*template < typename Real, 
           typename Index >*/
__global__ void GEMKernel( /*Matrix< Real, Devices::Cuda, Index >& matrix,
                      Containers::Array< Real, Devices::Cuda, Index >& b, 
                      Containers::Array< Real, Devices::Cuda, Index >& x, 
                      int verbose*/ )

template <>
class GEMDeviceDependentCode< TNL::Devices::Host >
{
  public:
    typedef TNL::Devices::Host DeviceType;
    
    template < typename Real, typename Index>
    static bool SolveGEM( GEM< Real, DeviceType, Index>& gem,
                   TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose )
    {
  unsigned int i = threadIdx.x;// + blockDim.x * blockIdx.x;
  printf( "%d\n" , i );
      printf("starting computation on CPU\n");
      return pivoting == "yes" ? gem.solveWithPivoting( x, verbose ) : gem.solveWithoutPivoting(x, verbose );
    }
};

#endif

template <>
class GEMDeviceDependentCode< TNL::Devices::Cuda >
{
  public:
    typedef TNL::Devices::Cuda DeviceType;
    
/*template< typename Real,
          typename Device,
          typename Index >
bool Matrix< Real, Device, Index >::solveCuda( Array& b, Array& x, int verbose )
    template < typename Real, typename Index>
    static bool SolveGEM( GEM< Real, DeviceType, Index>& gem,
                   TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose )
    {
#ifdef HAVE_CUDA
  Index numRows = this->A.getNumRows();
  Index numColumns = this->A.getNumColumns();
  Index numThreads = 32;
  Index numBlocks = roundUpDivision( numRows, numThreads );
  // TODO: num of blocks needed higher than num of blocks able to calculate
  Devices::Cuda::synchronizeDevice();
  GEMKernel<<< 1, 1 >>>( this, b, x, verbose );
  cudaDeviceSynchronize();
  TNL_CHECK_CUDA_DEVICE;
      printf("starting computation on GPU\n");
      gem.GEMdevice( x, pivoting, verbose );
#endif
}*/
      return true;
    }
};
+44 −53
Original line number Diff line number Diff line
#include "GEMkernels.h"
#include "TNL/Cuda/MemoryHelpers.h"
#define DEBUG 0



#ifdef HAVE_CUDA
#include "GEMkernels.h"

template < typename Real,
        typename Index >
void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev,
void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
                TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, 
                TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& result_vector_dev )
                TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& x )
{ 
  Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrixDev);
  Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrix);
  
  int blockSize = matrixDev.getNumRows() > 1024 ? 1024 : matrixDev.getNumColumns();
  int numBlocksOnColumn = TNL::roundUpDivision( matrixDev.getNumRows(), 1024 );
  int numOfBlocks =  matrixDev.getNumRows() * numBlocksOnColumn;
  int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumColumns();
  int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 1024 );
  int numOfBlocks =  matrix.getNumRows() * numBlocksOnColumn;
    
  
  GEMDiagToResult<<< numOfBlocks, blockSize >>>( devMat,device_vector.getView(),result_vector_dev.getView() );
  GEMDiagToResult<<< numOfBlocks, blockSize >>>( devMat,device_vector.getView(), x.getView() );
  cudaDeviceSynchronize();
  TNL_CHECK_CUDA_DEVICE;
} 
@@ -27,28 +27,27 @@ void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev







template < typename Real,
        typename Device,
        typename Index >
void GEMdevice( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev, 
                TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, 
                TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& result_vector_dev )
bool GEM<Real, Device, Index >::GEMdevice( /*Matrix< Real, TNL::Devices::Cuda, Index >& this->A, 
                TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, */
                Array& x, const TNL::String& pivoting, int verbose )
{
  Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrixDev);
    
  Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( this->A );
  TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector( this->b );
  // FOR PIVOTING SET VARIABLES ON DEVICE
  size_t size = sizeof(int);
  int* pivot; cudaMalloc(&pivot, size);
  
  
  for( int colPointer = 0; colPointer < matrixDev.getNumColumns(); colPointer++ )
  for( int colPointer = 0; colPointer < this->A.getNumColumns(); colPointer++ )
  {
    int reduceBlockSize = (matrixDev.getNumColumns()-colPointer) > 512 ? 512 : 
      TNL::roundToMultiple( matrixDev.getNumColumns()-colPointer, 32 );  
    int reduceGridSize = TNL::roundUpDivision( matrixDev.getNumColumns()-colPointer, reduceBlockSize );
    
    // PIVOTING
    int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 512 ? 512 : 
      TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 );  
    int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize );
    int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 );
    
    TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSizeRound);
@@ -68,63 +67,55 @@ void GEMdevice( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev,
    //printf("%d: position of pivot: %d\n", colPointer, *pom);
    
    
    int blockSize = (matrixDev.getNumColumns()-colPointer) > 1024 ? 1024 : matrixDev.getNumColumns();
    int numBlocksOnRow = TNL::roundUpDivision( (matrixDev.getNumColumns()-colPointer), 1024 );
    int numOfBlocks =  matrixDev.getNumRows() * numBlocksOnRow;
    int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
    int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
    int numOfBlocks =  this->A.getNumRows() * numBlocksOnRow;
    //printf( "%d number of threads, %d number of blocks\n", blockSize, numOfBlocks);
    
    if( *pom != -1 && *pom != colPointer )
    {
      swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot );
    }
    
    /*showMatrix<<< 1, 1 >>>( matrixDev );
      cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;*/
      TNL_CHECK_CUDA_DEVICE;
    } 
    
    /*int* pom = (int*)malloc(size); *pom = 0;
    cudaMemcpy(pivot, pom, size, cudaMemcpyHostToDevice);
    findPivot<<< numBlocksOnRow, 1024 >>>( devMat, colPointer, numBlocksOnRow  );
    /*printf("\n");
    showMatrix<<< 1, 1 >>>( this->A );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    printf("\n");*/
    
    findRowPivot<<< numBlocksOnRow, blockSize >>>( devMat, colPointer, numBlocksOnRow, d_max, pivot );
    GEMColumnUnderDiag<<< numOfBlocks, blockSize >>>( devMat, 
                                                      device_vector.getView(), 
                                                      colPointer, 
                                                      numBlocksOnRow );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    
    
    cudaMemcpy( pom, pivot, size, cudaMemcpyDeviceToHost);
    //printf("%d: position of pivot: %d\n", colPointer, *pom);
    if( *pom != -1 && *pom != colPointer )
    {
      swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot );
    }*/
    
    
    /*cudaMemcpy( pom, d_max, size, cudaMemcpyDeviceToHost);
    printf("%d\n", *pom );*/
    
    
    
    GEMColumnUnderDiag<<< numOfBlocks, blockSize >>>( devMat, 
    /*int blockSize1 = this->A.getNumRows() > 1024 ? 1024 : this->A.getNumRows();
    int gridSize1 = TNL::roundUpDivision( this->A.getNumRows(), blockSize1 );
    GEMZeroingMainColumn<<< gridSize1, blockSize1 >>>(devMat, 
                                                      device_vector.getView(), 
                                                      colPointer, 
                                                      numBlocksOnRow );
                                                      colPointer );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    /*printf("\n");
    showMatrix<<< 1, 1 >>>( matrixDev );
    
    printf("\n");
    showMatrix<<< 1, 1 >>>( this->A );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    printf("\n");*/
    printf("\n");
    std::cout << device_vector << std::endl;*/
  }
  
  cudaFree(pivot);
  //std::cout << device_vector << std::endl;
  
  calculateResultSeqCPU( matrixDev, device_vector, result_vector_dev );
  
  calculateResultSeqCPU( this->A, device_vector, x );
  
  return true;
}

#endif // HAVE_CUDA
 No newline at end of file
Loading