Commit 23d906d6 authored by Vít Hanousek's avatar Vít Hanousek
Browse files

Add a simple CUDA-aware MPI copy sample program to test the build system.

parent 63c9f306
Loading
Loading
Loading
Loading
+7 −3
Original line number Diff line number Diff line
@@ -92,15 +92,19 @@ endif()
# When the C++ compiler is the MPI wrapper "mpic++", define HAVE_MPI for the
# C++ sources and set CUDA_HOST_COMPILER to the same wrapper.
# The variable expansion is quoted so that an empty or unset
# CXX_COMPILER_NAME compares as an empty string instead of producing a
# malformed if() expression.
if( "${CXX_COMPILER_NAME}" STREQUAL "mpic++" )
   message( "MPI compiler detected."    )
   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" )
   set( CUDA_HOST_COMPILER "mpic++" )
endif()

####
# Check for MPI
# Check for MPI -- not working
#
#find_package( MPI )
#if( MPI_CXX_FOUND )
   # set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" )
   # message( "MPI headers found -- ${MPI_CXX_INCLUDE_PATH}")
   # message( "MPI link flags  -- ${MPI_CXX_LINK_FLAGS}")
   # message( "MPI libraries-- ${MPI_CXX_LIBRARIES}")
#endif()

#####
+5 −0
Original line number Diff line number Diff line
@@ -28,3 +28,8 @@
	#TARGET_COMPILE_DEFINITIONS( tnlMeshFuncttionEvaluateTestXY PUBLIC "-DDIMENSION=3" )
	#TARGET_COMPILE_DEFINITIONS( tnlMeshFuncttionEvaluateTestXY PUBLIC "-DXDISTR -DYDISTR" )

# Build the CUDA + MPI sample executable only when CUDA support is enabled.
# The target is currently built without any explicit link libraries.
if( BUILD_CUDA )
    cuda_add_executable( mpi-gpu-test
                         ${headers}
                         mpi-gpu.cu )
    # Linking is intentionally left disabled for now:
    # target_link_libraries( mpi-gpu-test ${CPPUNIT_LIBRARIES} tnl )
endif()

tests/mpi/mpi-gpu.cu

0 → 100644
+79 −0
Original line number Diff line number Diff line
#include <iostream>

using namespace std;


#if defined(HAVE_MPI) && defined(HAVE_CUDA)

#include <cuda_runtime.h>
#include <mpi.h>
 
// Fill kernel: every thread stores `value` into the element of `deviceData`
// addressed by its global 1D thread index.
//
// Launch layout: 1D grid of 1D blocks. There is no bounds check, so the
// launch must not create more threads (gridDim.x * blockDim.x) than
// `deviceData` has elements.
__global__ void SetKernel(float *deviceData, float value)
{
    const int globalIndex = threadIdx.x + blockDim.x * blockIdx.x;
    deviceData[globalIndex] = value;
}

// Adds up the first `count` floats of `data` on the host, in index order,
// accumulating in double precision. Returns 0 when `count` is not positive.
double sum(float * data, int count)
{
    double total = 0;
    int i = 0;
    while (i < count)
    {
        total += data[i];
        ++i;
    }
    return total;
}


// Aborts the whole MPI job with a diagnostic if a CUDA runtime call failed.
// `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        cerr << "CUDA error in " << what << ": "
             << cudaGetErrorString(status) << endl;
        // Abort every rank so that a peer blocked in MPI_Send/MPI_Recv
        // does not hang forever.
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
}

// CUDA-aware MPI smoke test.
//
// Rank 0 fills a device buffer with 1.0f and sends the device pointer
// directly through MPI_Send; rank 1 receives into its own device buffer,
// copies it to the host and prints the sum of all elements. Any further
// ranks only allocate and free their buffer.
//
// Returns 0 on success, 1 when fewer than two MPI processes were started.
int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // The test exchanges data between ranks 0 and 1, so both must exist.
    int worldSize = 0;
    MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
    if (worldSize < 2)
    {
        cerr << rank << ": " << "This test needs at least 2 MPI processes." << endl;
        MPI_Finalize();
        return 1;
    }

    const int blockSize = 256; // number of threads per block
    const int gridSize = 1000; // number of blocks in the grid -> must be below maxGridSize reported by cudaGetDeviceProperties

    // The launch covers the buffer exactly: one thread per element.
    const int dataCount = blockSize * gridSize;

    float *deviceData = NULL;
    checkCuda(cudaMalloc((void **)&deviceData, dataCount * sizeof(float)), "cudaMalloc");

    if (rank == 0)
    {
        cout << rank << ": " << "Setup GPU alocated array to 1" << endl;
        SetKernel<<< gridSize, blockSize >>>(deviceData, 1.0f);
        // Launch-configuration errors are only visible via cudaGetLastError.
        checkCuda(cudaGetLastError(), "SetKernel launch");
        // The launch is asynchronous; the kernel must have finished before
        // the device buffer is handed to MPI.
        checkCuda(cudaDeviceSynchronize(), "SetKernel execution");

        cout << rank << ": " << " Sending GPU data " << endl;
        MPI_Send((void *)deviceData, dataCount, MPI_FLOAT, 1, 1, MPI_COMM_WORLD);
    }

    if (rank == 1)
    {
        cout << rank << ": " << " Reciving GPU data " << endl;
        MPI_Recv((void *)deviceData, dataCount, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

        float *data = new float[dataCount];
        cout << rank << ": " << " Copying data from GPU to CPU " << endl;
        checkCuda(cudaMemcpy((void *)data, (void *)deviceData, dataCount * sizeof(float), cudaMemcpyDeviceToHost),
                  "cudaMemcpy (device to host)");
        cout << rank << ": " << " Computin Sum on CPU " << endl;
        cout << rank << ": " << "sum:" << sum(data, dataCount) << endl;
        delete [] data;
    }

    checkCuda(cudaFree(deviceData), "cudaFree");

    MPI_Finalize();
    return 0;
}

#else

// Fallback entry point used when the program is built without both
// HAVE_MPI and HAVE_CUDA: it only reports that a prerequisite is missing.
int main()
{
    std::cout << "CUDA or MPI missing...." << std::endl;
    return 0;
}

#endif