Sending/Receiving messages using MPI GPU
I have the simplest of MPI-GPU codes. I am trying to exchange messages of size 4
between two MPI processes using direct GPU-GPU communication. The code is below:
#include <cstdlib>
#include <cstdio>
#include "cuda.h"
#include "cuda_runtime.h"
#include "mpi.h"
#define MESSAGE_SIZE 4
int main(int argc, char **argv)
{
MPI_Init(&argc,&argv);
int nprocs,proc_id;
MPI_Comm_rank(MPI_COMM_WORLD,&proc_id);
MPI_Comm_rank(MPI_COMM_WORLD,&nprocs);
double * send_message = new double[MESSAGE_SIZE];
double * recv_message = new double[MESSAGE_SIZE];
for( int i = 0 ; i < MESSAGE_SIZE ; i++ ){
send_message[i] = (double)rand()*(proc_id+1) / (double)(RAND_MAX - 1);
}
fprintf(stdout,"Proc:%d,SendMessage:%lf,%lf,%lf,%lf\n",proc_id,send_message[0],send_message[1],send_message[2],send_message[3]);
double* send_message_gpu;
double* recv_message_gpu;
cudaMalloc(&send_message_gpu,sizeof(double)*MESSAGE_SIZE);
cudaMemcpy(send_message_gpu,send_message,sizeof(double)*MESSAGE_SIZE,cudaMemcpyHostToDevice);
cudaMalloc(&recv_message_gpu,sizeof(double)*MESSAGE_SIZE);
MPI_Request mpi_send_request;
MPI_Status mpi_send_status,mpi_recv_status;
MPI_Isend(send_message_gpu,MESSAGE_SIZE,MPI_DOUBLE,nprocs-proc_id-1,0,MPI_COMM_WORLD,&mpi_send_request);
MPI_Recv(recv_message_gpu,MESSAGE_SIZE,MPI_DOUBLE,nprocs-proc_id-1,0,MPI_COMM_WORLD,&mpi_recv_status);
MPI_Wait(&mpi_send_request,&mpi_send_status);
cudaDeviceSynchronize();
cudaMemcpy(recv_message,recv_message_gpu,sizeof(double)*MESSAGE_SIZE,cudaMemcpyDeviceToHost);
fprintf(stdout,"Proc:%d,RecvMessage:%lf,%lf,%lf,%lf\n",proc_id,recv_message[0],recv_message[1],recv_message[2],recv_message[3]);
cudaFree(recv_message_gpu);
cudaFree(send_message_gpu);
delete[] send_message;
delete[] recv_message;
return 0;
}
But when I run srun -n 2 ./trial.x (compiled using nvcc from CUDA 5.5 and
MVAPICH2 1.9), I get the following output:
Proc:0,SendMessage:0.840188,0.394383,0.783099,0.798440
Proc:0,RecvMessage:0.000000,0.000000,0.000000,0.000000
Proc:1,SendMessage:1.680375,0.788766,1.566198,1.596880
Proc:1,RecvMessage:0.000000,0.000000,0.000000,0.000000
The message is not received. What am I doing wrong here?
No comments:
Post a Comment