[mpich-commits] [mpich] MPICH primary repository branch, mpich-master, created. v3.2a2-163-gd9c15cf

Fri Feb 13 14:58:26 CST 2015

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, mpich-master has been created
        at  d9c15cf3a93674c129d94ecb5b2e2b13bc186ff9 (commit)

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/d9c15cf3a93674c129d94ecb5b2e2b13bc186ff9

commit d9c15cf3a93674c129d94ecb5b2e2b13bc186ff9
Author: Sameh Sharkawi <sssharka at us.ibm.com>
Date:   Wed Jan 21 13:30:50 2015 -0500

    PAMID: Initial CUDA support
    
    This is an initial limited implementation for CUDA support. This is not
    performance optimized and only for testing.
    
    (ibm) D202477
    
    Signed-off-by: Su Huang <suhuang at us.ibm.com>

diff --git a/src/mpid/pamid/include/mpidi_datatypes.h b/src/mpid/pamid/include/mpidi_datatypes.h
index 7c6e566..1324187 100644
--- a/src/mpid/pamid/include/mpidi_datatypes.h
+++ b/src/mpid/pamid/include/mpidi_datatypes.h
@@ -38,6 +38,11 @@
 
 #include "opa_primitives.h"
 
+#if CUDA_AWARE_SUPPORT
+#include <cuda_runtime_api.h>
+#endif
+
+
 #if (MPIU_HANDLE_ALLOCATION_METHOD == MPIU_HANDLE_ALLOCATION_THREAD_LOCAL) && defined(__BGQ__)
 struct MPID_Request;
 typedef struct
@@ -105,6 +110,11 @@ typedef struct
 #if QUEUE_BINARY_SEARCH_SUPPORT
   unsigned queue_binary_search_support_on;
 #endif
+
+#if CUDA_AWARE_SUPPORT
+  unsigned cuda_aware_support_on;
+#endif
+
   unsigned verbose;        /**< The current level of verbosity for end-of-job stats. */
   unsigned statistics;     /**< The current level of stats collection.               */
   unsigned rma_pending;    /**< The max num outstanding requests during an RMA op    */
diff --git a/src/mpid/pamid/include/mpidi_platform.h b/src/mpid/pamid/include/mpidi_platform.h
index e5ac1d8..cd95e72 100644
--- a/src/mpid/pamid/include/mpidi_platform.h
+++ b/src/mpid/pamid/include/mpidi_platform.h
@@ -86,6 +86,7 @@ typedef int32_t MPID_Node_id_t;
 #define PAMIX_IS_LOCAL_TASK_SHIFT   (6)
 #define MPIDI_SMP_DETECT_DEFAULT 1
 #define TOKEN_FLOW_CONTROL    0
+#define CUDA_AWARE_SUPPORT    0
 
 /*
  * Enable both the 'internal vs application' and the 'local vs remote'
@@ -142,6 +143,7 @@ static const char _ibm_release_version_[] = "V1R2M0";
 #define MPIDI_NO_ASSERT       1
 #define TOKEN_FLOW_CONTROL    1
 #define DYNAMIC_TASKING       1
+#define CUDA_AWARE_SUPPORT    1
 
 /* 'is local task' extension and limits */
 #define PAMIX_IS_LOCAL_TASK
diff --git a/src/mpid/pamid/include/mpidi_prototypes.h b/src/mpid/pamid/include/mpidi_prototypes.h
index d833637..083699e 100644
--- a/src/mpid/pamid/include/mpidi_prototypes.h
+++ b/src/mpid/pamid/include/mpidi_prototypes.h
@@ -266,6 +266,7 @@ pami_result_t MPIDI_Register_algorithms_ext(void                 *cookie,
                                             size_t               *num_algorithms);
 int MPIDI_collsel_pami_tune_parse_params(int argc, char ** argv);
 void MPIDI_collsel_pami_tune_cleanup();
+inline bool MPIDI_cuda_is_device_buf(const void* ptr);
 void MPIDI_Coll_Comm_create (MPID_Comm *comm);
 void MPIDI_Coll_Comm_destroy(MPID_Comm *comm);
 void MPIDI_Comm_coll_query  (MPID_Comm *comm);
diff --git a/src/mpid/pamid/src/coll/allreduce/mpido_allreduce.c b/src/mpid/pamid/src/coll/allreduce/mpido_allreduce.c
index 5920817..75d2a9c 100644
--- a/src/mpid/pamid/src/coll/allreduce/mpido_allreduce.c
+++ b/src/mpid/pamid/src/coll/allreduce/mpido_allreduce.c
@@ -117,6 +117,22 @@ int MPIDO_Allreduce(const void *sendbuf,
       fprintf(stderr,"Using MPICH allreduce type %u.\n",
               selected_type);
     MPIDI_Update_last_algorithm(comm_ptr, "ALLREDUCE_MPICH");
+#if CUDA_AWARE_SUPPORT
+    if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(sendbuf))
+    {
+       MPI_Aint dt_extent;
+       MPID_Datatype_get_extent_macro(dt, dt_extent);
+       char *buf =  MPIU_Malloc(dt_extent * count);
+       cudaError_t cudaerr = cudaMemcpy(buf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost);
+       if (cudaSuccess != cudaerr) {
+         fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaerr));
+       }
+       int cuda_res =  MPIR_Allreduce(buf, recvbuf, count, dt, op, comm_ptr, mpierrno);
+       MPIU_Free(buf);
+       return cuda_res;
+    }
+    else 
+#endif
     return MPIR_Allreduce(sendbuf, recvbuf, count, dt, op, comm_ptr, mpierrno);
   }
 
diff --git a/src/mpid/pamid/src/coll/reduce/mpido_reduce.c b/src/mpid/pamid/src/coll/reduce/mpido_reduce.c
index d3ac54b..2aeab7b 100644
--- a/src/mpid/pamid/src/coll/reduce/mpido_reduce.c
+++ b/src/mpid/pamid/src/coll/reduce/mpido_reduce.c
@@ -117,6 +117,22 @@ int MPIDO_Reduce(const void *sendbuf,
    {
       if(unlikely(verbose))
          fprintf(stderr,"Using MPICH reduce algorithm\n");
+#if CUDA_AWARE_SUPPORT
+      if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(sendbuf))
+      {
+         MPI_Aint dt_extent;
+         MPID_Datatype_get_extent_macro(datatype, dt_extent);
+         char *buf =  MPIU_Malloc(dt_extent * count);
+         cudaError_t cudaerr = cudaMemcpy(buf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost);
+         if (cudaSuccess != cudaerr) {
+           fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaerr));
+         }
+         int cuda_res =  MPIR_Reduce(buf, recvbuf, count, datatype, op, root, comm_ptr, mpierrno); 
+         MPIU_Free(buf);
+         return cuda_res;
+      }
+      else
+#endif
       return MPIR_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, mpierrno);
    }
 
diff --git a/src/mpid/pamid/src/coll/scan/mpido_scan.c b/src/mpid/pamid/src/coll/scan/mpido_scan.c
index 6f33c02..c0100c7 100644
--- a/src/mpid/pamid/src/coll/scan/mpido_scan.c
+++ b/src/mpid/pamid/src/coll/scan/mpido_scan.c
@@ -135,6 +135,26 @@ int MPIDO_Doscan(const void *sendbuf, void *recvbuf,
    {
       if(unlikely(verbose))
          fprintf(stderr,"Using MPICH scan algorithm (exflag %d)\n",exflag);
+#if CUDA_AWARE_SUPPORT
+      if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(sendbuf))
+      {
+         MPI_Aint dt_extent;
+         MPID_Datatype_get_extent_macro(datatype, dt_extent);
+         char *buf =  MPIU_Malloc(dt_extent * count);
+         cudaError_t cudaerr = cudaMemcpy(buf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost);
+         if (cudaSuccess != cudaerr) {
+           fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaerr));
+         }
+         int cuda_res;
+         if(exflag)
+           cuda_res =  MPIR_Exscan(buf, recvbuf, count, datatype, op, comm_ptr, mpierrno);
+         else
+           cuda_res =  MPIR_Scan(buf, recvbuf, count, datatype, op, comm_ptr, mpierrno);
+         MPIU_Free(buf);
+         return cuda_res;
+      }
+      else
+#endif
       if(exflag)
          return MPIR_Exscan(sendbuf, recvbuf, count, datatype, op, comm_ptr, mpierrno);
       else
diff --git a/src/mpid/pamid/src/mpid_buffer.c b/src/mpid/pamid/src/mpid_buffer.c
index 884b92c..0e2ad5a 100644
--- a/src/mpid/pamid/src/mpid_buffer.c
+++ b/src/mpid/pamid/src/mpid_buffer.c
@@ -57,10 +57,12 @@ void MPIDI_Buffer_copy(
     MPID_Datatype * sdt_ptr;
     MPID_Datatype * rdt_ptr;
 
+    MPI_Aint  sdt_extent;
+    MPI_Aint  rdt_extent;
+
     *smpi_errno = MPI_SUCCESS;
     *rmpi_errno = MPI_SUCCESS;
 
-
     /* printf("bufcopy: src count=%d dt=%d\n", scount, sdt); */
     /* printf("bufcopy: dst count=%d dt=%d\n", rcount, rdt); */
 
@@ -83,11 +85,52 @@ void MPIDI_Buffer_copy(
 
     if (sdt_contig && rdt_contig)
     {
+#if CUDA_AWARE_SUPPORT
+      if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rbuf))
+      {
+        cudaError_t cudaerr = cudaMemcpy(rbuf + rdt_true_lb, sbuf + sdt_true_lb, sdata_sz, cudaMemcpyHostToDevice);
+      }
+      else
+#endif
         memcpy((char*)rbuf + rdt_true_lb, (const char *)sbuf + sdt_true_lb, sdata_sz);
         *rsz = sdata_sz;
     }
     else if (sdt_contig)
     {
+#if CUDA_AWARE_SUPPORT
+      // This will need to be done in two steps:
+      // 1 - Allocate a temp buffer which is the same size as user buffer and unpack in it.
+      // 2 - Copy unpacked data into user buffer from temp buffer.
+      if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rbuf))
+      {
+        MPID_Datatype_get_extent_macro(rdt, rdt_extent);
+        char *buf =  MPIU_Malloc(rdt_extent * rcount);
+        memset(buf, 0, rdt_extent * rcount);        
+        MPID_Segment seg;
+        DLOOP_Offset last;
+
+        MPID_Segment_init(buf, rcount, rdt, &seg, 0);
+        last = sdata_sz;
+        MPID_Segment_unpack(&seg, 0, &last, (char*)sbuf + sdt_true_lb);
+        /* --BEGIN ERROR HANDLING-- */
+        if (last != sdata_sz)
+        {
+            *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0);
+        }
+        /* --END ERROR HANDLING-- */
+
+       *rsz = last;
+
+        
+        cudaError_t cudaerr = cudaMemcpy(rbuf + rdt_true_lb, buf, rdt_extent * rcount, cudaMemcpyHostToDevice);
+
+        MPIU_Free(buf);
+
+        goto fn_exit;
+
+      }
+#endif
+
         MPID_Segment seg;
         DLOOP_Offset last;
 
diff --git a/src/mpid/pamid/src/mpid_init.c b/src/mpid/pamid/src/mpid_init.c
index 52f0b7c..e768119 100644
--- a/src/mpid/pamid/src/mpid_init.c
+++ b/src/mpid/pamid/src/mpid_init.c
@@ -119,6 +119,9 @@ MPIDI_Process_t  MPIDI_Process = {
 #ifdef QUEUE_BINARY_SEARCH_SUPPORT
   .queue_binary_search_support_on = 0,
 #endif
+#if CUDA_AWARE_SUPPORT
+  .cuda_aware_support_on = 0,
+#endif
   .rma_pending           = 1000,
   .shmem_pt2pt           = 1,
   .smp_detect            = MPIDI_SMP_DETECT_DEFAULT,
diff --git a/src/mpid/pamid/src/mpidi_env.c b/src/mpid/pamid/src/mpidi_env.c
index dd1912f..c931858 100644
--- a/src/mpid/pamid/src/mpidi_env.c
+++ b/src/mpid/pamid/src/mpidi_env.c
@@ -1139,6 +1139,12 @@ MPIDI_Env_setup(int rank, int requested)
     char* names[] = {"MP_S_USE_QUEUE_BINARY_SEARCH_SUPPORT", NULL};
     ENV_Char(names, &MPIDI_Process.queue_binary_search_support_on);
 #endif
+
+#if CUDA_AWARE_SUPPORT
+    char* names[] = {"MP_CUDA_AWARE", NULL};
+    ENV_Char(names, &MPIDI_Process.cuda_aware_support_on);
+#endif
+
   /* Exit if any deprecated environment variables were specified. */
   if (found_deprecated_env_var)
     {
diff --git a/src/mpid/pamid/src/mpidi_util.c b/src/mpid/pamid/src/mpidi_util.c
index 774e786..896dd86 100644
--- a/src/mpid/pamid/src/mpidi_util.c
+++ b/src/mpid/pamid/src/mpidi_util.c
@@ -1921,6 +1921,34 @@ void MPIDI_collsel_pami_tune_cleanup()
   MPIDI_collsel_free_advisor_params(&MPIDI_Collsel_advisor_params);
 }
 
+
+
+/**********************************************************/
+/*                 CUDA Utilities                         */
+/**********************************************************/
+
+inline bool MPIDI_cuda_is_device_buf(const void* ptr)
+{
+    bool result = false;
+#if CUDA_AWARE_SUPPORT
+    struct cudaPointerAttributes cuda_attr;
+    cudaError_t e= cudaPointerGetAttributes  ( & cuda_attr, ptr);
+
+    if (e != cudaSuccess)
+        result = false;
+    else if (cuda_attr.memoryType ==  cudaMemoryTypeDevice)
+        result = true;
+    else
+        result = false;
+#endif
+    return result;
+}
+
+
+/**********************************************************/
+/*                End CUDA Utilities                      */
+/**********************************************************/
+
 #if defined(MPID_USE_NODE_IDS)
 #undef FUNCNAME
 #define FUNCNAME MPID_Get_node_id
diff --git a/src/mpid/pamid/src/pt2pt/mpidi_callback_short.c b/src/mpid/pamid/src/pt2pt/mpidi_callback_short.c
index 705d623..1746c45 100644
--- a/src/mpid/pamid/src/pt2pt/mpidi_callback_short.c
+++ b/src/mpid/pamid/src/pt2pt/mpidi_callback_short.c
@@ -195,7 +195,16 @@ MPIDI_RecvShortCB(pami_context_t    context,
   void* rcvbuf = rreq->mpid.userbuf;
 
   if (sndlen > 0)
-    memcpy(rcvbuf, sndbuf, sndlen);
+  {
+#if CUDA_AWARE_SUPPORT
+    if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rcvbuf))
+    {
+      cudaError_t cudaerr = cudaMemcpy(rcvbuf, sndbuf, (size_t)sndlen, cudaMemcpyHostToDevice);
+    }
+    else
+#endif
+      memcpy(rcvbuf, sndbuf, sndlen);
+  }
   TRACE_SET_R_VAL(source,(rreq->mpid.idx),rlen,sndlen);
   TRACE_SET_R_BIT(source,(rreq->mpid.idx),fl.f.comp_in_HH);
   TRACE_SET_R_VAL(source,(rreq->mpid.idx),bufadd,rreq->mpid.userbuf);
diff --git a/src/mpid/pamid/src/pt2pt/mpidi_callback_util.c b/src/mpid/pamid/src/pt2pt/mpidi_callback_util.c
index fa3b48b..c91e7da 100644
--- a/src/mpid/pamid/src/pt2pt/mpidi_callback_util.c
+++ b/src/mpid/pamid/src/pt2pt/mpidi_callback_util.c
@@ -175,7 +175,13 @@ MPIDI_Callback_process_userdefined_dt(pami_context_t      context,
       MPID_assert(rreq->mpid.uebuf    == NULL);
       MPID_assert(rreq->mpid.uebuflen == 0);
       void* rcvbuf = rreq->mpid.userbuf +  dt_true_lb;;
-
+#if CUDA_AWARE_SUPPORT
+    if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rcvbuf))
+    {
+      cudaError_t cudaerr = cudaMemcpy(rcvbuf, sndbuf, (size_t)sndlen, cudaMemcpyHostToDevice);
+    }
+    else
+#endif
       memcpy(rcvbuf, sndbuf, sndlen);
       MPIDI_Request_complete(rreq);
       return;
diff --git a/src/mpid/pamid/src/pt2pt/mpidi_rendezvous.c b/src/mpid/pamid/src/pt2pt/mpidi_rendezvous.c
index ec4569b..d7f0a0c 100644
--- a/src/mpid/pamid/src/pt2pt/mpidi_rendezvous.c
+++ b/src/mpid/pamid/src/pt2pt/mpidi_rendezvous.c
@@ -107,6 +107,16 @@ MPIDI_RendezvousTransfer(pami_context_t   context,
   pami_endpoint_t dest;
   MPIDI_Context_endpoint(rreq, &dest);
 
+#if CUDA_AWARE_SUPPORT
+  if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rcvbuf))
+  {
+    MPIDI_RendezvousTransfer_use_pami_get(context,dest,rcvbuf,rreq);
+  }
+  else
+  {
+#endif
+
+
 #ifdef USE_PAMI_RDMA
   size_t rcvlen_out;
   rc = PAMI_Memregion_create(context,
@@ -153,6 +163,10 @@ MPIDI_RendezvousTransfer(pami_context_t   context,
     }
 #endif
 
+#if CUDA_AWARE_SUPPORT
+  }
+#endif
+
   return PAMI_SUCCESS;
 }
 
diff --git a/src/mpid/pamid/src/pt2pt/mpidi_sendmsg.c b/src/mpid/pamid/src/pt2pt/mpidi_sendmsg.c
index 2086e43..65cf773 100644
--- a/src/mpid/pamid/src/pt2pt/mpidi_sendmsg.c
+++ b/src/mpid/pamid/src/pt2pt/mpidi_sendmsg.c
@@ -345,6 +345,26 @@ MPIDI_SendMsg_process_userdefined_dt(MPID_Request      * sreq,
    */
   else
     {
+      char *buf = NULL;
+#if CUDA_AWARE_SUPPORT
+      // This will need to be done in two steps:
+      // 1 - Allocate a temp buffer which is the same size as user buffer and copy in it.
+      // 2 - Pack data into ue buffer from temp buffer.
+      int on_device =  MPIDI_cuda_is_device_buf(sreq->mpid.userbuf);
+      if(MPIDI_Process.cuda_aware_support_on && on_device)
+      {
+        MPI_Aint dt_extent;
+        MPID_Datatype_get_extent_macro(sreq->mpid.datatype, dt_extent);
+        buf =  MPIU_Malloc(dt_extent * sreq->mpid.userbufcount);
+
+        cudaError_t cudaerr = cudaMemcpy(buf, sreq->mpid.userbuf, dt_extent * sreq->mpid.userbufcount, cudaMemcpyDeviceToHost);
+        if (cudaSuccess != cudaerr) {
+          fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaerr));
+        }
+
+      }
+#endif
+
       MPID_Segment segment;
 
       if(data_sz != 0) {
@@ -359,13 +379,23 @@ MPIDI_SendMsg_process_userdefined_dt(MPID_Request      * sreq,
         sreq->mpid.uebuf_malloc = mpiuMalloc;
 
         DLOOP_Offset last = data_sz;
-        MPID_Segment_init(sreq->mpid.userbuf,
+#if CUDA_AWARE_SUPPORT
+        if(!MPIDI_Process.cuda_aware_support_on || !on_device)
+          buf = sreq->mpid.userbuf;
+#endif
+        MPID_assert(buf != NULL);
+
+        MPID_Segment_init(buf,
                           sreq->mpid.userbufcount,
                           sreq->mpid.datatype,
                           &segment,
                           0);
         MPID_Segment_pack(&segment, 0, &last, sndbuf);
         MPID_assert(last == data_sz);
+#if CUDA_AWARE_SUPPORT
+        if(MPIDI_Process.cuda_aware_support_on && on_device)
+          MPIU_Free(buf);
+#endif
       } else {
 	sndbuf = NULL;
       }

-----------------------------------------------------------------------


hooks/post-receive
-- 
MPICH primary repository