[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2b1-66-ge87c158

Service Account noreply at mpich.org
Wed Apr 15 14:52:07 CDT 2015


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  e87c158f7cb9faca29de0b337dc62b7db7cd2946 (commit)
      from  eb0e7712de7e0d01c4c94b71ea88ae1ef6ac9a46 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/e87c158f7cb9faca29de0b337dc62b7db7cd2946

commit e87c158f7cb9faca29de0b337dc62b7db7cd2946
Author: Sameh Sharkawi <sssharka at us.ibm.com>
Date:   Wed Apr 15 14:47:10 2015 -0400

    PAMID: MPI_Allreduce/MPI_Reduce coredump w/ DOUBLE_INT datatype
    
    This commit includes multiple fixes:
     - Fixes for MPI_IN_PLACE checking. cudaPointerGetAttributes returns
       true on MPI_IN_PLACE, which causes issues. Now we check for MPI_IN_PLACE
       before passing the pointer to CUDA.
     - Enabling PAMID geometries (in order to get to PAMID collectives) when
       MP_CUDA_AWARE=yes. This allows CUDA buffers to be intercepted.
     - Disabling FCA when MP_CUDA_AWARE=yes if user enables FCA.
     - Copying user recv buffer into temp recv host buffer before collective
       starts, especially in MPI_IN_PLACE cases.
    
    (ibm) D203255
    
    Signed-off-by: Tsai-Yang (Alan) Jea <tjea at us.ibm.com>

diff --git a/src/mpid/pamid/include/mpidi_constants.h b/src/mpid/pamid/include/mpidi_constants.h
index 649af8f..3fcb3b2 100644
--- a/src/mpid/pamid/include/mpidi_constants.h
+++ b/src/mpid/pamid/include/mpidi_constants.h
@@ -81,6 +81,7 @@ enum
    MPID_COLL_OFF = 0,
    MPID_COLL_ON  = 1,
    MPID_COLL_FCA = 2, /* Selecting these is fairly easy so special case */
+   MPID_COLL_CUDA = 3, /* This is used to enable PAMI geometry but sets default to MPICH */
  };
 /** \} */
 
diff --git a/src/mpid/pamid/src/coll/allgather/mpido_allgather.c b/src/mpid/pamid/src/coll/allgather/mpido_allgather.c
index 429cf32..68e658a 100644
--- a/src/mpid/pamid/src/coll/allgather/mpido_allgather.c
+++ b/src/mpid/pamid/src/coll/allgather/mpido_allgather.c
@@ -378,7 +378,14 @@ MPIDO_Allgather(const void *sendbuf,
        if(is_recv_dev_buf)
        {
          rcbuf = MPIU_Malloc(rdt_extent * recvcount);
-         memset(rcbuf, 0, rdt_extent * recvcount);
+         if(sendbuf == MPI_IN_PLACE)
+         {
+           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rdt_extent * recvcount, cudaMemcpyDeviceToHost);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+         }
+         else
+           memset(rcbuf, 0, rdt_extent * recvcount);
        }
        else
          rcbuf = recvbuf;
diff --git a/src/mpid/pamid/src/coll/allgatherv/mpido_allgatherv.c b/src/mpid/pamid/src/coll/allgatherv/mpido_allgatherv.c
index 7c3121a..c930edf 100644
--- a/src/mpid/pamid/src/coll/allgatherv/mpido_allgatherv.c
+++ b/src/mpid/pamid/src/coll/allgatherv/mpido_allgatherv.c
@@ -405,7 +405,14 @@ MPIDO_Allgatherv(const void *sendbuf,
          }
          rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent;
          rcbuf = MPIU_Malloc(rtotal_buf);
-         memset(rcbuf, 0, rtotal_buf);
+         if(sendbuf == MPI_IN_PLACE)
+         {
+           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rtotal_buf, cudaMemcpyDeviceToHost);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+         }
+         else
+           memset(rcbuf, 0, rtotal_buf);
        }
        else
          rcbuf = recvbuf;
diff --git a/src/mpid/pamid/src/coll/allreduce/mpido_allreduce.c b/src/mpid/pamid/src/coll/allreduce/mpido_allreduce.c
index e7b9222..a569ba1 100644
--- a/src/mpid/pamid/src/coll/allreduce/mpido_allreduce.c
+++ b/src/mpid/pamid/src/coll/allreduce/mpido_allreduce.c
@@ -138,7 +138,14 @@ int MPIDO_Allreduce(const void *sendbuf,
        if(is_recv_dev_buf)
        {
          rcbuf = MPIU_Malloc(dt_extent * count);
-         memset(rcbuf, 0, dt_extent * count);
+         if(sendbuf == MPI_IN_PLACE)
+         {
+           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, dt_extent * count, cudaMemcpyDeviceToHost);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+         }
+         else
+           memset(rcbuf, 0, dt_extent * count);
        }
        else
          rcbuf = recvbuf;
diff --git a/src/mpid/pamid/src/coll/alltoall/mpido_alltoall.c b/src/mpid/pamid/src/coll/alltoall/mpido_alltoall.c
index fedb2c8..ee25766 100644
--- a/src/mpid/pamid/src/coll/alltoall/mpido_alltoall.c
+++ b/src/mpid/pamid/src/coll/alltoall/mpido_alltoall.c
@@ -113,7 +113,14 @@ int MPIDO_Alltoall(const void *sendbuf,
        if(is_recv_dev_buf)
        {
          rcbuf = MPIU_Malloc(recvcount * rdt_extent);
-         memset(rcbuf, 0, recvcount * rdt_extent);
+         if(sendbuf == MPI_IN_PLACE)
+         {
+           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, recvcount * rdt_extent, cudaMemcpyDeviceToHost);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+         }
+         else
+           memset(rcbuf, 0, recvcount * rdt_extent);
        }
        else
          rcbuf = recvbuf;
diff --git a/src/mpid/pamid/src/coll/gather/mpido_gather.c b/src/mpid/pamid/src/coll/gather/mpido_gather.c
index cbe9672..13d3332 100644
--- a/src/mpid/pamid/src/coll/gather/mpido_gather.c
+++ b/src/mpid/pamid/src/coll/gather/mpido_gather.c
@@ -213,7 +213,14 @@ int MPIDO_Gather(const void *sendbuf,
        if(is_recv_dev_buf)
        {
          rcbuf = MPIU_Malloc(rdt_extent * recvcount);
-         memset(rcbuf, 0, rdt_extent * recvcount);
+         if(sendbuf == MPI_IN_PLACE)
+         {
+           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rdt_extent * recvcount, cudaMemcpyDeviceToHost);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+         }
+         else
+           memset(rcbuf, 0, rdt_extent * recvcount);
        }
        else
          rcbuf = recvbuf;
diff --git a/src/mpid/pamid/src/coll/gatherv/mpido_gatherv.c b/src/mpid/pamid/src/coll/gatherv/mpido_gatherv.c
index 834a4e4..4213e95 100644
--- a/src/mpid/pamid/src/coll/gatherv/mpido_gatherv.c
+++ b/src/mpid/pamid/src/coll/gatherv/mpido_gatherv.c
@@ -119,7 +119,14 @@ int MPIDO_Gatherv(const void *sendbuf,
          }
          rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent;
          rcbuf = MPIU_Malloc(rtotal_buf);
-         memset(rcbuf, 0, rtotal_buf);
+         if(sendbuf == MPI_IN_PLACE)
+         {
+           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rtotal_buf, cudaMemcpyDeviceToHost);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+         }
+         else
+           memset(rcbuf, 0, rtotal_buf);
        }
        else
          rcbuf = recvbuf;
diff --git a/src/mpid/pamid/src/coll/red_scat/mpido_red_scat.c b/src/mpid/pamid/src/coll/red_scat/mpido_red_scat.c
index 4dd18c6..55870a8 100644
--- a/src/mpid/pamid/src/coll/red_scat/mpido_red_scat.c
+++ b/src/mpid/pamid/src/coll/red_scat/mpido_red_scat.c
@@ -73,7 +73,14 @@ int MPIDO_Reduce_scatter(const void *sendbuf,
        if(is_recv_dev_buf)
        {
          rcbuf = MPIU_Malloc(total_buf * dt_extent);
-         memset(rcbuf, 0, total_buf * dt_extent);
+         if(sendbuf == MPI_IN_PLACE)
+         {
+           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, dt_extent * total_buf, cudaMemcpyDeviceToHost);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+         }
+         else
+           memset(rcbuf, 0, total_buf * dt_extent);
        }
        else
          rcbuf = recvbuf;
@@ -131,7 +138,7 @@ int MPIDO_Reduce_scatter_block(const void *sendbuf,
          scbuf = MPIU_Malloc(dt_extent * recvcount * size);
          cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, dt_extent * recvcount * size, cudaMemcpyDeviceToHost);
          if (cudaSuccess != cudaerr) 
-           fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+           fprintf(stderr, "cudaMemcpy failed: %s recvbuf: %p scbuf: %p is_send_dev_buf: %d is_recv_dev_buf: %p sendbuf: %p\n", CudaGetErrorString(cudaerr), recvbuf, scbuf, is_send_dev_buf,is_recv_dev_buf, sendbuf );
        }
        else
          scbuf = sendbuf;
@@ -139,7 +146,14 @@ int MPIDO_Reduce_scatter_block(const void *sendbuf,
        if(is_recv_dev_buf)
        {
          rcbuf = MPIU_Malloc(dt_extent * recvcount * size);
-         memset(rcbuf, 0, dt_extent * recvcount * size);
+         if(sendbuf == MPI_IN_PLACE)
+         {
+           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, dt_extent * recvcount * size, cudaMemcpyDeviceToHost);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+         }
+         else
+           memset(rcbuf, 0, recvcount * size * dt_extent);
        }
        else
          rcbuf = recvbuf;
@@ -154,7 +168,7 @@ int MPIDO_Reduce_scatter_block(const void *sendbuf,
        {
          cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, dt_extent * recvcount * size, cudaMemcpyHostToDevice);
          if (cudaSuccess != cudaerr)
-           fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+           fprintf(stderr, "cudaMemcpy failed: %s recvbuf: %p rcbuf: %p is_send_dev_buf: %d is_recv_dev_buf: %p sendbuf: %p\n", CudaGetErrorString(cudaerr), recvbuf, rcbuf, is_send_dev_buf,is_recv_dev_buf, sendbuf );
          MPIU_Free(rcbuf);
        }
        return cuda_res;
diff --git a/src/mpid/pamid/src/coll/reduce/mpido_reduce.c b/src/mpid/pamid/src/coll/reduce/mpido_reduce.c
index 0c27f07..1ef45fb 100644
--- a/src/mpid/pamid/src/coll/reduce/mpido_reduce.c
+++ b/src/mpid/pamid/src/coll/reduce/mpido_reduce.c
@@ -138,7 +138,14 @@ int MPIDO_Reduce(const void *sendbuf,
          if(is_recv_dev_buf)
          {
            rcbuf = MPIU_Malloc(dt_extent * count);
-           memset(rcbuf, 0, dt_extent * count);
+           if(sendbuf == MPI_IN_PLACE)
+           {
+             cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, dt_extent * count, cudaMemcpyDeviceToHost);
+             if (cudaSuccess != cudaerr)
+               fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+           }
+           else
+             memset(rcbuf, 0, dt_extent * count);
          }
          else
            rcbuf = recvbuf;
diff --git a/src/mpid/pamid/src/coll/scan/mpido_scan.c b/src/mpid/pamid/src/coll/scan/mpido_scan.c
index 3fbb5f3..40c56ea 100644
--- a/src/mpid/pamid/src/coll/scan/mpido_scan.c
+++ b/src/mpid/pamid/src/coll/scan/mpido_scan.c
@@ -156,7 +156,14 @@ int MPIDO_Doscan(const void *sendbuf, void *recvbuf,
          if(is_recv_dev_buf)
          {
            rcbuf = MPIU_Malloc(dt_extent * count);
-           memset(rcbuf, 0, dt_extent * count);
+           if(sendbuf == MPI_IN_PLACE)
+           {
+           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, dt_extent * count, cudaMemcpyDeviceToHost);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
+           }
+           else
+             memset(rcbuf, 0, dt_extent * count);
          }
          else
            rcbuf = recvbuf;
diff --git a/src/mpid/pamid/src/coll/scatter/mpido_scatter.c b/src/mpid/pamid/src/coll/scatter/mpido_scatter.c
index 6ab1479..e36ea27 100644
--- a/src/mpid/pamid/src/coll/scatter/mpido_scatter.c
+++ b/src/mpid/pamid/src/coll/scatter/mpido_scatter.c
@@ -161,7 +161,7 @@ int MPIDO_Scatter(const void *sendbuf,
        if(is_recv_dev_buf)
        {
          rcbuf = MPIU_Malloc(rdt_extent * recvcount);
-         memset(rcbuf, 0, rdt_extent * recvcount);
+         CudaMemcpy(rcbuf, recvbuf, rdt_extent * recvcount, cudaMemcpyDeviceToHost);
        }
        else
          rcbuf = recvbuf;
diff --git a/src/mpid/pamid/src/coll/scatterv/mpido_scatterv.c b/src/mpid/pamid/src/coll/scatterv/mpido_scatterv.c
index 1487dc1..213b298 100644
--- a/src/mpid/pamid/src/coll/scatterv/mpido_scatterv.c
+++ b/src/mpid/pamid/src/coll/scatterv/mpido_scatterv.c
@@ -281,7 +281,7 @@ int MPIDO_Scatterv(const void *sendbuf,
        if(is_recv_dev_buf)
        {
          rcbuf = MPIU_Malloc(recvcount * rdt_extent);
-         memset(rcbuf, 0, recvcount * rdt_extent);
+         CudaMemcpy(rcbuf, recvbuf, recvcount * rdt_extent, cudaMemcpyDeviceToHost);
        }
        else
          rcbuf = recvbuf;
diff --git a/src/mpid/pamid/src/comm/mpid_selectcolls.c b/src/mpid/pamid/src/comm/mpid_selectcolls.c
index 2efff76..db4ac0c 100644
--- a/src/mpid/pamid/src/comm/mpid_selectcolls.c
+++ b/src/mpid/pamid/src/comm/mpid_selectcolls.c
@@ -257,7 +257,7 @@ void MPIDI_Comm_coll_envvars(MPID_Comm *comm)
       comm->mpid.user_selected_type[i] = MPID_COLL_NOSELECTION;
          if(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_0 && comm->rank == 0)
             fprintf(stderr,"Setting up collective %d on comm %p\n", i, comm);
-	 if((comm->mpid.coll_count[i][0] == 0) && (comm->mpid.coll_count[i][1] == 0))
+	 if(((comm->mpid.coll_count[i][0] == 0) && (comm->mpid.coll_count[i][1] == 0)) || MPIDI_Process.optimized.collectives == MPID_COLL_CUDA)
       {
          comm->mpid.user_selected_type[i] = MPID_COLL_USE_MPICH;
          comm->mpid.user_selected[i] = 0;
diff --git a/src/mpid/pamid/src/mpid_init.c b/src/mpid/pamid/src/mpid_init.c
index a2bc89d..f47cf29 100644
--- a/src/mpid/pamid/src/mpid_init.c
+++ b/src/mpid/pamid/src/mpid_init.c
@@ -641,8 +641,8 @@ void MPIDI_Init_collsel_extension()
     MPIDI_Process.optimized.auto_select_colls = MPID_AUTO_SELECT_COLLS_NONE;
 
 #ifndef __BGQ__
-  //If collective selection will be disabled, check on fca, if both not required, disable pami alltogether
-  if(MPIDI_Process.optimized.auto_select_colls == MPID_AUTO_SELECT_COLLS_NONE && MPIDI_Process.optimized.collectives != MPID_COLL_FCA)
+  //If collective selection will be disabled, check on fca and CUDA if both not required, disable pami alltogether
+  if(MPIDI_Process.optimized.auto_select_colls == MPID_AUTO_SELECT_COLLS_NONE && MPIDI_Process.optimized.collectives != MPID_COLL_FCA && MPIDI_Process.optimized.collectives != MPID_COLL_CUDA)
     MPIDI_Process.optimized.collectives = MPID_COLL_OFF;
 #endif
 }
diff --git a/src/mpid/pamid/src/mpidi_env.c b/src/mpid/pamid/src/mpidi_env.c
index f6934c6..bb924ac 100644
--- a/src/mpid/pamid/src/mpidi_env.c
+++ b/src/mpid/pamid/src/mpidi_env.c
@@ -1156,6 +1156,17 @@ MPIDI_Env_setup(int rank, int requested)
         fprintf(stderr, "Error loading libcudart\n");fflush(stderr);sleep(1);exit(1);
       }
     }
+    else if(MPIDI_Process.cuda_aware_support_on)
+    {
+      if(MPIDI_Process.optimized.collectives == MPID_COLL_FCA)
+        if(rank == 0)
+        {
+          fprintf(stderr, "Warning: FCA is not supported with CUDA Aware support\n");fflush(stderr);
+        }
+
+      MPIDI_Process.optimized.collectives = MPID_COLL_CUDA;
+      MPIDI_Process.optimized.select_colls = 0;
+    }
 #endif
 
   /* Exit if any deprecated environment variables were specified. */
diff --git a/src/mpid/pamid/src/mpidi_util.c b/src/mpid/pamid/src/mpidi_util.c
index fa0b5f4..37ebc8f 100644
--- a/src/mpid/pamid/src/mpidi_util.c
+++ b/src/mpid/pamid/src/mpidi_util.c
@@ -1969,19 +1969,25 @@ inline bool MPIDI_enable_cuda()
 
 inline bool MPIDI_cuda_is_device_buf(const void* ptr)
 {
-    bool result = false;
+  bool result = false;
 #if CUDA_AWARE_SUPPORT
-    struct cudaPointerAttributes cuda_attr;
-    cudaError_t e= CudaPointerGetAttributes  ( & cuda_attr, ptr);
+  if(MPIDI_Process.cuda_aware_support_on)
+  {
+    if(ptr != MPI_IN_PLACE)
+    {
+      struct cudaPointerAttributes cuda_attr;
+      cudaError_t e= CudaPointerGetAttributes  ( & cuda_attr, ptr);
 
-    if (e != cudaSuccess)
-        result = false;
-    else if (cuda_attr.memoryType ==  cudaMemoryTypeDevice)
-        result = true;
-    else
-        result = false;
+      if (e != cudaSuccess)
+          result = false;
+      else if (cuda_attr.memoryType ==  cudaMemoryTypeDevice)
+          result = true;
+      else
+          result = false;
+    }
+  }
 #endif
-    return result;
+  return result;
 }
 
 

-----------------------------------------------------------------------

Summary of changes:
 src/mpid/pamid/include/mpidi_constants.h           |    1 +
 .../pamid/src/coll/allgather/mpido_allgather.c     |    9 ++++++-
 .../pamid/src/coll/allgatherv/mpido_allgatherv.c   |    9 ++++++-
 .../pamid/src/coll/allreduce/mpido_allreduce.c     |    9 ++++++-
 src/mpid/pamid/src/coll/alltoall/mpido_alltoall.c  |    9 ++++++-
 src/mpid/pamid/src/coll/gather/mpido_gather.c      |    9 ++++++-
 src/mpid/pamid/src/coll/gatherv/mpido_gatherv.c    |    9 ++++++-
 src/mpid/pamid/src/coll/red_scat/mpido_red_scat.c  |   22 +++++++++++++---
 src/mpid/pamid/src/coll/reduce/mpido_reduce.c      |    9 ++++++-
 src/mpid/pamid/src/coll/scan/mpido_scan.c          |    9 ++++++-
 src/mpid/pamid/src/coll/scatter/mpido_scatter.c    |    2 +-
 src/mpid/pamid/src/coll/scatterv/mpido_scatterv.c  |    2 +-
 src/mpid/pamid/src/comm/mpid_selectcolls.c         |    2 +-
 src/mpid/pamid/src/mpid_init.c                     |    4 +-
 src/mpid/pamid/src/mpidi_env.c                     |   11 ++++++++
 src/mpid/pamid/src/mpidi_util.c                    |   26 ++++++++++++-------
 16 files changed, 115 insertions(+), 27 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list