[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2b1-6-g59083f6

Service Account noreply at mpich.org
Mon Mar 16 13:54:04 CDT 2015


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  59083f6d2fb37279339e09e8ddb1924bf5e08b27 (commit)
      from  48515bce989495d6952f6d318f0219f807547d9d (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/59083f6d2fb37279339e09e8ddb1924bf5e08b27

commit 59083f6d2fb37279339e09e8ddb1924bf5e08b27
Author: Sameh Sharkawi <sssharka at us.ibm.com>
Date:   Tue Mar 10 13:07:07 2015 -0400

    PAMID: CUDA AWARE support in collectives didn't handle local copy
    
    MPICH collectives use MPIR_Localcopy to move data from src to destination
    buffer for same task. MPIR_Localcopy can't handle GPU buffers. A change
    in MPIR_Localcopy would affect all common code. This change is to handle
    the checking of GPU buffers in the PAMID collectives layer and allocate
    host buffer and copy data from GPU buffer to Host buffer and vice versa so
    MPIR_Localcopy would work w/o issues.
    This is not performance optimized code.
    
    (ibm) D202834
    
    Signed-off-by: Su Huang <suhuang at us.ibm.com>

diff --git a/src/mpid/pamid/include/mpidi_prototypes.h b/src/mpid/pamid/include/mpidi_prototypes.h
index 083699e..1986c30 100644
--- a/src/mpid/pamid/include/mpidi_prototypes.h
+++ b/src/mpid/pamid/include/mpidi_prototypes.h
@@ -417,6 +417,12 @@ int MPIDO_Ialltoallw(const void *sendbuf, const int *sendcounts, const int *send
                      const MPI_Datatype * recvtypes,
                      MPID_Comm *comm_ptr, MPID_Request **request);
 
+int MPIDO_Reduce_scatter(const void *sendbuf, void *recvbuf, int *recvcounts, MPI_Datatype datatype,
+                 MPI_Op op, MPID_Comm *comm_ptr, int *mpierrno);
+
+int MPIDO_Reduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, 
+                 MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, int *mpierrno);
+
 int MPIDO_Ireduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount,
                                 MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr,
                                 MPID_Request **request);
diff --git a/src/mpid/pamid/src/coll/Makefile.mk b/src/mpid/pamid/src/coll/Makefile.mk
index 33f40aa..adf7131 100644
--- a/src/mpid/pamid/src/coll/Makefile.mk
+++ b/src/mpid/pamid/src/coll/Makefile.mk
@@ -37,6 +37,7 @@ include $(top_srcdir)/src/mpid/pamid/src/coll/alltoallw/Makefile.mk
 include $(top_srcdir)/src/mpid/pamid/src/coll/exscan/Makefile.mk
 include $(top_srcdir)/src/mpid/pamid/src/coll/ired_scat_block/Makefile.mk
 include $(top_srcdir)/src/mpid/pamid/src/coll/ired_scat/Makefile.mk
+include $(top_srcdir)/src/mpid/pamid/src/coll/red_scat/Makefile.mk
 
 mpi_core_sources +=               \
     src/mpid/pamid/src/coll/coll_utils.c
diff --git a/src/mpid/pamid/src/coll/allgather/mpido_allgather.c b/src/mpid/pamid/src/coll/allgather/mpido_allgather.c
index 6f5e4e6..0861fa9 100644
--- a/src/mpid/pamid/src/coll/allgather/mpido_allgather.c
+++ b/src/mpid/pamid/src/coll/allgather/mpido_allgather.c
@@ -356,6 +356,45 @@ MPIDO_Allgather(const void *sendbuf,
        fprintf(stderr,"Using MPICH allgather algorithm\n");
       TRACE_ERR("No options set/available; using MPICH for allgather\n");
       MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH");
+#if CUDA_AWARE_SUPPORT
+    if(MPIDI_Process.cuda_aware_support_on)
+    {
+       MPI_Aint sdt_extent,rdt_extent;
+       MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
+       MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
+       char *scbuf = NULL;
+       char *rcbuf = NULL;
+       int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
+       int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
+       if(is_send_dev_buf)
+       {
+         scbuf = MPIU_Malloc(sdt_extent * sendcount);
+         cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost);
+         if (cudaSuccess != cudaerr)
+           fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+       }
+       else
+         scbuf = sendbuf;
+       if(is_recv_dev_buf)
+       {
+         rcbuf = MPIU_Malloc(rdt_extent * recvcount);
+         memset(rcbuf, 0, rdt_extent * recvcount);
+       }
+       else
+         rcbuf = recvbuf;
+       int cuda_res =  MPIR_Allgather(scbuf, sendcount, sendtype, rcbuf, recvcount, recvtype, comm_ptr, mpierrno);
+       if(is_send_dev_buf)MPIU_Free(scbuf);
+       if(is_recv_dev_buf)
+         {
+           cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, rdt_extent * recvcount, cudaMemcpyHostToDevice);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+           MPIU_Free(rcbuf);
+         }
+       return cuda_res;
+    }
+    else
+#endif
       return MPIR_Allgather(sendbuf, sendcount, sendtype,
                             recvbuf, recvcount, recvtype,
                             comm_ptr, mpierrno);
diff --git a/src/mpid/pamid/src/coll/allgatherv/mpido_allgatherv.c b/src/mpid/pamid/src/coll/allgatherv/mpido_allgatherv.c
index 0bc3d42..37a767f 100644
--- a/src/mpid/pamid/src/coll/allgatherv/mpido_allgatherv.c
+++ b/src/mpid/pamid/src/coll/allgatherv/mpido_allgatherv.c
@@ -370,6 +370,58 @@ MPIDO_Allgatherv(const void *sendbuf,
              selected_type);
      TRACE_ERR("Using MPICH Allgatherv\n");
      MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_MPICH");
+#if CUDA_AWARE_SUPPORT
+    if(MPIDI_Process.cuda_aware_support_on)
+    {
+       MPI_Aint sdt_extent,rdt_extent;
+       MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
+       MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
+       char *scbuf = NULL;
+       char *rcbuf = NULL;
+       int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
+       int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
+       if(is_send_dev_buf)
+       {
+         scbuf = MPIU_Malloc(sdt_extent * sendcount);
+         cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost);
+         if (cudaSuccess != cudaerr)
+           fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+       }
+       else
+         scbuf = sendbuf;
+       size_t rtotal_buf;
+       if(is_recv_dev_buf)
+       {
+         //Since displs can be non-continous, we need to calculate max buffer size 
+         int highest_displs = displs[size - 1];
+         int highest_recvcount = recvcounts[size - 1];
+         for(i = 0; i < size; i++)
+         {
+           if(displs[i]+recvcounts[i] > highest_displs+highest_recvcount)
+           {
+             highest_displs = displs[i];
+             highest_recvcount = recvcounts[i];
+           }
+         }
+         rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent;
+         rcbuf = MPIU_Malloc(rtotal_buf);
+         memset(rcbuf, 0, rtotal_buf);
+       }
+       else
+         rcbuf = recvbuf;
+       int cuda_res =  MPIR_Allgatherv(scbuf, sendcount, sendtype, rcbuf, recvcounts, displs, recvtype, comm_ptr, mpierrno);
+       if(is_send_dev_buf)MPIU_Free(scbuf);
+       if(is_recv_dev_buf)
+         {
+           cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, rtotal_buf, cudaMemcpyHostToDevice);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+           MPIU_Free(rcbuf);
+         }
+       return cuda_res;
+    }
+    else
+#endif
      return MPIR_Allgatherv(sendbuf, sendcount, sendtype,
 			   recvbuf, recvcounts, displs, recvtype,
                           comm_ptr, mpierrno);
diff --git a/src/mpid/pamid/src/coll/allreduce/mpido_allreduce.c b/src/mpid/pamid/src/coll/allreduce/mpido_allreduce.c
index 75d2a9c..d720279 100644
--- a/src/mpid/pamid/src/coll/allreduce/mpido_allreduce.c
+++ b/src/mpid/pamid/src/coll/allreduce/mpido_allreduce.c
@@ -118,17 +118,39 @@ int MPIDO_Allreduce(const void *sendbuf,
               selected_type);
     MPIDI_Update_last_algorithm(comm_ptr, "ALLREDUCE_MPICH");
 #if CUDA_AWARE_SUPPORT
-    if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(sendbuf))
+    if(MPIDI_Process.cuda_aware_support_on)
     {
        MPI_Aint dt_extent;
        MPID_Datatype_get_extent_macro(dt, dt_extent);
-       char *buf =  MPIU_Malloc(dt_extent * count);
-       cudaError_t cudaerr = cudaMemcpy(buf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost);
-       if (cudaSuccess != cudaerr) {
-         fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaerr));
+       char *scbuf = NULL;
+       char *rcbuf = NULL;
+       int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
+       int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
+       if(is_send_dev_buf)
+       {
+         scbuf = MPIU_Malloc(dt_extent * count);
+         cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost);
+         if (cudaSuccess != cudaerr) 
+           fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+       }
+       else
+         scbuf = sendbuf;
+       if(is_recv_dev_buf)
+       {
+         rcbuf = MPIU_Malloc(dt_extent * count);
+         memset(rcbuf, 0, dt_extent * count);
        }
-       int cuda_res =  MPIR_Allreduce(buf, recvbuf, count, dt, op, comm_ptr, mpierrno);
-       MPIU_Free(buf);
+       else
+         rcbuf = recvbuf;
+       int cuda_res =  MPIR_Allreduce(scbuf, rcbuf, count, dt, op, comm_ptr, mpierrno);
+       if(is_send_dev_buf)MPIU_Free(scbuf);
+       if(is_recv_dev_buf)
+         {
+           cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, dt_extent * count, cudaMemcpyHostToDevice);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+           MPIU_Free(rcbuf);
+         }
        return cuda_res;
     }
     else 
diff --git a/src/mpid/pamid/src/coll/alltoall/mpido_alltoall.c b/src/mpid/pamid/src/coll/alltoall/mpido_alltoall.c
index 61644f5..42b34d1 100644
--- a/src/mpid/pamid/src/coll/alltoall/mpido_alltoall.c
+++ b/src/mpid/pamid/src/coll/alltoall/mpido_alltoall.c
@@ -91,6 +91,45 @@ int MPIDO_Alltoall(const void *sendbuf,
    {
       if(unlikely(verbose))
          fprintf(stderr,"Using MPICH alltoall algorithm\n");
+#if CUDA_AWARE_SUPPORT
+    if(MPIDI_Process.cuda_aware_support_on)
+    {
+       MPI_Aint sdt_extent,rdt_extent;
+       MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
+       MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
+       char *scbuf = NULL;
+       char *rcbuf = NULL;
+       int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
+       int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
+       if(is_send_dev_buf)
+       {
+         scbuf = MPIU_Malloc(sdt_extent * sendcount);
+         cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost);
+         if (cudaSuccess != cudaerr)
+           fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+       }
+       else
+         scbuf = sendbuf;
+       if(is_recv_dev_buf)
+       {
+         rcbuf = MPIU_Malloc(recvcount * rdt_extent);
+         memset(rcbuf, 0, recvcount * rdt_extent);
+       }
+       else
+         rcbuf = recvbuf;
+       int cuda_res =  MPIR_Alltoall_intra(scbuf, sendcount, sendtype, rcbuf, recvcount, recvtype, comm_ptr, mpierrno);
+       if(is_send_dev_buf)MPIU_Free(scbuf);
+       if(is_recv_dev_buf)
+         {
+           cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, recvcount * rdt_extent, cudaMemcpyHostToDevice);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+           MPIU_Free(rcbuf);
+         }
+       return cuda_res;
+    }
+    else
+#endif
       return MPIR_Alltoall_intra(sendbuf, sendcount, sendtype,
                       recvbuf, recvcount, recvtype,
                       comm_ptr, mpierrno);
diff --git a/src/mpid/pamid/src/coll/gather/mpido_gather.c b/src/mpid/pamid/src/coll/gather/mpido_gather.c
index 302cbe5..8676f02 100644
--- a/src/mpid/pamid/src/coll/gather/mpido_gather.c
+++ b/src/mpid/pamid/src/coll/gather/mpido_gather.c
@@ -191,6 +191,45 @@ int MPIDO_Gather(const void *sendbuf,
     MPIDI_Update_last_algorithm(comm_ptr, "GATHER_MPICH");
     if(unlikely(verbose))
       fprintf(stderr,"Using MPICH gather algorithm (01) opt %x, selected type %d\n",mpid->optgather,selected_type);
+#if CUDA_AWARE_SUPPORT
+    if(MPIDI_Process.cuda_aware_support_on)
+    {
+       MPI_Aint sdt_extent,rdt_extent;
+       MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
+       MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
+       char *scbuf = NULL;
+       char *rcbuf = NULL;
+       int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
+       int is_recv_dev_buf = (rank == root) ? MPIDI_cuda_is_device_buf(recvbuf) : 0;
+       if(is_send_dev_buf)
+       {
+         scbuf = MPIU_Malloc(sdt_extent * sendcount);
+         cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost);
+         if (cudaSuccess != cudaerr)
+           fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+       }
+       else
+         scbuf = sendbuf;
+       if(is_recv_dev_buf)
+       {
+         rcbuf = MPIU_Malloc(rdt_extent * recvcount);
+         memset(rcbuf, 0, rdt_extent * recvcount);
+       }
+       else
+         rcbuf = recvbuf;
+       int cuda_res =  MPIR_Gather(scbuf, sendcount, sendtype, rcbuf, recvcount, recvtype, root, comm_ptr, mpierrno);
+       if(is_send_dev_buf)MPIU_Free(scbuf);
+       if(is_recv_dev_buf)
+         {
+           cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, rdt_extent * recvcount, cudaMemcpyHostToDevice);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+           MPIU_Free(rcbuf);
+         }
+       return cuda_res;
+    }
+    else
+#endif
     return MPIR_Gather(sendbuf, sendcount, sendtype,
                        recvbuf, recvcount, recvtype,
                        root, comm_ptr, mpierrno);
diff --git a/src/mpid/pamid/src/coll/gatherv/mpido_gatherv.c b/src/mpid/pamid/src/coll/gatherv/mpido_gatherv.c
index 7a07544..b906c5e 100644
--- a/src/mpid/pamid/src/coll/gatherv/mpido_gatherv.c
+++ b/src/mpid/pamid/src/coll/gatherv/mpido_gatherv.c
@@ -50,6 +50,7 @@ int MPIDO_Gatherv(const void *sendbuf,
   }
 #endif
    TRACE_ERR("Entering MPIDO_Gatherv\n");
+   int i;
    int contig ATTRIBUTE((unused)), rsize ATTRIBUTE((unused)), ssize ATTRIBUTE((unused));
    int pamidt = 1;
    MPID_Datatype *dt_ptr = NULL;
@@ -59,6 +60,7 @@ int MPIDO_Gatherv(const void *sendbuf,
    int tmp;
    volatile unsigned gatherv_active = 1;
    const int rank = comm_ptr->rank;
+   const int size = comm_ptr->local_size;
 #if ASSERT_LEVEL==0
    /* We can't afford the tracing in ndebug/performance libraries */
     const unsigned verbose = 0;
@@ -82,6 +84,58 @@ int MPIDO_Gatherv(const void *sendbuf,
          fprintf(stderr,"Using MPICH gatherv algorithm\n");
       TRACE_ERR("GATHERV using MPICH\n");
       MPIDI_Update_last_algorithm(comm_ptr, "GATHERV_MPICH");
+#if CUDA_AWARE_SUPPORT
+    if(MPIDI_Process.cuda_aware_support_on)
+    {
+       MPI_Aint sdt_extent,rdt_extent;
+       MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
+       MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
+       char *scbuf = NULL;
+       char *rcbuf = NULL;
+       int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
+       int is_recv_dev_buf = (rank == root) ? MPIDI_cuda_is_device_buf(recvbuf) : 0;
+       if(is_send_dev_buf)
+       {
+         scbuf = MPIU_Malloc(sdt_extent * sendcount);
+         cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost);
+         if (cudaSuccess != cudaerr)
+           fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+       }
+       else
+         scbuf = sendbuf;
+       size_t rtotal_buf;
+       if(is_recv_dev_buf)
+       {
+         //Since displs can be non-continous, we need to calculate max buffer size 
+         int highest_displs = displs[size - 1];
+         int highest_recvcount = recvcounts[size - 1];
+         for(i = 0; i < size; i++)
+         {
+           if(displs[i]+recvcounts[i] > highest_displs+highest_recvcount)
+           {
+             highest_displs = displs[i];
+             highest_recvcount = recvcounts[i];
+           }
+         }
+         rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent;
+         rcbuf = MPIU_Malloc(rtotal_buf);
+         memset(rcbuf, 0, rtotal_buf);
+       }
+       else
+         rcbuf = recvbuf;
+       int cuda_res =  MPIR_Gatherv(scbuf, sendcount, sendtype, rcbuf, recvcounts, displs, recvtype, root, comm_ptr, mpierrno);
+       if(is_send_dev_buf)MPIU_Free(scbuf);
+       if(is_recv_dev_buf)
+         {
+           cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, rtotal_buf, cudaMemcpyHostToDevice);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+           MPIU_Free(rcbuf);
+         }
+       return cuda_res;
+    }
+    else
+#endif
       return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                recvbuf, recvcounts, displs, recvtype,
                root, comm_ptr, mpierrno);
diff --git a/src/mpid/pamid/src/coll/red_scat/Makefile.mk b/src/mpid/pamid/src/coll/red_scat/Makefile.mk
new file mode 100644
index 0000000..a95beaf
--- /dev/null
+++ b/src/mpid/pamid/src/coll/red_scat/Makefile.mk
@@ -0,0 +1,29 @@
+# begin_generated_IBM_copyright_prolog                             
+#                                                                  
+# This is an automatically generated copyright prolog.             
+# After initializing,  DO NOT MODIFY OR MOVE                       
+#  --------------------------------------------------------------- 
+# Licensed Materials - Property of IBM                             
+# Blue Gene/Q 5765-PER 5765-PRP                                    
+#                                                                  
+# (C) Copyright IBM Corp. 2011, 2012 All Rights Reserved           
+# US Government Users Restricted Rights -                          
+# Use, duplication, or disclosure restricted                       
+# by GSA ADP Schedule Contract with IBM Corp.                      
+#                                                                  
+#  --------------------------------------------------------------- 
+#                                                                  
+# end_generated_IBM_copyright_prolog                               
+# -*- mode: makefile-gmake; -*-
+
+# note that the includes always happen but the effects of their contents are
+# affected by "if BUILD_PAMID"
+if BUILD_PAMID
+
+
+mpi_core_sources +=                                    \
+    src/mpid/pamid/src/coll/red_scat/mpido_red_scat.c
+
+
+endif BUILD_PAMID
+
diff --git a/src/mpid/pamid/src/coll/red_scat/mpido_red_scat.c b/src/mpid/pamid/src/coll/red_scat/mpido_red_scat.c
new file mode 100644
index 0000000..d80bbda
--- /dev/null
+++ b/src/mpid/pamid/src/coll/red_scat/mpido_red_scat.c
@@ -0,0 +1,171 @@
+/* begin_generated_IBM_copyright_prolog                             */
+/*                                                                  */
+/* This is an automatically generated copyright prolog.             */
+/* After initializing,  DO NOT MODIFY OR MOVE                       */
+/*  --------------------------------------------------------------- */
+/* Licensed Materials - Property of IBM                             */
+/* Blue Gene/Q 5765-PER 5765-PRP                                    */
+/*                                                                  */
+/* (C) Copyright IBM Corp. 2011, 2012 All Rights Reserved           */
+/* US Government Users Restricted Rights -                          */
+/* Use, duplication, or disclosure restricted                       */
+/* by GSA ADP Schedule Contract with IBM Corp.                      */
+/*                                                                  */
+/*  --------------------------------------------------------------- */
+/*                                                                  */
+/* end_generated_IBM_copyright_prolog                               */
+/*  (C)Copyright IBM Corp.  2007, 2011  */
+/**
+ * \file src/coll/red_scat/mpido_red_scat.c
+ * \brief ???
+ */
+
+/* #define TRACE_ON */
+#include <mpidimpl.h>
+
+
+int MPIDO_Reduce_scatter(const void *sendbuf, 
+                 void *recvbuf, 
+                 int *recvcounts, 
+                 MPI_Datatype datatype,
+                 MPI_Op op,
+                 MPID_Comm *comm_ptr, 
+                 int *mpierrno)
+
+{
+    const int rank = comm_ptr->rank;
+    const int size = comm_ptr->local_size;
+#if ASSERT_LEVEL==0
+   /* We can't afford the tracing in ndebug/performance libraries */
+    const unsigned verbose = 0;
+#else
+    const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
+#endif
+
+    if(unlikely(verbose))
+       fprintf(stderr,"Using MPICH reduce_scatter algorithm\n");
+#if CUDA_AWARE_SUPPORT
+    if(MPIDI_Process.cuda_aware_support_on)
+    {
+       MPI_Aint dt_extent;
+       MPID_Datatype_get_extent_macro(datatype, dt_extent);
+       char *scbuf = NULL;
+       char *rcbuf = NULL;
+       int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
+       int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
+       int i;
+       size_t total_buf = 0;
+       for(i = 0; i < size; i++)
+       {
+         total_buf += recvcounts[i];
+       }
+
+       if(is_send_dev_buf)
+       {
+         scbuf = MPIU_Malloc(dt_extent * total_buf);
+         cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, dt_extent * total_buf, cudaMemcpyDeviceToHost);
+         if (cudaSuccess != cudaerr) 
+           fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+       }
+       else
+         scbuf = sendbuf;
+
+       if(is_recv_dev_buf)
+       {
+         rcbuf = MPIU_Malloc(total_buf * dt_extent);
+         memset(rcbuf, 0, total_buf * dt_extent);
+       }
+       else
+         rcbuf = recvbuf;
+
+       int cuda_res =  MPIR_Reduce_scatter(scbuf, rcbuf, recvcounts, datatype, op, comm_ptr, mpierrno);
+       if(is_send_dev_buf)MPIU_Free(scbuf);
+       if(is_recv_dev_buf)
+       {
+         cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, dt_extent * total_buf, cudaMemcpyHostToDevice);
+         if (cudaSuccess != cudaerr)
+           fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+         MPIU_Free(rcbuf);
+       }
+       return cuda_res;
+    }
+    else
+#endif
+    return MPIR_Reduce_scatter(sendbuf, recvbuf, recvcounts, datatype, op, comm_ptr, mpierrno);
+
+}
+
+
+
+int MPIDO_Reduce_scatter_block(const void *sendbuf, 
+                 void *recvbuf, 
+                 int recvcount, 
+                 MPI_Datatype datatype,
+                 MPI_Op op,
+                 MPID_Comm *comm_ptr, 
+                 int *mpierrno)
+
+{
+    const int rank = comm_ptr->rank;
+    const int size = comm_ptr->local_size;
+#if ASSERT_LEVEL==0
+   /* We can't afford the tracing in ndebug/performance libraries */
+    const unsigned verbose = 0;
+#else
+    const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
+#endif
+    if(unlikely(verbose))
+       fprintf(stderr,"Using MPICH reduce_scatter algorithm\n");
+#if CUDA_AWARE_SUPPORT
+    if(MPIDI_Process.cuda_aware_support_on)
+    {
+       MPI_Aint dt_extent;
+       MPID_Datatype_get_extent_macro(datatype, dt_extent);
+       char *scbuf = NULL;
+       char *rcbuf = NULL;
+       int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
+       int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
+       int i;
+       if(is_send_dev_buf)
+       {
+         scbuf = MPIU_Malloc(dt_extent * recvcount * size);
+         cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, dt_extent * recvcount * size, cudaMemcpyDeviceToHost);
+         if (cudaSuccess != cudaerr) 
+           fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+       }
+       else
+         scbuf = sendbuf;
+
+       if(is_recv_dev_buf)
+       {
+         rcbuf = MPIU_Malloc(dt_extent * recvcount * size);
+         memset(rcbuf, 0, dt_extent * recvcount * size);
+       }
+       else
+         rcbuf = recvbuf;
+
+       int cuda_res;
+       if(comm_ptr->comm_kind == MPID_INTRACOMM)
+         cuda_res =  MPIR_Reduce_scatter_block_intra(scbuf, rcbuf, recvcount, datatype, op, comm_ptr, mpierrno);
+       else 
+         cuda_res =  MPIR_Reduce_scatter_block_inter(scbuf, rcbuf, recvcount, datatype, op, comm_ptr, mpierrno);
+       if(is_send_dev_buf)MPIU_Free(scbuf);
+       if(is_recv_dev_buf)
+       {
+         cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, dt_extent * recvcount * size, cudaMemcpyHostToDevice);
+         if (cudaSuccess != cudaerr)
+           fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+         MPIU_Free(rcbuf);
+       }
+       return cuda_res;
+    }
+    else
+#endif
+       if(comm_ptr->comm_kind == MPID_INTRACOMM)
+         return MPIR_Reduce_scatter_block_intra(sendbuf, recvbuf, recvcount, datatype, op, comm_ptr, mpierrno);
+       else 
+         return MPIR_Reduce_scatter_block_inter(sendbuf, recvbuf, recvcount, datatype, op, comm_ptr, mpierrno);
+
+}
+
+
diff --git a/src/mpid/pamid/src/coll/reduce/mpido_reduce.c b/src/mpid/pamid/src/coll/reduce/mpido_reduce.c
index 2aeab7b..8e83765 100644
--- a/src/mpid/pamid/src/coll/reduce/mpido_reduce.c
+++ b/src/mpid/pamid/src/coll/reduce/mpido_reduce.c
@@ -118,17 +118,39 @@ int MPIDO_Reduce(const void *sendbuf,
       if(unlikely(verbose))
          fprintf(stderr,"Using MPICH reduce algorithm\n");
 #if CUDA_AWARE_SUPPORT
-      if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(sendbuf))
+      if(MPIDI_Process.cuda_aware_support_on)
       {
          MPI_Aint dt_extent;
          MPID_Datatype_get_extent_macro(datatype, dt_extent);
-         char *buf =  MPIU_Malloc(dt_extent * count);
-         cudaError_t cudaerr = cudaMemcpy(buf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost);
-         if (cudaSuccess != cudaerr) {
-           fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaerr));
+         char *scbuf = NULL;
+         char *rcbuf = NULL;
+         int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
+         int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
+         if(is_send_dev_buf)
+         {
+           scbuf = MPIU_Malloc(dt_extent * count);
+           cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost);
+           if (cudaSuccess != cudaerr) 
+             fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+         }
+         else
+           scbuf = sendbuf;
+         if(is_recv_dev_buf)
+         {
+           rcbuf = MPIU_Malloc(dt_extent * count);
+           memset(rcbuf, 0, dt_extent * count);
+         }
+         else
+           rcbuf = recvbuf;
+         int cuda_res =  MPIR_Reduce(scbuf, rcbuf, count, datatype, op, root, comm_ptr, mpierrno);
+         if(is_send_dev_buf)MPIU_Free(scbuf);
+         if(is_recv_dev_buf)
+         {
+           cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, dt_extent * count, cudaMemcpyHostToDevice);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+           MPIU_Free(rcbuf);
          }
-         int cuda_res =  MPIR_Reduce(buf, recvbuf, count, datatype, op, root, comm_ptr, mpierrno); 
-         MPIU_Free(buf);
          return cuda_res;
       }
       else
diff --git a/src/mpid/pamid/src/coll/scan/mpido_scan.c b/src/mpid/pamid/src/coll/scan/mpido_scan.c
index c0100c7..8bdce61 100644
--- a/src/mpid/pamid/src/coll/scan/mpido_scan.c
+++ b/src/mpid/pamid/src/coll/scan/mpido_scan.c
@@ -136,21 +136,43 @@ int MPIDO_Doscan(const void *sendbuf, void *recvbuf,
       if(unlikely(verbose))
          fprintf(stderr,"Using MPICH scan algorithm (exflag %d)\n",exflag);
 #if CUDA_AWARE_SUPPORT
-      if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(sendbuf))
+      if(MPIDI_Process.cuda_aware_support_on)
       {
          MPI_Aint dt_extent;
          MPID_Datatype_get_extent_macro(datatype, dt_extent);
-         char *buf =  MPIU_Malloc(dt_extent * count);
-         cudaError_t cudaerr = cudaMemcpy(buf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost);
-         if (cudaSuccess != cudaerr) {
-           fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaerr));
+         char *scbuf = NULL;
+         char *rcbuf = NULL;
+         int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
+         int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
+         if(is_send_dev_buf)
+         {
+           scbuf = MPIU_Malloc(dt_extent * count);
+           cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost);
+           if (cudaSuccess != cudaerr) 
+             fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+         }
+         else
+           scbuf = sendbuf;
+         if(is_recv_dev_buf)
+         {
+           rcbuf = MPIU_Malloc(dt_extent * count);
+           memset(rcbuf, 0, dt_extent * count);
          }
+         else
+           rcbuf = recvbuf;
          int cuda_res;
-         if(exflag)
-           cuda_res =  MPIR_Exscan(buf, recvbuf, count, datatype, op, comm_ptr, mpierrno);
+        if(exflag)
+           cuda_res =  MPIR_Exscan(scbuf, rcbuf, count, datatype, op, comm_ptr, mpierrno);
          else
-           cuda_res =  MPIR_Scan(buf, recvbuf, count, datatype, op, comm_ptr, mpierrno);
-         MPIU_Free(buf);
+           cuda_res =  MPIR_Scan(scbuf, rcbuf, count, datatype, op, comm_ptr, mpierrno);
+         if(is_send_dev_buf)MPIU_Free(scbuf);
+         if(is_recv_dev_buf)
+         {
+           cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, dt_extent * count, cudaMemcpyHostToDevice);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+           MPIU_Free(rcbuf);
+         }
          return cuda_res;
       }
       else
diff --git a/src/mpid/pamid/src/coll/scatter/mpido_scatter.c b/src/mpid/pamid/src/coll/scatter/mpido_scatter.c
index 4e76fad..0fcf07c 100644
--- a/src/mpid/pamid/src/coll/scatter/mpido_scatter.c
+++ b/src/mpid/pamid/src/coll/scatter/mpido_scatter.c
@@ -139,6 +139,45 @@ int MPIDO_Scatter(const void *sendbuf,
     if(unlikely(verbose))
       fprintf(stderr,"Using MPICH scatter algorithm\n");
     MPIDI_Update_last_algorithm(comm_ptr, "SCATTER_MPICH");
+#if CUDA_AWARE_SUPPORT
+    if(MPIDI_Process.cuda_aware_support_on)
+    {
+       MPI_Aint sdt_extent,rdt_extent;
+       MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
+       MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
+       char *scbuf = NULL;
+       char *rcbuf = NULL;
+       int is_send_dev_buf = (rank == root) ? MPIDI_cuda_is_device_buf(sendbuf) : 0;
+       int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
+       if(is_send_dev_buf)
+       {
+         scbuf = MPIU_Malloc(sdt_extent * sendcount);
+         cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost);
+         if (cudaSuccess != cudaerr)
+           fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+       }
+       else
+         scbuf = sendbuf;
+       if(is_recv_dev_buf)
+       {
+         rcbuf = MPIU_Malloc(rdt_extent * recvcount);
+         memset(rcbuf, 0, rdt_extent * recvcount);
+       }
+       else
+         rcbuf = recvbuf;
+       int cuda_res =  MPIR_Scatter(scbuf, sendcount, sendtype, rcbuf, recvcount, recvtype, root, comm_ptr, mpierrno);
+       if(is_send_dev_buf)MPIU_Free(scbuf);
+       if(is_recv_dev_buf)
+         {
+           cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, rdt_extent * recvcount, cudaMemcpyHostToDevice);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+           MPIU_Free(rcbuf);
+         }
+       return cuda_res;
+    }
+    else
+#endif
     return MPIR_Scatter(sendbuf, sendcount, sendtype,
                         recvbuf, recvcount, recvtype,
                         root, comm_ptr, mpierrno);
diff --git a/src/mpid/pamid/src/coll/scatterv/mpido_scatterv.c b/src/mpid/pamid/src/coll/scatterv/mpido_scatterv.c
index b3767d0..5dad933 100644
--- a/src/mpid/pamid/src/coll/scatterv/mpido_scatterv.c
+++ b/src/mpid/pamid/src/coll/scatterv/mpido_scatterv.c
@@ -222,7 +222,7 @@ int MPIDO_Scatterv(const void *sendbuf,
     return -1;
   }
 #endif
-  int tmp, pamidt = 1;
+  int tmp, i, pamidt = 1;
   int contig ATTRIBUTE((unused));
   int ssize ATTRIBUTE((unused));
   int rsize ATTRIBUTE((unused));
@@ -231,6 +231,7 @@ int MPIDO_Scatterv(const void *sendbuf,
   char *sbuf, *rbuf;
   pami_type_t stype, rtype;
   const int rank = comm_ptr->rank;
+  const int size = comm_ptr->local_size;
 #if ASSERT_LEVEL==0
    /* We can't afford the tracing in ndebug/performance libraries */
     const unsigned verbose = 0;
@@ -245,6 +246,58 @@ int MPIDO_Scatterv(const void *sendbuf,
     if(unlikely(verbose))
       fprintf(stderr,"Using MPICH scatterv algorithm\n");
     MPIDI_Update_last_algorithm(comm_ptr, "SCATTERV_MPICH");
+#if CUDA_AWARE_SUPPORT
+    if(MPIDI_Process.cuda_aware_support_on)
+    {
+       MPI_Aint sdt_extent,rdt_extent;
+       MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
+       MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
+       char *scbuf = NULL;
+       char *rcbuf = NULL;
+       int is_send_dev_buf = (rank == root) ? MPIDI_cuda_is_device_buf(sendbuf) : 0;
+       int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
+       if(is_send_dev_buf)
+       {
+         //Since displs can be non-continous, we need to calculate max buffer size 
+         int highest_displs = displs[size - 1];
+         int highest_sendcount = sendcounts[size - 1];
+         size_t stotal_buf;
+         for(i = 0; i < size; i++)
+         {
+           if(displs[i]+sendcounts[i] > highest_displs+highest_sendcount)
+           {
+             highest_displs = displs[i];
+             highest_sendcount = sendcounts[i];
+           }
+         }
+         stotal_buf = (highest_displs+highest_sendcount)*sdt_extent;
+         scbuf = MPIU_Malloc(stotal_buf);
+         cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, stotal_buf, cudaMemcpyDeviceToHost);
+         if (cudaSuccess != cudaerr)
+           fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+       }
+       else
+         scbuf = sendbuf;
+       if(is_recv_dev_buf)
+       {
+         rcbuf = MPIU_Malloc(recvcount * rdt_extent);
+         memset(rcbuf, 0, recvcount * rdt_extent);
+       }
+       else
+         rcbuf = recvbuf;
+       int cuda_res =  MPIR_Scatterv(scbuf, sendcounts, displs, sendtype, rcbuf, recvcount, recvtype, root, comm_ptr, mpierrno);
+       if(is_send_dev_buf)MPIU_Free(scbuf);
+       if(is_recv_dev_buf)
+         {
+           cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, recvcount * rdt_extent, cudaMemcpyHostToDevice);
+           if (cudaSuccess != cudaerr)
+             fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
+           MPIU_Free(rcbuf);
+         }
+       return cuda_res;
+    }
+    else
+#endif
     return MPIR_Scatterv(sendbuf, sendcounts, displs, sendtype,
                          recvbuf, recvcount, recvtype,
                          root, comm_ptr, mpierrno);
diff --git a/src/mpid/pamid/src/comm/mpid_selectcolls.c b/src/mpid/pamid/src/comm/mpid_selectcolls.c
index a8c3f81..2efff76 100644
--- a/src/mpid/pamid/src/comm/mpid_selectcolls.c
+++ b/src/mpid/pamid/src/comm/mpid_selectcolls.c
@@ -690,6 +690,8 @@ void MPIDI_Comm_coll_query(MPID_Comm *comm)
    comm->coll_fns->Reduce       = MPIDO_Reduce;
    comm->coll_fns->Scan         = MPIDO_Scan;
    comm->coll_fns->Exscan       = MPIDO_Exscan;
+   comm->coll_fns->Reduce_scatter_block = MPIDO_Reduce_scatter_block;
+   comm->coll_fns->Reduce_scatter = MPIDO_Reduce_scatter;
 
    /* MPI-3 Support, no optimized collectives hooked in yet */
    comm->coll_fns->Ibarrier_sched              = MPIR_Ibarrier_intra;

-----------------------------------------------------------------------

Summary of changes:
 src/mpid/pamid/include/mpidi_prototypes.h          |    6 +
 src/mpid/pamid/src/coll/Makefile.mk                |    1 +
 .../pamid/src/coll/allgather/mpido_allgather.c     |   39 +++++
 .../pamid/src/coll/allgatherv/mpido_allgatherv.c   |   52 ++++++
 .../pamid/src/coll/allreduce/mpido_allreduce.c     |   36 ++++-
 src/mpid/pamid/src/coll/alltoall/mpido_alltoall.c  |   39 +++++
 src/mpid/pamid/src/coll/gather/mpido_gather.c      |   39 +++++
 src/mpid/pamid/src/coll/gatherv/mpido_gatherv.c    |   54 ++++++
 .../pamid/src/{mpix => coll/red_scat}/Makefile.mk  |    2 +-
 src/mpid/pamid/src/coll/red_scat/mpido_red_scat.c  |  171 ++++++++++++++++++++
 src/mpid/pamid/src/coll/reduce/mpido_reduce.c      |   36 ++++-
 src/mpid/pamid/src/coll/scan/mpido_scan.c          |   40 ++++-
 src/mpid/pamid/src/coll/scatter/mpido_scatter.c    |   39 +++++
 src/mpid/pamid/src/coll/scatterv/mpido_scatterv.c  |   55 ++++++-
 src/mpid/pamid/src/comm/mpid_selectcolls.c         |    2 +
 15 files changed, 586 insertions(+), 25 deletions(-)
 copy src/mpid/pamid/src/{mpix => coll/red_scat}/Makefile.mk (96%)
 create mode 100644 src/mpid/pamid/src/coll/red_scat/mpido_red_scat.c


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list