[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2b3-160-gebc70f3

Service Account noreply at mpich.org
Fri Jun 26 16:25:00 CDT 2015


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  ebc70f3db40470a5c2f914103e779e4653966fee (commit)
       via  f49534e19e7fa40956e45fdcbd4933d66563332e (commit)
       via  5f54498761a43df407aa4e9bfff9f32c6a47960e (commit)
       via  c377f28d4a3b1c99ccc746534f8aa811cd251424 (commit)
       via  b30fe09ac495f92a9fe6a745db809251a87d608a (commit)
       via  390c449c8fb67f96bca2bdd61b80c8ef6fe01901 (commit)
       via  9ad7292426d5d91b453813f758ac6bd6c4968caf (commit)
       via  638ad7785e4fa969e47223e60f6bb920d6c60467 (commit)
       via  14cf7757372fae40c210f8831909d75b799781b5 (commit)
       via  8a6dab58fe8572176004642d8e52661bd7eaf1a1 (commit)
       via  d6f65c81786a448476069d47ea350d654f51d815 (commit)
       via  9b1b9241d063901710a465467b36878f284aa53b (commit)
       via  93a25439016411d1bd9a0cf473beeb897a40420c (commit)
       via  a9849cd112376455d2459ac08d4dcc7dace043f4 (commit)
       via  10e3c6447fff99259854b93b294fef4ad4a1e731 (commit)
       via  7c39124f49b33c88557ab8e6a6a4bb5fcdf8ce37 (commit)
       via  fb72c9ce66ad60e0dc419390e215eab11b891b2c (commit)
       via  3fdf2c073255c55d09e914e2aebca08e4be88e31 (commit)
       via  ede41471c48cd9b5cd8f30e875b507503a090731 (commit)
       via  9042b82822947297c021f281e10b394a265c1bf0 (commit)
       via  97569a1ac64cbc23bcae3e3440f3dfaf58d61b83 (commit)
       via  bf7f2f7f67408098f74a411aee3aeb76acde5f04 (commit)
       via  e9891eaeaa35d5ab5e5f407fcfbc11a3348e61cf (commit)
       via  06dbf44b8fe8cc313b5bab53e2cfd59b8085f718 (commit)
       via  139d85d502f89d81de4d88632b2cb315fc600769 (commit)
       via  03b4a2034195c1d47be1b29a55513742915e763a (commit)
      from  8b0b0a0f8371f39014b5b5a8e14db334f33bba46 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/ebc70f3db40470a5c2f914103e779e4653966fee

commit ebc70f3db40470a5c2f914103e779e4653966fee
Author: Min Si <msi at il.is.s.u-tokyo.ac.jp>
Date:   Thu Jun 25 17:13:04 2015 -0500

    Redefined RMA extended packet header in CH3 layer.
    
    This patch redefines the RMA extended header in the CH3 layer, based on
    commit 25e40e4. The extended header lets RMA issue an RMA-specific
    header. Each OP may define multiple attributes dynamically, or just
    leave it empty. Here is a summary of the detailed implementation.
    
    (1) We define a packet type for every kind of extended header, and for
    every kind of OP. For now, we have defined stream_{acc|get_acc},
    derived_{put|get|acc|get_acc} and stream_derived_{acc|get_acc}.
    
    (2) An extended header may contain fixed attributes, or variable-length
    parts (i.e., dataloop). We define all fixed attributes in the packet
    structure, followed by the variable-length parts.
    For example:
    -------------------------------------------------------------------
    | fixed attributes... | variable-len part 1 | variable-len part 2 |
    -------------------------------------------------------------------
    
    (3) The origin process simply allocates a contiguous buffer to hold both
    the fixed part and the variable-len parts, and transfers it to netmod
    through req; the target process can specify separate buffers to receive
    the variable-len parts (i.e., dataloop) from netmod in order to avoid an
    extra copy.
    
    (4) Each OP has its own initialization routine on the origin side and
    its own packet handler on the target side. For now, ACC and GET_ACC
    share a generic routine since all of their attributes are the same.
    
    Signed-off-by: Xin Zhao <xinzhao3 at illinois.edu>
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/include/mpid_nem_inline.h b/src/mpid/ch3/channels/nemesis/include/mpid_nem_inline.h
index 15188ca..a4f0109 100644
--- a/src/mpid/ch3/channels/nemesis/include/mpid_nem_inline.h
+++ b/src/mpid/ch3/channels/nemesis/include/mpid_nem_inline.h
@@ -353,6 +353,9 @@ MPID_nem_mpich_sendv_header (MPID_IOV **iov, int *n_iov,
     buf_offset += sizeof(MPIDI_CH3_Pkt_t);
 
     if (ext_hdr_sz > 0) {
+        /* ensure extended header fits in this cell. */
+        MPIU_Assert(MPID_NEM_MPICH_DATA_LEN - buf_offset >= ext_hdr_sz);
+
         /* when extended packet header exists, copy it */
         MPIU_Memcpy((void *)((char *)(el->pkt.mpich.p.payload) + buf_offset), ext_hdr_ptr, ext_hdr_sz);
         buf_offset += ext_hdr_sz;
diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index 2172e7e..e0d4be6 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -54,106 +54,156 @@ static inline int immed_copy(void *src, void *dest, size_t len)
     goto fn_exit;
 }
 
+/* =========================================================== */
+/*                  extended packet functions                  */
+/* =========================================================== */
 
-/* fill_in_derived_dtp_info() fills derived datatype information
-   into RMA operation structure. */
+/* Copy derived datatype information issued within RMA operation. */
 #undef FUNCNAME
 #define FUNCNAME fill_in_derived_dtp_info
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int fill_in_derived_dtp_info(MPIDI_RMA_Op_t * rma_op, MPID_Datatype * dtp)
+static inline void fill_in_derived_dtp_info(MPIDI_RMA_dtype_info * dtype_info, void *dataloop,
+                                            MPID_Datatype * dtp)
 {
-    int mpi_errno = MPI_SUCCESS;
-    MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_FILL_IN_DERIVED_DTP_INFO);
-
     MPIDI_FUNC_ENTER(MPID_STATE_FILL_IN_DERIVED_DTP_INFO);
 
     /* Derived datatype on target, fill derived datatype info. */
-    rma_op->dtype_info.is_contig = dtp->is_contig;
-    rma_op->dtype_info.max_contig_blocks = dtp->max_contig_blocks;
-    rma_op->dtype_info.size = dtp->size;
-    rma_op->dtype_info.extent = dtp->extent;
-    rma_op->dtype_info.dataloop_size = dtp->dataloop_size;
-    rma_op->dtype_info.dataloop_depth = dtp->dataloop_depth;
-    rma_op->dtype_info.basic_type = dtp->basic_type;
-    rma_op->dtype_info.dataloop = dtp->dataloop;
-    rma_op->dtype_info.ub = dtp->ub;
-    rma_op->dtype_info.lb = dtp->lb;
-    rma_op->dtype_info.true_ub = dtp->true_ub;
-    rma_op->dtype_info.true_lb = dtp->true_lb;
-    rma_op->dtype_info.has_sticky_ub = dtp->has_sticky_ub;
-    rma_op->dtype_info.has_sticky_lb = dtp->has_sticky_lb;
-
-    MPIU_Assert(rma_op->dataloop == NULL);
-    MPIU_CHKPMEM_MALLOC(rma_op->dataloop, void *, dtp->dataloop_size, mpi_errno, "dataloop");
-
-    MPIU_Memcpy(rma_op->dataloop, dtp->dataloop, dtp->dataloop_size);
+    dtype_info->is_contig = dtp->is_contig;
+    dtype_info->max_contig_blocks = dtp->max_contig_blocks;
+    dtype_info->size = dtp->size;
+    dtype_info->extent = dtp->extent;
+    dtype_info->dataloop_size = dtp->dataloop_size;
+    dtype_info->dataloop_depth = dtp->dataloop_depth;
+    dtype_info->basic_type = dtp->basic_type;
+    dtype_info->dataloop = dtp->dataloop;
+    dtype_info->ub = dtp->ub;
+    dtype_info->lb = dtp->lb;
+    dtype_info->true_ub = dtp->true_ub;
+    dtype_info->true_lb = dtp->true_lb;
+    dtype_info->has_sticky_ub = dtp->has_sticky_ub;
+    dtype_info->has_sticky_lb = dtp->has_sticky_lb;
+
+    MPIU_Assert(dataloop != NULL);
+    MPIU_Memcpy(dataloop, dtp->dataloop, dtp->dataloop_size);
     /* The dataloop can have undefined padding sections, so we need to let
      * valgrind know that it is OK to pass this data to writev later on. */
-    MPL_VG_MAKE_MEM_DEFINED(rma_op->dataloop, dtp->dataloop_size);
+    MPL_VG_MAKE_MEM_DEFINED(dataloop, dtp->dataloop_size);
 
-  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_FILL_IN_DERIVED_DTP_INFO);
-    MPIU_CHKPMEM_COMMIT();
-    return mpi_errno;
-  fn_fail:
-    MPIU_CHKPMEM_REAP();
-    goto fn_exit;
 }
 
-
+/* Set extended header for ACC operation and return its real size. */
 #undef FUNCNAME
-#define FUNCNAME create_datatype
+#define FUNCNAME init_accum_ext_pkt
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int create_datatype(int *ints, MPI_Aint * displaces, MPI_Datatype * datatypes,
-                           MPID_Datatype ** combined_dtp)
+static int init_accum_ext_pkt(MPIDI_CH3_Pkt_flags_t flags,
+                              MPID_Datatype * target_dtp, MPIDI_msg_sz_t stream_offset,
+                              void **ext_hdr_ptr, MPI_Aint * ext_hdr_sz)
 {
+    MPI_Aint _ext_hdr_sz = 0, _total_sz = 0;
+    void *dataloop_ptr = NULL;
     int mpi_errno = MPI_SUCCESS;
-    /* datatype_set_contents wants an array 'ints' which is the
-     * blocklens array with count prepended to it.  So blocklens
-     * points to the 2nd element of ints to avoid having to copy
-     * blocklens into ints later. */
-    int *blocklens = &ints[1];
-    MPI_Datatype combined_datatype;
-    int count = ints[0];
-    MPIDI_STATE_DECL(MPID_STATE_CREATE_DATATYPE);
-
-    MPIDI_FUNC_ENTER(MPID_STATE_CREATE_DATATYPE);
-
-    mpi_errno = MPID_Type_struct(count, blocklens, displaces, datatypes, &combined_datatype);
-    if (mpi_errno)
-        MPIU_ERR_POP(mpi_errno);
 
-    MPID_Datatype_get_ptr(combined_datatype, *combined_dtp);
-    mpi_errno = MPID_Datatype_set_contents(*combined_dtp, MPI_COMBINER_STRUCT, count + 1,       /* ints (cnt,blklen) */
-                                           count,       /* aints (disps) */
-                                           count,       /* types */
-                                           ints, displaces, datatypes);
-    if (mpi_errno)
-        MPIU_ERR_POP(mpi_errno);
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_INIT_ACCUM_EXT_PKT);
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_INIT_ACCUM_EXT_PKT);
+
+    if ((flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) && target_dtp != NULL) {
+        MPIDI_CH3_Ext_pkt_accum_stream_derived_t *_ext_hdr_ptr = NULL;
 
-    /* Commit datatype */
+        /* dataloop is behind of extended header on origin.
+         * TODO: support extended header array */
+        _ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_accum_stream_derived_t);
+        _total_sz = _ext_hdr_sz + target_dtp->dataloop_size;
 
-    MPID_Dataloop_create(combined_datatype,
-                         &(*combined_dtp)->dataloop,
-                         &(*combined_dtp)->dataloop_size,
-                         &(*combined_dtp)->dataloop_depth, MPID_DATALOOP_HOMOGENEOUS);
+        _ext_hdr_ptr = (MPIDI_CH3_Ext_pkt_accum_stream_derived_t *) MPIU_Malloc(_total_sz);
+        if (_ext_hdr_ptr == NULL) {
+            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem",
+                                 "**nomem %s", "MPIDI_CH3_Ext_pkt_accum_stream_derived_t");
+        }
 
-    /* create heterogeneous dataloop */
-    MPID_Dataloop_create(combined_datatype,
-                         &(*combined_dtp)->hetero_dloop,
-                         &(*combined_dtp)->hetero_dloop_size,
-                         &(*combined_dtp)->hetero_dloop_depth, MPID_DATALOOP_HETEROGENEOUS);
+        _ext_hdr_ptr->stream_offset = stream_offset;
+
+        dataloop_ptr = (void *) ((char *) _ext_hdr_ptr + _ext_hdr_sz);
+        fill_in_derived_dtp_info(&_ext_hdr_ptr->dtype_info, dataloop_ptr, target_dtp);
+
+        (*ext_hdr_ptr) = _ext_hdr_ptr;
+    }
+    else if (flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
+        MPIDI_CH3_Ext_pkt_accum_stream_t *_ext_hdr_ptr = NULL;
+
+        _total_sz = sizeof(MPIDI_CH3_Ext_pkt_accum_stream_t);
+
+        _ext_hdr_ptr = (MPIDI_CH3_Ext_pkt_accum_stream_t *) MPIU_Malloc(_total_sz);
+        if (_ext_hdr_ptr == NULL) {
+            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem",
+                                 "**nomem %s", "MPIDI_CH3_Ext_pkt_accum_stream_t");
+        }
+
+        _ext_hdr_ptr->stream_offset = stream_offset;
+        (*ext_hdr_ptr) = _ext_hdr_ptr;
+    }
+    else if (target_dtp != NULL) {
+        MPIDI_CH3_Ext_pkt_accum_derived_t *_ext_hdr_ptr = NULL;
+
+        /* dataloop is behind of extended header on origin.
+         * TODO: support extended header array */
+        _ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_accum_derived_t);
+        _total_sz = _ext_hdr_sz + target_dtp->dataloop_size;
+
+        _ext_hdr_ptr = (MPIDI_CH3_Ext_pkt_accum_derived_t *) MPIU_Malloc(_total_sz);
+        if (_ext_hdr_ptr == NULL) {
+            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem",
+                                 "**nomem %s", "MPIDI_CH3_Ext_pkt_accum_derived_t");
+        }
+
+        dataloop_ptr = (void *) ((char *) _ext_hdr_ptr + _ext_hdr_sz);
+        fill_in_derived_dtp_info(&_ext_hdr_ptr->dtype_info, dataloop_ptr, target_dtp);
+
+        (*ext_hdr_ptr) = _ext_hdr_ptr;
+    }
+
+    (*ext_hdr_sz) = _total_sz;
 
   fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_CREATE_DATATYPE);
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_INIT_ACCUM_EXT_PKT);
     return mpi_errno;
   fn_fail:
+    if ((*ext_hdr_ptr))
+        MPIU_Free((*ext_hdr_ptr));
+    (*ext_hdr_ptr) = NULL;
+    (*ext_hdr_sz) = 0;
     goto fn_exit;
 }
 
+#undef FUNCNAME
+#define FUNCNAME init_get_accum_ext_pkt
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int init_get_accum_ext_pkt(MPIDI_CH3_Pkt_flags_t flags,
+                                  MPID_Datatype * target_dtp, MPIDI_msg_sz_t stream_offset,
+                                  void **ext_hdr_ptr, MPI_Aint * ext_hdr_sz)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_INIT_GET_ACCUM_EXT_PKT);
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_INIT_GET_ACCUM_EXT_PKT);
+
+    /* Check if get_accum still reuses accum' extended packet header. */
+    MPIU_Assert(sizeof(MPIDI_CH3_Ext_pkt_accum_stream_derived_t) ==
+                sizeof(MPIDI_CH3_Ext_pkt_get_accum_stream_derived_t));
+    MPIU_Assert(sizeof(MPIDI_CH3_Ext_pkt_accum_derived_t) ==
+                sizeof(MPIDI_CH3_Ext_pkt_get_accum_derived_t));
+    MPIU_Assert(sizeof(MPIDI_CH3_Ext_pkt_accum_stream_t) ==
+                sizeof(MPIDI_CH3_Ext_pkt_get_accum_stream_t));
+
+    mpi_errno = init_accum_ext_pkt(flags, target_dtp, stream_offset, ext_hdr_ptr, ext_hdr_sz);
+
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_INIT_GET_ACCUM_EXT_PKT);
+    return mpi_errno;
+}
 
 /* =========================================================== */
 /*                      issuinng functions                     */
@@ -166,6 +216,7 @@ static int create_datatype(int *ints, MPI_Aint * displaces, MPI_Datatype * datat
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc,
+                                    void *ext_hdr_ptr, MPI_Aint ext_hdr_sz,
                                     MPIDI_msg_sz_t stream_offset, MPIDI_msg_sz_t stream_size,
                                     MPID_Request ** req_ptr)
 {
@@ -175,11 +226,6 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc,
     MPID_IOV iov[MPID_IOV_LIMIT];
     int iovcnt = 0;
     MPID_Request *req = NULL;
-    int count;
-    int *ints = NULL;
-    int *blocklens = NULL;
-    MPI_Aint *displaces = NULL;
-    MPI_Datatype *datatypes = NULL;
     MPI_Aint dt_true_lb;
     MPIDI_CH3_Pkt_flags_t flags;
     int is_empty_origin = FALSE;
@@ -202,15 +248,8 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc,
     if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
         MPID_Datatype_get_ptr(target_datatype, target_dtp);
 
-        if (rma_op->dataloop == NULL) {
-            /* Fill derived datatype info. */
-            mpi_errno = fill_in_derived_dtp_info(rma_op, target_dtp);
-            if (mpi_errno != MPI_SUCCESS)
-                MPIU_ERR_POP(mpi_errno);
-
-            /* Set dataloop size in pkt header */
-            MPIDI_CH3_PKT_RMA_SET_DATALOOP_SIZE(rma_op->pkt, target_dtp->dataloop_size, mpi_errno);
-        }
+        /* Set dataloop size in pkt header */
+        MPIDI_CH3_PKT_RMA_SET_DATALOOP_SIZE(rma_op->pkt, target_dtp->dataloop_size, mpi_errno);
     }
 
     if (is_empty_origin == FALSE) {
@@ -278,186 +317,43 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc,
     MPIU_Object_set_ref(req, 2);
     req->kind = MPID_REQUEST_SEND;
 
-    /* allocate and fill in extended packet header in the request */
-    if (flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
-        MPIU_Assert(rma_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
-                    rma_op->pkt.type == MPIDI_CH3_PKT_GET_ACCUM);
-        if (rma_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE) {
-            req->dev.ext_hdr_ptr = MPIU_Malloc(sizeof(MPIDI_CH3_Ext_pkt_accum_t));
-            if (!req->dev.ext_hdr_ptr) {
-                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                     "MPIDI_CH3_Ext_pkt_accum_t");
-            }
-            req->dev.ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_accum_t);
-            ((MPIDI_CH3_Ext_pkt_accum_t *) req->dev.ext_hdr_ptr)->stream_offset = stream_offset;
-        }
-        else {
-            req->dev.ext_hdr_ptr = MPIU_Malloc(sizeof(MPIDI_CH3_Ext_pkt_get_accum_t));
-            if (!req->dev.ext_hdr_ptr) {
-                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                     "MPIDI_CH3_Ext_pkt_get_accum_t");
-            }
-            req->dev.ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_get_accum_t);
-            ((MPIDI_CH3_Ext_pkt_get_accum_t *) req->dev.ext_hdr_ptr)->stream_offset = stream_offset;
-        }
+    /* set extended packet header, it is freed when the request is freed.  */
+    if (ext_hdr_sz > 0) {
+        req->dev.ext_hdr_sz = ext_hdr_sz;
+        req->dev.ext_hdr_ptr = ext_hdr_ptr;
     }
 
-    if (target_dtp == NULL) {
-        /* basic datatype on target */
-
-        if (origin_dtp != NULL) {
-            req->dev.datatype_ptr = origin_dtp;
-            /* this will cause the datatype to be freed when the request
-             * is freed. */
-        }
-
-        if (is_origin_contig) {
-            /* origin data is contiguous */
-
-            if (is_empty_origin == FALSE) {
-                iov[iovcnt].MPID_IOV_BUF =
-                    (MPID_IOV_BUF_CAST) ((char *) rma_op->origin_addr + dt_true_lb + stream_offset);
-                iov[iovcnt].MPID_IOV_LEN = stream_size;
-                iovcnt++;
-            }
-
-            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-            mpi_errno = MPIDI_CH3_iSendv(vc, req, iov, iovcnt);
-            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-        }
-        else {
-            /* origin data is non-contiguous */
-            req->dev.segment_ptr = MPID_Segment_alloc();
-            MPIU_ERR_CHKANDJUMP1(req->dev.segment_ptr == NULL, mpi_errno,
-                                 MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
-
-            MPID_Segment_init(rma_op->origin_addr, rma_op->origin_count,
-                              rma_op->origin_datatype, req->dev.segment_ptr, 0);
-            req->dev.segment_first = stream_offset;
-            req->dev.segment_size = stream_offset + stream_size;
-
-            req->dev.OnFinal = 0;
-            req->dev.OnDataAvail = 0;
-
-            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-            mpi_errno = vc->sendNoncontig_fn(vc, req, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
-            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-        }
+    if (origin_dtp != NULL) {
+        req->dev.datatype_ptr = origin_dtp;
+        /* this will cause the datatype to be freed when the request
+         * is freed. */
     }
-    else {
-        /* derived datatype on target */
-        MPID_Datatype *combined_dtp = NULL;
-        MPID_Segment *segp = NULL;
-        DLOOP_VECTOR *dloop_vec = NULL;
-        MPID_Datatype *dtp = NULL;
-        int vec_len, i;
-        MPIDI_msg_sz_t first = stream_offset;
-        MPIDI_msg_sz_t last = stream_offset + stream_size;
-
-        /* create a new datatype containing the dtype_info, dataloop, and origin data */
-
-        if (is_empty_origin == TRUE) {
-            count = 2;
-            ints = (int *) MPIU_Malloc(sizeof(int) * (count + 1));
-            blocklens = &ints[1];
-            displaces = (MPI_Aint *) MPIU_Malloc(sizeof(MPI_Aint) * count);
-            datatypes = (MPI_Datatype *) MPIU_Malloc(sizeof(MPI_Datatype) * count);
-
-            ints[0] = count;
-
-            displaces[0] = MPIU_PtrToAint(&(rma_op->dtype_info));
-            blocklens[0] = sizeof(MPIDI_RMA_dtype_info);
-            datatypes[0] = MPI_BYTE;
-
-            displaces[1] = MPIU_PtrToAint(rma_op->dataloop);
-            MPIU_Assign_trunc(blocklens[1], target_dtp->dataloop_size, int);
-            datatypes[1] = MPI_BYTE;
-        }
-        else if (flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
-            segp = MPID_Segment_alloc();
-            MPIU_ERR_CHKANDJUMP1(segp == NULL, mpi_errno, MPI_ERR_OTHER,
-                                 "**nomem", "**nomem %s", "MPID_Segment_alloc");
-
-            MPID_Segment_init(rma_op->origin_addr, rma_op->origin_count, rma_op->origin_datatype,
-                              segp, 0);
-
-            MPID_Datatype_get_ptr(rma_op->origin_datatype, dtp);
-            vec_len = dtp->max_contig_blocks * rma_op->origin_count + 1;
-            dloop_vec = (DLOOP_VECTOR *) MPIU_Malloc(vec_len * sizeof(DLOOP_VECTOR));
-            /* --BEGIN ERROR HANDLING-- */
-            if (!dloop_vec) {
-                mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
-                                                 FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0);
-                goto fn_fail;
-            }
-            /* --END ERROR HANDLING-- */
-
-            MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);
-
-            count = 2 + vec_len;
-
-            ints = (int *) MPIU_Malloc(sizeof(int) * (count + 1));
-            blocklens = &ints[1];
-            displaces = (MPI_Aint *) MPIU_Malloc(sizeof(MPI_Aint) * count);
-            datatypes = (MPI_Datatype *) MPIU_Malloc(sizeof(MPI_Datatype) * count);
-
-            ints[0] = count;
-
-            displaces[0] = MPIU_PtrToAint(&(rma_op->dtype_info));
-            blocklens[0] = sizeof(MPIDI_RMA_dtype_info);
-            datatypes[0] = MPI_BYTE;
-
-            displaces[1] = MPIU_PtrToAint(rma_op->dataloop);
-            MPIU_Assign_trunc(blocklens[1], target_dtp->dataloop_size, int);
-            datatypes[1] = MPI_BYTE;
-
-            for (i = 0; i < vec_len; i++) {
-                displaces[i + 2] = MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF);
-                MPIU_Assign_trunc(blocklens[i + 2], dloop_vec[i].DLOOP_VECTOR_LEN, int);
-                datatypes[i + 2] = MPI_BYTE;
-            }
-
-            MPID_Segment_free(segp);
-            MPIU_Free(dloop_vec);
-        }
-        else {
-            count = 3;
-            ints = (int *) MPIU_Malloc(sizeof(int) * (count + 1));
-            blocklens = &ints[1];
-            displaces = (MPI_Aint *) MPIU_Malloc(sizeof(MPI_Aint) * count);
-            datatypes = (MPI_Datatype *) MPIU_Malloc(sizeof(MPI_Datatype) * count);
 
-            ints[0] = count;
+    if (is_origin_contig) {
+        /* origin data is contiguous */
 
-            displaces[0] = MPIU_PtrToAint(&(rma_op->dtype_info));
-            blocklens[0] = sizeof(MPIDI_RMA_dtype_info);
-            datatypes[0] = MPI_BYTE;
-
-            displaces[1] = MPIU_PtrToAint(rma_op->dataloop);
-            MPIU_Assign_trunc(blocklens[1], target_dtp->dataloop_size, int);
-            datatypes[1] = MPI_BYTE;
-
-            displaces[2] = MPIU_PtrToAint(rma_op->origin_addr);
-            blocklens[2] = rma_op->origin_count;
-            datatypes[2] = rma_op->origin_datatype;
+        if (is_empty_origin == FALSE) {
+            iov[iovcnt].MPID_IOV_BUF =
+                (MPID_IOV_BUF_CAST) ((char *) rma_op->origin_addr + dt_true_lb + stream_offset);
+            iov[iovcnt].MPID_IOV_LEN = stream_size;
+            iovcnt++;
         }
 
-        mpi_errno = create_datatype(ints, displaces, datatypes, &combined_dtp);
-        if (mpi_errno)
-            MPIU_ERR_POP(mpi_errno);
-
-        req->dev.datatype_ptr = combined_dtp;
-        /* combined_datatype will be freed when request is freed */
-
+        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+        mpi_errno = MPIDI_CH3_iSendv(vc, req, iov, iovcnt);
+        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+    }
+    else {
+        /* origin data is non-contiguous */
         req->dev.segment_ptr = MPID_Segment_alloc();
-        MPIU_ERR_CHKANDJUMP1(req->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER,
-                             "**nomem", "**nomem %s", "MPID_Segment_alloc");
+        MPIU_ERR_CHKANDJUMP1(req->dev.segment_ptr == NULL, mpi_errno,
+                             MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
 
-        MPID_Segment_init(MPI_BOTTOM, 1, combined_dtp->handle, req->dev.segment_ptr, 0);
-        req->dev.segment_first = 0;
-        req->dev.segment_size = combined_dtp->size;
+        MPID_Segment_init(rma_op->origin_addr, rma_op->origin_count,
+                          rma_op->origin_datatype, req->dev.segment_ptr, 0);
+        req->dev.segment_first = stream_offset;
+        req->dev.segment_size = stream_offset + stream_size;
 
         req->dev.OnFinal = 0;
         req->dev.OnDataAvail = 0;
@@ -466,18 +362,12 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc,
         mpi_errno = vc->sendNoncontig_fn(vc, req, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
         MPIU_THREAD_CS_EXIT(CH3COMM, vc);
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-
-        MPIU_Free(ints);
-        MPIU_Free(displaces);
-        MPIU_Free(datatypes);
-
-        /* we're done with the datatypes */
-        if (origin_dtp != NULL)
-            MPID_Datatype_release(origin_dtp);
-        MPID_Datatype_release(target_dtp);
     }
 
   fn_exit:
+    /* release the target datatype */
+    if (target_dtp)
+        MPID_Datatype_release(target_dtp);
     (*req_ptr) = req;
 
     MPIDI_FUNC_EXIT(MPID_STATE_ISSUE_FROM_ORIGIN_BUFFER);
@@ -486,8 +376,11 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc,
     if (req) {
         if (req->dev.datatype_ptr)
             MPID_Datatype_release(req->dev.datatype_ptr);
+        if (req->dev.ext_hdr_ptr)
+            MPIU_Free(req->dev.ext_hdr_ptr);
         MPID_Request_release(req);
     }
+
     (*req_ptr) = NULL;
     goto fn_exit;
 }
@@ -505,6 +398,10 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     MPID_Comm *comm_ptr = win_ptr->comm_ptr;
     MPIDI_CH3_Pkt_put_t *put_pkt = &rma_op->pkt.put;
     MPID_Request *curr_req = NULL;
+    MPI_Datatype target_datatype;
+    MPID_Datatype *target_dtp_ptr = NULL;
+    MPIDI_CH3_Ext_pkt_put_derived_t *ext_hdr_ptr = NULL;
+    MPI_Aint ext_hdr_sz = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_PUT_OP);
 
@@ -524,8 +421,30 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     else {
         MPI_Aint origin_type_size;
         MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
-        mpi_errno = issue_from_origin_buffer(rma_op, vc, 0,
-                                             rma_op->origin_count * origin_type_size, &curr_req);
+
+        /* If derived datatype on target, add extended packet header. */
+        MPIDI_CH3_PKT_RMA_GET_TARGET_DATATYPE(rma_op->pkt, target_datatype, mpi_errno);
+        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
+            MPID_Datatype_get_ptr(target_datatype, target_dtp_ptr);
+
+            void *dataloop_ptr = NULL;
+
+            /* dataloop is behind of extended header on origin.
+             * TODO: support extended header array */
+            ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_put_derived_t) + target_dtp_ptr->dataloop_size;
+            ext_hdr_ptr = MPIU_Malloc(ext_hdr_sz);
+            if (!ext_hdr_ptr) {
+                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem",
+                                     "**nomem %s", "MPIDI_CH3_Ext_pkt_put_derived_t");
+            }
+
+            dataloop_ptr = (void *) ((char *) ext_hdr_ptr +
+                                     sizeof(MPIDI_CH3_Ext_pkt_put_derived_t));
+            fill_in_derived_dtp_info(&ext_hdr_ptr->dtype_info, dataloop_ptr, target_dtp_ptr);
+        }
+
+        mpi_errno = issue_from_origin_buffer(rma_op, vc, ext_hdr_ptr, ext_hdr_sz,
+                                             0, rma_op->origin_count * origin_type_size, &curr_req);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -567,6 +486,9 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     MPI_Aint total_len, rest_len;
     MPI_Aint origin_dtp_size;
     MPID_Datatype *origin_dtp_ptr = NULL;
+    MPID_Datatype *target_dtp_ptr = NULL;
+    void *ext_hdr_ptr = NULL;
+    MPI_Aint ext_hdr_sz = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_ACC_OP);
 
@@ -625,6 +547,10 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     if (stream_unit_count > 1)
         flags |= MPIDI_CH3_PKT_FLAG_RMA_STREAM;
 
+    /* Get target datatype */
+    if (!MPIR_DATATYPE_IS_PREDEFINED(accum_pkt->datatype))
+        MPID_Datatype_get_ptr(accum_pkt->datatype, target_dtp_ptr);
+
     rest_len = total_len;
     MPIU_Assert(rma_op->issued_stream_count >= 0);
     for (j = 0; j < stream_unit_count; j++) {
@@ -650,7 +576,11 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         stream_size = MPIR_MIN(stream_elem_count * predefined_dtp_size, rest_len);
         rest_len -= stream_size;
 
-        mpi_errno = issue_from_origin_buffer(rma_op, vc, stream_offset, stream_size, &curr_req);
+        /* Set extended packet header if needed. */
+        init_accum_ext_pkt(flags, target_dtp_ptr, stream_offset, &ext_hdr_ptr, &ext_hdr_sz);
+
+        mpi_errno = issue_from_origin_buffer(rma_op, vc, ext_hdr_ptr, ext_hdr_sz,
+                                             stream_offset, stream_size, &curr_req);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
 
@@ -723,6 +653,8 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     MPI_Aint total_len, rest_len;
     MPI_Aint target_dtp_size;
     MPID_Datatype *target_dtp_ptr = NULL;
+    void *ext_hdr_ptr = NULL;
+    MPI_Aint ext_hdr_sz = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_GET_ACC_OP);
 
@@ -870,20 +802,11 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         stream_size = MPIR_MIN(stream_elem_count * predefined_dtp_size, rest_len);
         rest_len -= stream_size;
 
-        if (flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
-            /* allocate extended packet header in request */
-            resp_req->dev.ext_hdr_ptr = MPIU_Malloc(sizeof(MPIDI_CH3_Ext_pkt_get_accum_t));
-            if (!resp_req->dev.ext_hdr_ptr) {
-                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                     "MPIDI_CH3_Ext_pkt_get_accum_t");
-            }
-            resp_req->dev.ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_get_accum_t);
-
-            ((MPIDI_CH3_Ext_pkt_get_accum_t *) resp_req->dev.ext_hdr_ptr)->stream_offset =
-                stream_offset;
-        }
+        /* Set extended packet header if needed. */
+        init_get_accum_ext_pkt(flags, target_dtp_ptr, stream_offset, &ext_hdr_ptr, &ext_hdr_sz);
 
-        mpi_errno = issue_from_origin_buffer(rma_op, vc, stream_offset, stream_size, &curr_req);
+        mpi_errno = issue_from_origin_buffer(rma_op, vc, ext_hdr_ptr, ext_hdr_sz,
+                                             stream_offset, stream_size, &curr_req);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
 
@@ -975,6 +898,8 @@ static int issue_get_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     MPI_Datatype target_datatype;
     MPID_Request *req = NULL;
     MPID_Request *curr_req = NULL;
+    MPIDI_CH3_Ext_pkt_get_derived_t *ext_hdr_ptr = NULL;
+    MPI_Aint ext_hdr_sz = 0;
     MPID_IOV iov[MPID_IOV_LIMIT];
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_GET_OP);
 
@@ -1020,30 +945,46 @@ static int issue_get_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         MPIU_THREAD_CS_EXIT(CH3COMM, vc);
     }
     else {
-        /* derived datatype on target. fill derived datatype info and
-         * send it along with get_pkt. */
+        /* derived datatype on target. */
         MPID_Datatype_get_ptr(target_datatype, dtp);
+        void *dataloop_ptr = NULL;
+
+        /* Set extended packet header.
+         * On origin, the dataloop is stored right after the extended header.
+         * TODO: support extended header array */
+        ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_get_derived_t) + dtp->dataloop_size;
+        ext_hdr_ptr = MPIU_Malloc(ext_hdr_sz);
+        if (!ext_hdr_ptr) {
+            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem",
+                                 "**nomem %s", "MPIDI_CH3_Ext_pkt_get_derived_t");
+        }
 
-        mpi_errno = fill_in_derived_dtp_info(rma_op, dtp);
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
+        dataloop_ptr = (void *) ((char *) ext_hdr_ptr + sizeof(MPIDI_CH3_Ext_pkt_get_derived_t));
+        fill_in_derived_dtp_info(&ext_hdr_ptr->dtype_info, dataloop_ptr, dtp);
 
         /* Set dataloop size in pkt header */
         MPIDI_CH3_PKT_RMA_SET_DATALOOP_SIZE(rma_op->pkt, dtp->dataloop_size, mpi_errno);
 
         iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_pkt;
         iov[0].MPID_IOV_LEN = sizeof(*get_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) & rma_op->dtype_info;
-        iov[1].MPID_IOV_LEN = sizeof(rma_op->dtype_info);
-        iov[2].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) rma_op->dataloop;
-        iov[2].MPID_IOV_LEN = dtp->dataloop_size;
+        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ext_hdr_ptr;
+        iov[1].MPID_IOV_LEN = ext_hdr_sz;
 
         MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, 3, &req);
+        mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, 2, &req);
         MPIU_THREAD_CS_EXIT(CH3COMM, vc);
 
         /* release the target datatype */
         MPID_Datatype_release(dtp);
+
+        /* If the send has already finished, free the extended header immediately.
+         * Otherwise, store its pointer in the request so it is freed with the request. */
+        if (req != NULL) {
+            req->dev.ext_hdr_ptr = ext_hdr_ptr;
+        }
+        else {
+            MPIU_Free(ext_hdr_ptr);
+        }
     }
 
     if (mpi_errno != MPI_SUCCESS) {
@@ -1183,7 +1124,8 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
     else {
         MPI_Aint origin_dtp_size;
         MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_dtp_size);
-        mpi_errno = issue_from_origin_buffer(rma_op, vc, 0, 1 * origin_dtp_size, &curr_req);
+        mpi_errno = issue_from_origin_buffer(rma_op, vc, NULL, 0,       /*ext_hdr_ptr, ext_hdr_sz */
+                                             0, 1 * origin_dtp_size, &curr_req);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 5737a98..1dba61b 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -66,7 +66,6 @@ static inline MPIDI_RMA_Op_t *MPIDI_CH3I_Win_op_alloc(MPID_Win * win_ptr)
         MPL_LL_DELETE(win_ptr->op_pool_head, win_ptr->op_pool_tail, e);
     }
 
-    e->dataloop = NULL;
     e->single_req = NULL;
     e->multi_reqs = NULL;
     e->reqs_size = 0;
@@ -90,11 +89,6 @@ static inline int MPIDI_CH3I_Win_op_free(MPID_Win * win_ptr, MPIDI_RMA_Op_t * e)
 {
     int mpi_errno = MPI_SUCCESS;
 
-    /* Check if we allocated a dataloop for this op (see send/recv_rma_msg) */
-    if (e->dataloop != NULL) {
-        MPIU_Free(e->dataloop);
-    }
-
     /* We enqueue elements to the right pool, so when they get freed
      * at window free time, they won't conflict with the global pool
      * or other windows */
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index f2ba4c3..cdb2897 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -23,23 +23,6 @@ enum MPIDI_RMA_Datatype {
  * a Request.
  */
 
-/* to send derived datatype across in RMA ops */
-typedef struct MPIDI_RMA_dtype_info {   /* for derived datatypes */
-    int is_contig;
-    MPI_Aint max_contig_blocks;
-    MPI_Aint size;
-    MPI_Aint extent;
-    MPI_Aint dataloop_size;     /* not needed because this info is sent in
-                                 * packet header. remove it after lock/unlock
-                                 * is implemented in the device */
-    void *dataloop;             /* pointer needed to update pointers
-                                 * within dataloop on remote side */
-    int dataloop_depth;
-    int basic_type;
-    MPI_Aint ub, lb, true_ub, true_lb;
-    int has_sticky_ub, has_sticky_lb;
-} MPIDI_RMA_dtype_info;
-
 typedef enum MPIDI_RMA_Pool_type {
     MPIDI_RMA_POOL_WIN = 6,
     MPIDI_RMA_POOL_GLOBAL = 7
@@ -66,9 +49,6 @@ typedef struct MPIDI_RMA_Op {
                                  * when reqs_size == 1, single_req is used;
                                  * when reqs_size > 1, multi_reqs is used. */
 
-    MPIDI_RMA_dtype_info dtype_info;
-    void *dataloop;
-
     int target_rank;
 
     MPIDI_CH3_Pkt_t pkt;
diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index 733400f..db59ad7 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -863,13 +863,60 @@ typedef union MPIDI_CH3_Pkt {
 #endif
 } MPIDI_CH3_Pkt_t;
 
-typedef struct MPIDI_CH3_Ext_pkt_accum {
+/* Extended header packet types */
+
+/* to send derived datatype across in RMA ops */
+typedef struct MPIDI_RMA_dtype_info {   /* for derived datatypes */
+    int is_contig;
+    MPI_Aint max_contig_blocks;
+    MPI_Aint size;
+    MPI_Aint extent;
+    MPI_Aint dataloop_size;     /* not needed because this info is sent in
+                                 * packet header. remove it after lock/unlock
+                                 * is implemented in the device */
+    void *dataloop;             /* pointer needed to update pointers
+                                 * within dataloop on remote side */
+    int dataloop_depth;
+    int basic_type;
+    MPI_Aint ub, lb, true_ub, true_lb;
+    int has_sticky_ub, has_sticky_lb;
+} MPIDI_RMA_dtype_info;
+
+typedef struct MPIDI_CH3_Ext_pkt_stream {
     MPI_Aint stream_offset;
-} MPIDI_CH3_Ext_pkt_accum_t;
-
-typedef struct MPIDI_CH3_Ext_pkt_get_accum {
+} MPIDI_CH3_Ext_pkt_stream_t;
+
+typedef struct MPIDI_CH3_Ext_pkt_derived {
+    MPIDI_RMA_dtype_info dtype_info;
+    /* Extended header for RMA ops whose target datatype is derived.
+     * It is followed by a variable-length dataloop.
+     * On the origin we allocate one buffer holding this header
+     * and the dataloop; on the target the dataloop is received
+     * into a separate buffer in order to avoid an extra copy. */
+} MPIDI_CH3_Ext_pkt_derived_t;
+
+typedef struct MPIDI_CH3_Ext_pkt_stream_derived {
     MPI_Aint stream_offset;
-} MPIDI_CH3_Ext_pkt_get_accum_t;
+    MPIDI_RMA_dtype_info dtype_info;
+    /* follow with variable-length dataloop. */
+} MPIDI_CH3_Ext_pkt_stream_derived_t;
+
+/* Note that since ACC and GET_ACC contain the same extended attributes,
+ * we use generic routines for them in some places (listed below).
+ * If OP-specific attributes are added in the future, handle them separately in:
+ *  1. origin issuing function
+ *  2. target packet handler
+ *  3. target data receive complete handler. */
+typedef MPIDI_CH3_Ext_pkt_stream_t MPIDI_CH3_Ext_pkt_accum_stream_t;
+typedef MPIDI_CH3_Ext_pkt_derived_t MPIDI_CH3_Ext_pkt_accum_derived_t;
+typedef MPIDI_CH3_Ext_pkt_stream_derived_t MPIDI_CH3_Ext_pkt_accum_stream_derived_t;
+
+typedef MPIDI_CH3_Ext_pkt_stream_t MPIDI_CH3_Ext_pkt_get_accum_stream_t;
+typedef MPIDI_CH3_Ext_pkt_derived_t MPIDI_CH3_Ext_pkt_get_accum_derived_t;
+typedef MPIDI_CH3_Ext_pkt_stream_derived_t MPIDI_CH3_Ext_pkt_get_accum_stream_derived_t;
+
+typedef MPIDI_CH3_Ext_pkt_derived_t MPIDI_CH3_Ext_pkt_put_derived_t;
+typedef MPIDI_CH3_Ext_pkt_derived_t MPIDI_CH3_Ext_pkt_get_derived_t;
 
 #if defined(MPID_USE_SEQUENCE_NUMBERS)
 typedef struct MPIDI_CH3_Pkt_send_container {
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index 9f57a89..7838e20 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -424,8 +424,7 @@ typedef struct MPIDI_Request {
     MPI_Op op;
     /* For accumulate, since data is first read into a tmp_buf */
     void *real_user_buf;
-    /* For derived datatypes at target */
-    struct MPIDI_RMA_dtype_info *dtype_info;
+    /* For derived datatypes at target. */
     void *dataloop;
     /* req. handle needed to implement derived datatype gets.
      * It also used for remembering user request of request-based RMA operations. */
@@ -436,7 +435,9 @@ typedef struct MPIDI_Request {
     struct MPIDI_RMA_Target_lock_entry *target_lock_queue_entry;
     MPI_Request resp_request_handle; /* Handle for get_accumulate response */
 
-    void *ext_hdr_ptr; /* pointer to extended packet header */
+    void *ext_hdr_ptr; /* Pointer to extended packet header.
+                        * It is allocated in RMA issuing/pkt_handler functions,
+                        * and freed when the request is released. */
     MPIDI_msg_sz_t ext_hdr_sz;
 
     MPIDI_REQUEST_SEQNUM
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index 0441565..1b88762 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -399,13 +399,15 @@ static inline int enqueue_lock_origin(MPID_Win * win_ptr, MPIDI_VC_t * vc,
             MPIU_Assert(pkt->type == MPIDI_CH3_PKT_ACCUMULATE ||
                         pkt->type == MPIDI_CH3_PKT_GET_ACCUM);
 
+            /* Only operations on basic datatypes may carry a piggybacked lock,
+             * so the extended header types for the derived case are not checked. */
             if (pkt->type == MPIDI_CH3_PKT_ACCUMULATE) {
-                recv_data_sz += sizeof(MPIDI_CH3_Ext_pkt_accum_t);
-                buf_size += sizeof(MPIDI_CH3_Ext_pkt_accum_t);
+                recv_data_sz += sizeof(MPIDI_CH3_Ext_pkt_accum_stream_t);
+                buf_size += sizeof(MPIDI_CH3_Ext_pkt_accum_stream_t);
             }
             else {
-                recv_data_sz += sizeof(MPIDI_CH3_Ext_pkt_get_accum_t);
-                buf_size += sizeof(MPIDI_CH3_Ext_pkt_get_accum_t);
+                recv_data_sz += sizeof(MPIDI_CH3_Ext_pkt_get_accum_stream_t);
+                buf_size += sizeof(MPIDI_CH3_Ext_pkt_get_accum_stream_t);
             }
         }
 
@@ -1157,4 +1159,28 @@ static inline int poke_progress_engine(void)
     goto fn_exit;
 }
 
+static inline void MPIDI_CH3_ExtPkt_Accum_get_stream(MPIDI_CH3_Pkt_flags_t flags,
+                                                     int is_derived_dt, void *ext_hdr_ptr,
+                                                     MPI_Aint * stream_offset)
+{
+    /* Read stream_offset from the ACC extended header; without the STREAM
+     * flag no offset is carried and *stream_offset is left as the caller set it. */
+    if (!(flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM))
+        return;
+    MPIU_Assert(ext_hdr_ptr != NULL);
+    /* The header layout depends on whether the target datatype is derived. */
+    *stream_offset = is_derived_dt ?
+        ((MPIDI_CH3_Ext_pkt_accum_stream_derived_t *) ext_hdr_ptr)->stream_offset :
+        ((MPIDI_CH3_Ext_pkt_accum_stream_t *) ext_hdr_ptr)->stream_offset;
+}
+
+static inline void MPIDI_CH3_ExtPkt_Gaccum_get_stream(MPIDI_CH3_Pkt_flags_t flags,
+                                                      int is_derived_dt, void *ext_hdr_ptr,
+                                                      MPI_Aint * stream_offset)
+{
+    /* GET_ACC shares ACC's extended header layout, so delegate to the ACC reader.
+     * No packet-match check: errors are reported at header init (origin) / receive (target). */
+    MPIDI_CH3_ExtPkt_Accum_get_stream(flags, is_derived_dt, ext_hdr_ptr, stream_offset);
+}
+
 #endif /* MPID_RMA_H_INCLUDED */
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index c9d4531..6bc7d87 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -7,7 +7,8 @@
 #include "mpidimpl.h"
 #include "mpidrma.h"
 
-static int create_derived_datatype(MPID_Request * rreq, MPID_Datatype ** dtp);
+static int create_derived_datatype(MPID_Request * req, MPIDI_RMA_dtype_info * dtype_info,
+                                   MPID_Datatype ** dtp);
 
 #undef FUNCNAME
 #define FUNCNAME MPIDI_CH3U_Handle_recv_req
@@ -177,12 +178,10 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq,
     predef_count = rreq->dev.recv_data_sz / predef_dtp_size;
     MPIU_Assert(predef_count > 0);
 
-    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
-        MPIU_Assert(rreq->dev.ext_hdr_ptr != NULL);
-        stream_offset = ((MPIDI_CH3_Ext_pkt_accum_t *) rreq->dev.ext_hdr_ptr)->stream_offset;
-    }
-    else
-        stream_offset = 0;
+    stream_offset = 0;
+    MPIDI_CH3_ExtPkt_Accum_get_stream(flags,
+                                      (!MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype)),
+                                      rreq->dev.ext_hdr_ptr, &stream_offset);
 
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
@@ -265,13 +264,10 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
     }
     MPIU_Assert(basic_type != MPI_DATATYPE_NULL);
 
-    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
-        MPIU_Assert(rreq->dev.ext_hdr_ptr != NULL);
-        stream_offset = ((MPIDI_CH3_Ext_pkt_get_accum_t *) rreq->dev.ext_hdr_ptr)->stream_offset;
-    }
-    else {
-        stream_offset = 0;
-    }
+    stream_offset = 0;
+    MPIDI_CH3_ExtPkt_Gaccum_get_stream(rreq->dev.flags,
+                                       (!MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype)),
+                                       rreq->dev.ext_hdr_ptr, &stream_offset);
 
     /* Use target data to calculate current stream unit size */
     MPID_Datatype_get_size_macro(rreq->dev.datatype, type_size);
@@ -537,12 +533,15 @@ int MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((unu
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Datatype *new_dtp = NULL;
+    MPIDI_RMA_dtype_info *dtype_info = NULL;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTDERIVEDDTRECVCOMPLETE);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTDERIVEDDTRECVCOMPLETE);
 
+    /* get data from extended header */
+    dtype_info = &((MPIDI_CH3_Ext_pkt_put_derived_t *) rreq->dev.ext_hdr_ptr)->dtype_info;
     /* create derived datatype */
-    create_derived_datatype(rreq, &new_dtp);
+    create_derived_datatype(rreq, dtype_info, &new_dtp);
 
     /* update request to get the data */
     MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_PUT_RECV);
@@ -550,9 +549,6 @@ int MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((unu
     rreq->dev.recv_data_sz = new_dtp->size * rreq->dev.user_count;
 
     rreq->dev.datatype_ptr = new_dtp;
-    /* this will cause the datatype to be freed when the
-     * request is freed. free dtype_info here. */
-    MPIU_Free(rreq->dev.dtype_info);
 
     rreq->dev.segment_ptr = MPID_Segment_alloc();
     MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem",
@@ -585,6 +581,7 @@ int MPIDI_CH3_ReqHandler_AccumMetadataRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((un
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Datatype *new_dtp = NULL;
+    MPIDI_RMA_dtype_info *dtype_info = NULL;
     MPI_Aint basic_type_extent, basic_type_size;
     MPI_Aint total_len, rest_len, stream_elem_count;
     MPI_Aint stream_offset;
@@ -594,16 +591,25 @@ int MPIDI_CH3_ReqHandler_AccumMetadataRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((un
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMMETADATARECVCOMPLETE);
 
-    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
-        MPIU_Assert(rreq->dev.ext_hdr_ptr != NULL);
-        stream_offset = ((MPIDI_CH3_Ext_pkt_accum_t *) rreq->dev.ext_hdr_ptr)->stream_offset;
-    }
-    else
-        stream_offset = 0;
+    stream_offset = 0;
+    MPIU_Assert(rreq->dev.ext_hdr_ptr != NULL);
 
     if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RECV_DERIVED_DT) {
+        /* get data from extended header */
+        if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
+            MPIDI_CH3_Ext_pkt_accum_stream_derived_t *ext_hdr = NULL;
+            ext_hdr = ((MPIDI_CH3_Ext_pkt_accum_stream_derived_t *) rreq->dev.ext_hdr_ptr);
+            stream_offset = ext_hdr->stream_offset;
+            dtype_info = &ext_hdr->dtype_info;
+        }
+        else {
+            MPIDI_CH3_Ext_pkt_accum_derived_t *ext_hdr = NULL;
+            ext_hdr = ((MPIDI_CH3_Ext_pkt_accum_derived_t *) rreq->dev.ext_hdr_ptr);
+            dtype_info = &ext_hdr->dtype_info;
+        }
+
         /* create derived datatype */
-        create_derived_datatype(rreq, &new_dtp);
+        create_derived_datatype(rreq, dtype_info, &new_dtp);
 
         /* update new request to get the data */
         MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_ACCUM_RECV);
@@ -611,9 +617,6 @@ int MPIDI_CH3_ReqHandler_AccumMetadataRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((un
         MPIU_Assert(rreq->dev.datatype == MPI_DATATYPE_NULL);
         rreq->dev.datatype = new_dtp->handle;
         rreq->dev.datatype_ptr = new_dtp;
-        /* this will cause the datatype to be freed when the
-         * request is freed. free dtype_info here. */
-        MPIU_Free(rreq->dev.dtype_info);
 
         type_size = new_dtp->size;
 
@@ -623,6 +626,13 @@ int MPIDI_CH3_ReqHandler_AccumMetadataRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((un
         MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RECV);
         MPIU_Assert(rreq->dev.datatype != MPI_DATATYPE_NULL);
 
+        /* get data from extended header */
+        if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
+            MPIDI_CH3_Ext_pkt_accum_stream_t *ext_hdr = NULL;
+            ext_hdr = ((MPIDI_CH3_Ext_pkt_accum_stream_t *) rreq->dev.ext_hdr_ptr);
+            stream_offset = ext_hdr->stream_offset;
+        }
+
         MPID_Datatype_get_size_macro(rreq->dev.datatype, type_size);
 
         basic_dtp = rreq->dev.datatype;
@@ -686,6 +696,7 @@ int MPIDI_CH3_ReqHandler_GaccumMetadataRecvComplete(MPIDI_VC_t * vc,
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Datatype *new_dtp = NULL;
+    MPIDI_RMA_dtype_info *dtype_info = NULL;
     MPI_Aint basic_type_extent, basic_type_size;
     MPI_Aint total_len, rest_len, stream_elem_count;
     MPI_Aint stream_offset;
@@ -701,16 +712,25 @@ int MPIDI_CH3_ReqHandler_GaccumMetadataRecvComplete(MPIDI_VC_t * vc,
         is_empty_origin = TRUE;
     }
 
-    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
-        MPIU_Assert(rreq->dev.ext_hdr_ptr != NULL);
-        stream_offset = ((MPIDI_CH3_Ext_pkt_get_accum_t *) rreq->dev.ext_hdr_ptr)->stream_offset;
-    }
-    else
-        stream_offset = 0;
+    stream_offset = 0;
+    MPIU_Assert(rreq->dev.ext_hdr_ptr != NULL);
 
     if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RECV_DERIVED_DT) {
+        /* get data from extended header */
+        if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
+            MPIDI_CH3_Ext_pkt_get_accum_stream_derived_t *ext_hdr = NULL;
+            ext_hdr = ((MPIDI_CH3_Ext_pkt_get_accum_stream_derived_t *) rreq->dev.ext_hdr_ptr);
+            stream_offset = ext_hdr->stream_offset;
+            dtype_info = &ext_hdr->dtype_info;
+        }
+        else {
+            MPIDI_CH3_Ext_pkt_get_accum_derived_t *ext_hdr = NULL;
+            ext_hdr = ((MPIDI_CH3_Ext_pkt_get_accum_derived_t *) rreq->dev.ext_hdr_ptr);
+            dtype_info = &ext_hdr->dtype_info;
+        }
+
         /* create derived datatype */
-        create_derived_datatype(rreq, &new_dtp);
+        create_derived_datatype(rreq, dtype_info, &new_dtp);
 
         /* update new request to get the data */
         MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_GET_ACCUM_RECV);
@@ -718,9 +738,6 @@ int MPIDI_CH3_ReqHandler_GaccumMetadataRecvComplete(MPIDI_VC_t * vc,
         MPIU_Assert(rreq->dev.datatype == MPI_DATATYPE_NULL);
         rreq->dev.datatype = new_dtp->handle;
         rreq->dev.datatype_ptr = new_dtp;
-        /* this will cause the datatype to be freed when the
-         * request is freed. free dtype_info here. */
-        MPIU_Free(rreq->dev.dtype_info);
 
         type_size = new_dtp->size;
 
@@ -730,6 +747,13 @@ int MPIDI_CH3_ReqHandler_GaccumMetadataRecvComplete(MPIDI_VC_t * vc,
         MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RECV);
         MPIU_Assert(rreq->dev.datatype != MPI_DATATYPE_NULL);
 
+        /* get data from extended header */
+        if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
+            MPIDI_CH3_Ext_pkt_get_accum_stream_t *ext_hdr = NULL;
+            ext_hdr = ((MPIDI_CH3_Ext_pkt_get_accum_stream_t *) rreq->dev.ext_hdr_ptr);
+            stream_offset = ext_hdr->stream_offset;
+        }
+
         MPID_Datatype_get_size_macro(rreq->dev.datatype, type_size);
 
         basic_dtp = rreq->dev.datatype;
@@ -805,6 +829,7 @@ int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete(MPIDI_VC_t * vc,
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Datatype *new_dtp = NULL;
+    MPIDI_RMA_dtype_info *dtype_info = NULL;
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_get_resp_t *get_resp_pkt = &upkt.get_resp;
     MPID_Request *sreq;
@@ -817,9 +842,10 @@ int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete(MPIDI_VC_t * vc,
 
     MPIU_Assert(!(rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP));
 
+    /* get data from extended header */
+    dtype_info = &((MPIDI_CH3_Ext_pkt_get_derived_t *) rreq->dev.ext_hdr_ptr)->dtype_info;
     /* create derived datatype */
-    create_derived_datatype(rreq, &new_dtp);
-    MPIU_Free(rreq->dev.dtype_info);
+    create_derived_datatype(rreq, dtype_info, &new_dtp);
 
     /* create request for sending data */
     sreq = MPID_Request_create();
@@ -999,9 +1025,9 @@ int MPIDI_CH3_ReqHandler_ReloadIOV(MPIDI_VC_t * vc ATTRIBUTE((unused)),
 #define FUNCNAME create_derived_datatype
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int create_derived_datatype(MPID_Request * req, MPID_Datatype ** dtp)
+static int create_derived_datatype(MPID_Request * req, MPIDI_RMA_dtype_info * dtype_info,
+                                   MPID_Datatype ** dtp)
 {
-    MPIDI_RMA_dtype_info *dtype_info;
     MPID_Datatype *new_dtp;
     int mpi_errno = MPI_SUCCESS;
     MPI_Aint ptrdiff;
@@ -1009,8 +1035,6 @@ static int create_derived_datatype(MPID_Request * req, MPID_Datatype ** dtp)
 
     MPIDI_FUNC_ENTER(MPID_STATE_CREATE_DERIVED_DATATYPE);
 
-    dtype_info = req->dev.dtype_info;
-
     /* allocate new datatype object and handle */
     new_dtp = (MPID_Datatype *) MPIU_Handle_obj_alloc(&MPID_Datatype_mem);
     if (!new_dtp) {
@@ -1951,10 +1975,12 @@ int MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete(MPIDI_VC_t * vc,
 
             int ext_hdr_sz;
 
+            /* Only operations on basic datatypes may carry a piggybacked lock,
+             * so the extended header types for the derived case are not checked. */
             if (target_lock_queue_entry->pkt.type == MPIDI_CH3_PKT_ACCUMULATE)
-                ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_accum_t);
+                ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_accum_stream_t);
             else
-                ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_get_accum_t);
+                ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_get_accum_stream_t);
 
             /* here we drop the stream_offset received, because the stream unit that piggybacked with
              * LOCK must be the first stream unit, with stream_offset equals to 0. */
diff --git a/src/mpid/ch3/src/ch3u_request.c b/src/mpid/ch3/src/ch3u_request.c
index d62ae80..3400fab 100644
--- a/src/mpid/ch3/src/ch3u_request.c
+++ b/src/mpid/ch3/src/ch3u_request.c
@@ -84,7 +84,6 @@ MPID_Request * MPID_Request_create(void)
 	req->dev.target_win_handle = MPI_WIN_NULL;
 	req->dev.source_win_handle = MPI_WIN_NULL;
         req->dev.target_lock_queue_entry = NULL;
-	req->dev.dtype_info	   = NULL;
 	req->dev.dataloop	   = NULL;
 	req->dev.iov_offset        = 0;
         req->dev.flags             = MPIDI_CH3_PKT_FLAG_NONE;
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 2c9cb33..033b999 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -174,6 +174,78 @@ void MPIDI_CH3_RMA_Init_pkthandler_pvars(void)
                                       "RMA", "RMA:PKTHANDLER for Decr-At-Cnt (in seconds)");
 }
 
+/* =========================================================== */
+/*                  extended packet functions                  */
+/* =========================================================== */
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3_ExtPktHandler_Accumulate
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int MPIDI_CH3_ExtPktHandler_Accumulate(MPIDI_CH3_Pkt_flags_t flags,
+                                              int is_derived_dt, void **ext_hdr_ptr,
+                                              MPI_Aint * ext_hdr_sz)
+{
+    int mpi_errno = MPI_SUCCESS;
+    /* Allocate the ACC extended header selected by the STREAM flag and is_derived_dt. */
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_EXTPKTHANDLER_ACCUMULATE);
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_EXTPKTHANDLER_ACCUMULATE);
+    /* If neither applies, *ext_hdr_ptr and *ext_hdr_sz are left as the caller passed them. */
+    if ((flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) && is_derived_dt) {
+        (*ext_hdr_sz) = sizeof(MPIDI_CH3_Ext_pkt_accum_stream_derived_t);
+        (*ext_hdr_ptr) = MPIU_Malloc(sizeof(MPIDI_CH3_Ext_pkt_accum_stream_derived_t));
+        if ((*ext_hdr_ptr) == NULL) {
+            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem",
+                                 "**nomem %s", "MPIDI_CH3_Ext_pkt_accum_stream_derived_t");
+        }
+    }
+    else if (flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
+        (*ext_hdr_sz) = sizeof(MPIDI_CH3_Ext_pkt_accum_stream_t);
+        (*ext_hdr_ptr) = MPIU_Malloc(sizeof(MPIDI_CH3_Ext_pkt_accum_stream_t));
+        if ((*ext_hdr_ptr) == NULL) {
+            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem",
+                                 "**nomem %s", "MPIDI_CH3_Ext_pkt_accum_stream_t");
+        }
+    }
+    else if (is_derived_dt) {
+        (*ext_hdr_sz) = sizeof(MPIDI_CH3_Ext_pkt_accum_derived_t);
+        (*ext_hdr_ptr) = MPIU_Malloc(sizeof(MPIDI_CH3_Ext_pkt_accum_derived_t));
+        if ((*ext_hdr_ptr) == NULL) {
+            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem",
+                                 "**nomem %s", "MPIDI_CH3_Ext_pkt_accum_derived_t");
+        }
+    }
+
+  fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_EXTPKTHANDLER_ACCUMULATE);
+    return mpi_errno;
+  fn_fail:     /* presumably reached via the **nomem jumps above; resets both outputs */
+    if ((*ext_hdr_ptr) != NULL)
+        MPIU_Free((*ext_hdr_ptr));
+    (*ext_hdr_ptr) = NULL;
+    (*ext_hdr_sz) = 0;
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3_ExtPktHandler_GetAccumulate
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int MPIDI_CH3_ExtPktHandler_GetAccumulate(MPIDI_CH3_Pkt_flags_t flags,
+                                                 int is_derived_dt, void **ext_hdr_ptr,
+                                                 MPI_Aint * ext_hdr_sz)
+{
+    /* GET_ACC delegates to the ACC routine below, which is only valid while both
+     * operations keep identically sized extended headers; verify that first. */
+    MPIU_Assert(sizeof(MPIDI_CH3_Ext_pkt_get_accum_stream_t) ==
+                sizeof(MPIDI_CH3_Ext_pkt_accum_stream_t));
+    MPIU_Assert(sizeof(MPIDI_CH3_Ext_pkt_get_accum_derived_t) ==
+                sizeof(MPIDI_CH3_Ext_pkt_accum_derived_t));
+    MPIU_Assert(sizeof(MPIDI_CH3_Ext_pkt_get_accum_stream_derived_t) ==
+                sizeof(MPIDI_CH3_Ext_pkt_accum_stream_derived_t));
+    return MPIDI_CH3_ExtPktHandler_Accumulate(flags, is_derived_dt, ext_hdr_ptr, ext_hdr_sz);
+}
+
 /* ------------------------------------------------------------------------ */
 /*
  * The following routines are the packet handlers for the packet types
@@ -286,13 +358,17 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PUT_RECV_DERIVED_DT);
             req->dev.datatype = MPI_DATATYPE_NULL;
 
-            req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
-                MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
-            if (!req->dev.dtype_info) {
+            /* allocate extended header in the request,
+             * only including fixed-length variables defined in packet type. */
+            req->dev.ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_put_derived_t);
+            req->dev.ext_hdr_ptr = MPIU_Malloc(req->dev.ext_hdr_sz);
+            if (!req->dev.ext_hdr_ptr) {
                 MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                     "MPIDI_RMA_dtype_info");
+                                     "MPIDI_CH3_Ext_pkt_put_derived_t");
             }
 
+            /* Put the dataloop in a separate buffer so the datatype can reuse it.
+             * It will be freed when the datatype is freed. */
             req->dev.dataloop = MPIU_Malloc(put_pkt->info.dataloop_size);
             if (!req->dev.dataloop) {
                 MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
@@ -302,14 +378,13 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             /* if we received all of the dtype_info and dataloop, copy it
              * now and call the handler, otherwise set the iov and let the
              * channel copy it */
-            if (data_len >= sizeof(MPIDI_RMA_dtype_info) + put_pkt->info.dataloop_size) {
-                /* copy all of dtype_info and dataloop */
-                MPIU_Memcpy(req->dev.dtype_info, data_buf, sizeof(MPIDI_RMA_dtype_info));
-                MPIU_Memcpy(req->dev.dataloop, data_buf + sizeof(MPIDI_RMA_dtype_info),
+            if (data_len >= req->dev.ext_hdr_sz + put_pkt->info.dataloop_size) {
+                /* Copy extended header */
+                MPIU_Memcpy(req->dev.ext_hdr_ptr, data_buf, req->dev.ext_hdr_sz);
+                MPIU_Memcpy(req->dev.dataloop, data_buf + req->dev.ext_hdr_sz,
                             put_pkt->info.dataloop_size);
 
-                *buflen =
-                    sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) +
+                *buflen = sizeof(MPIDI_CH3_Pkt_t) + req->dev.ext_hdr_sz +
                     put_pkt->info.dataloop_size;
 
                 /* All dtype data has been received, call req handler */
@@ -322,8 +397,8 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                 }
             }
             else {
-                req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) req->dev.dtype_info);
-                req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
+                req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) req->dev.ext_hdr_ptr);
+                req->dev.iov[0].MPID_IOV_LEN = req->dev.ext_hdr_sz;
                 req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
                 req->dev.iov[1].MPID_IOV_LEN = put_pkt->info.dataloop_size;
                 req->dev.iov_count = 2;
@@ -512,13 +587,17 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         req->dev.datatype = MPI_DATATYPE_NULL;
         req->dev.request_handle = get_pkt->request_handle;
 
-        req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
-            MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
-        if (!req->dev.dtype_info) {
+        /* allocate extended header in the request,
+         * only including fixed-length variables defined in packet type. */
+        req->dev.ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_get_derived_t);
+        req->dev.ext_hdr_ptr = MPIU_Malloc(req->dev.ext_hdr_sz);
+        if (!req->dev.ext_hdr_ptr) {
             MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                 "MPIDI_RMA_dtype_info");
+                                 "MPIDI_CH3_Ext_pkt_get_derived_t");
         }
 
+        /* put dataloop in a separate buffer to be reused in datatype.
+         * It will be freed when the datatype is freed. */
         req->dev.dataloop = MPIU_Malloc(get_pkt->info.dataloop_size);
         if (!req->dev.dataloop) {
             MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
@@ -528,15 +607,13 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         /* if we received all of the dtype_info and dataloop, copy it
          * now and call the handler, otherwise set the iov and let the
          * channel copy it */
-        if (data_len >= sizeof(MPIDI_RMA_dtype_info) + get_pkt->info.dataloop_size) {
-            /* copy all of dtype_info and dataloop */
-            MPIU_Memcpy(req->dev.dtype_info, data_buf, sizeof(MPIDI_RMA_dtype_info));
-            MPIU_Memcpy(req->dev.dataloop, data_buf + sizeof(MPIDI_RMA_dtype_info),
+        if (data_len >= req->dev.ext_hdr_sz + get_pkt->info.dataloop_size) {
+            /* Copy extended header */
+            MPIU_Memcpy(req->dev.ext_hdr_ptr, data_buf, req->dev.ext_hdr_sz);
+            MPIU_Memcpy(req->dev.dataloop, data_buf + req->dev.ext_hdr_sz,
                         get_pkt->info.dataloop_size);
 
-            *buflen =
-                sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) +
-                get_pkt->info.dataloop_size;
+            *buflen = sizeof(MPIDI_CH3_Pkt_t) + req->dev.ext_hdr_sz + get_pkt->info.dataloop_size;
 
             /* All dtype data has been received, call req handler */
             mpi_errno = MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete(vc, req, &complete);
@@ -546,8 +623,8 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                 *rreqp = NULL;
         }
         else {
-            req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dtype_info;
-            req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
+            req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) req->dev.ext_hdr_ptr);
+            req->dev.iov[0].MPID_IOV_LEN = req->dev.ext_hdr_sz;
             req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
             req->dev.iov[1].MPID_IOV_LEN = get_pkt->info.dataloop_size;
             req->dev.iov_count = 2;
@@ -647,15 +724,14 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
         data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
-        if (req->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
-            /* allocate extended header in the request */
-            req->dev.ext_hdr_ptr = MPIU_Malloc(sizeof(MPIDI_CH3_Ext_pkt_accum_t));
-            if (!req->dev.ext_hdr_ptr) {
-                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                     "MPIDI_CH3_Ext_pkt_accum_t");
-            }
-            req->dev.ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_accum_t);
-        }
+        /* allocate extended header in the request,
+         * only including fixed-length variables defined in packet type. */
+        mpi_errno = MPIDI_CH3_ExtPktHandler_Accumulate(req->dev.flags,
+                                                       (!MPIR_DATATYPE_IS_PREDEFINED
+                                                        (accum_pkt->datatype)),
+                                                       &req->dev.ext_hdr_ptr, &req->dev.ext_hdr_sz);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         if (MPIR_DATATYPE_IS_PREDEFINED(accum_pkt->datatype)) {
             MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RECV);
@@ -719,52 +795,28 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             }
         }
         else {
-            int metadata_sz = 0;
-
             MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RECV_DERIVED_DT);
             req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumMetadataRecvComplete;
             req->dev.datatype = MPI_DATATYPE_NULL;
 
-            if (accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM)
-                metadata_sz += sizeof(MPIDI_CH3_Ext_pkt_accum_t);
-
-            req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
-                MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
-            if (!req->dev.dtype_info) {
-                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                     "MPIDI_RMA_dtype_info");
-            }
-            metadata_sz += sizeof(MPIDI_RMA_dtype_info);
-
+            /* Put dataloop in a separate buffer to be reused in datatype.
+             * It will be freed when the datatype is freed. */
             req->dev.dataloop = MPIU_Malloc(accum_pkt->info.dataloop_size);
             if (!req->dev.dataloop) {
                 MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
                                      accum_pkt->info.dataloop_size);
             }
-            metadata_sz += accum_pkt->info.dataloop_size;
-
-            if (data_len >= metadata_sz) {
-                int buf_offset = 0;
-
-                if (accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
-                    /* copy extended header */
-                    MPIU_Memcpy(req->dev.ext_hdr_ptr, data_buf + buf_offset,
-                                sizeof(MPIDI_CH3_Ext_pkt_accum_t));
-                    buf_offset += sizeof(MPIDI_CH3_Ext_pkt_accum_t);
-                }
-
-                /* copy all of dtype_info and dataloop */
-                MPIU_Memcpy(req->dev.dtype_info, data_buf + buf_offset,
-                            sizeof(MPIDI_RMA_dtype_info));
-                buf_offset += sizeof(MPIDI_RMA_dtype_info);
 
-                MPIU_Memcpy(req->dev.dataloop, data_buf + buf_offset,
+            if (data_len >= req->dev.ext_hdr_sz + accum_pkt->info.dataloop_size) {
+                /* Copy extended header */
+                MPIU_Memcpy(req->dev.ext_hdr_ptr, data_buf, req->dev.ext_hdr_sz);
+                MPIU_Memcpy(req->dev.dataloop, data_buf + req->dev.ext_hdr_sz,
                             accum_pkt->info.dataloop_size);
-                buf_offset += accum_pkt->info.dataloop_size;
 
-                *buflen = sizeof(MPIDI_CH3_Pkt_t) + metadata_sz;
+                *buflen = sizeof(MPIDI_CH3_Pkt_t) + req->dev.ext_hdr_sz +
+                    accum_pkt->info.dataloop_size;
 
-                /* All dtype data has been received, call req handler */
+                /* All extended data has been received, call req handler */
                 mpi_errno = MPIDI_CH3_ReqHandler_AccumMetadataRecvComplete(vc, req, &complete);
                 MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
                                      "**ch3|postrecv %s", "MPIDI_CH3_ACCUMULATE");
@@ -774,23 +826,14 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                 }
             }
             else {
-                int iov_n = 0;
-
-                if (accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
-                    req->dev.iov[iov_n].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.ext_hdr_ptr;
-                    req->dev.iov[iov_n].MPID_IOV_LEN = req->dev.ext_hdr_sz;
-                    iov_n++;
-                }
-
-                req->dev.iov[iov_n].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dtype_info;
-                req->dev.iov[iov_n].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
-                iov_n++;
-
-                req->dev.iov[iov_n].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
-                req->dev.iov[iov_n].MPID_IOV_LEN = accum_pkt->info.dataloop_size;
-                iov_n++;
+                /* Prepare to receive extended header.
+                 * All variable-length data can be received in separate iovs. */
+                req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.ext_hdr_ptr;
+                req->dev.iov[0].MPID_IOV_LEN = req->dev.ext_hdr_sz;
+                req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
+                req->dev.iov[1].MPID_IOV_LEN = accum_pkt->info.dataloop_size;
+                req->dev.iov_count = 2;
 
-                req->dev.iov_count = iov_n;
                 *buflen = sizeof(MPIDI_CH3_Pkt_t);
             }
 
@@ -934,6 +977,8 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             MPIU_ERR_POP(mpi_errno);
     }
     else {
+        int is_derived_dt = 0;
+
         MPIU_Assert(pkt->type == MPIDI_CH3_PKT_GET_ACCUM);
 
         req = MPID_Request_create();
@@ -952,15 +997,14 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
         data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
-        if (req->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
-            /* allocate extended header in the request */
-            req->dev.ext_hdr_ptr = MPIU_Malloc(sizeof(MPIDI_CH3_Ext_pkt_get_accum_t));
-            if (!req->dev.ext_hdr_ptr) {
-                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                     "MPIDI_CH3_Ext_pkt_get_accum_t");
-            }
-            req->dev.ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_get_accum_t);
-        }
+        /* allocate extended header in the request,
+         * only including fixed-length variables defined in packet type. */
+        is_derived_dt = !MPIR_DATATYPE_IS_PREDEFINED(get_accum_pkt->datatype);
+        mpi_errno = MPIDI_CH3_ExtPktHandler_GetAccumulate(req->dev.flags, is_derived_dt,
+                                                          &req->dev.ext_hdr_ptr,
+                                                          &req->dev.ext_hdr_sz);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         if (MPIR_DATATYPE_IS_PREDEFINED(get_accum_pkt->datatype)) {
             MPI_Aint type_size;
@@ -1039,50 +1083,26 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             }
         }
         else {
-            int metadata_sz = 0;
-
             MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_ACCUM_RECV_DERIVED_DT);
             req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumMetadataRecvComplete;
             req->dev.datatype = MPI_DATATYPE_NULL;
 
-            if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM)
-                metadata_sz += sizeof(MPIDI_CH3_Ext_pkt_get_accum_t);
-
-            req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
-                MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
-            if (!req->dev.dtype_info) {
-                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                     "MPIDI_RMA_dtype_info");
-            }
-            metadata_sz += sizeof(MPIDI_RMA_dtype_info);
-
+            /* Put dataloop in a separate buffer to be reused in datatype.
+             * It will be freed when the datatype is freed. */
             req->dev.dataloop = MPIU_Malloc(get_accum_pkt->info.dataloop_size);
             if (!req->dev.dataloop) {
                 MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
                                      get_accum_pkt->info.dataloop_size);
             }
-            metadata_sz += get_accum_pkt->info.dataloop_size;
-
-            if (data_len >= metadata_sz) {
-                int buf_offset = 0;
-
-                if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
-                    /* copy extended header */
-                    MPIU_Memcpy(req->dev.ext_hdr_ptr, data_buf + buf_offset,
-                                sizeof(MPIDI_CH3_Ext_pkt_get_accum_t));
-                    buf_offset += sizeof(MPIDI_CH3_Ext_pkt_get_accum_t);
-                }
 
-                /* copy all of dtype_info and dataloop */
-                MPIU_Memcpy(req->dev.dtype_info, data_buf + buf_offset,
-                            sizeof(MPIDI_RMA_dtype_info));
-                buf_offset += sizeof(MPIDI_RMA_dtype_info);
-
-                MPIU_Memcpy(req->dev.dataloop, data_buf + buf_offset,
+            if (data_len >= req->dev.ext_hdr_sz + get_accum_pkt->info.dataloop_size) {
+                /* Copy extended header */
+                MPIU_Memcpy(req->dev.ext_hdr_ptr, data_buf, req->dev.ext_hdr_sz);
+                MPIU_Memcpy(req->dev.dataloop, data_buf + req->dev.ext_hdr_sz,
                             get_accum_pkt->info.dataloop_size);
-                buf_offset += get_accum_pkt->info.dataloop_size;
 
-                *buflen = sizeof(MPIDI_CH3_Pkt_t) + metadata_sz;
+                *buflen = sizeof(MPIDI_CH3_Pkt_t) + req->dev.ext_hdr_sz +
+                    get_accum_pkt->info.dataloop_size;
 
                 /* All dtype data has been received, call req handler */
                 mpi_errno = MPIDI_CH3_ReqHandler_GaccumMetadataRecvComplete(vc, req, &complete);
@@ -1094,23 +1114,14 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                 }
             }
             else {
-                int iov_n = 0;
-
-                if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM) {
-                    req->dev.iov[iov_n].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.ext_hdr_ptr;
-                    req->dev.iov[iov_n].MPID_IOV_LEN = req->dev.ext_hdr_sz;
-                    iov_n++;
-                }
-
-                req->dev.iov[iov_n].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dtype_info;
-                req->dev.iov[iov_n].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
-                iov_n++;
-
-                req->dev.iov[iov_n].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
-                req->dev.iov[iov_n].MPID_IOV_LEN = get_accum_pkt->info.dataloop_size;
-                iov_n++;
+                /* Prepare to receive extended header.
+                 * All variable-length data can be received in separate iovs. */
+                req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.ext_hdr_ptr;
+                req->dev.iov[0].MPID_IOV_LEN = req->dev.ext_hdr_sz;
+                req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
+                req->dev.iov[1].MPID_IOV_LEN = get_accum_pkt->info.dataloop_size;
+                req->dev.iov_count = 2;
 
-                req->dev.iov_count = iov_n;
                 *buflen = sizeof(MPIDI_CH3_Pkt_t);
             }
 
@@ -1657,9 +1668,10 @@ int MPIDI_CH3_PktHandler_Get_AccumResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         MPID_Datatype_get_extent_macro(basic_type, basic_type_extent);
         MPID_Datatype_get_size_macro(basic_type, basic_type_size);
 
-        if (req->dev.ext_hdr_ptr != NULL)
-            contig_stream_offset =
-                ((MPIDI_CH3_Ext_pkt_get_accum_t *) req->dev.ext_hdr_ptr)->stream_offset;
+        /* get stream_offset from extended header */
+        MPIDI_CH3_ExtPkt_Gaccum_get_stream(req->dev.flags,
+                                           (!MPIR_DATATYPE_IS_PREDEFINED(req->dev.datatype)),
+                                           req->dev.ext_hdr_ptr, &contig_stream_offset);
 
         total_len = type_size * req->dev.user_count;
         rest_len = total_len - contig_stream_offset;

http://git.mpich.org/mpich.git/commitdiff/f49534e19e7fa40956e45fdcbd4933d66563332e

commit f49534e19e7fa40956e45fdcbd4933d66563332e
Author: Min Si <msi at il.is.s.u-tokyo.ac.jp>
Date:   Tue Jun 23 13:09:28 2015 -0500

    Bugfix: free internal req object in fn_fail.
    
    In the fn_fail path, we should free the internal req object directly,
    since req_ptr is set to point to that req object only at fn_exit.
    
    Signed-off-by: Xin Zhao <xinzhao3 at illinois.edu>
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index 5a0430e..2172e7e 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -483,10 +483,10 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc,
     MPIDI_FUNC_EXIT(MPID_STATE_ISSUE_FROM_ORIGIN_BUFFER);
     return mpi_errno;
   fn_fail:
-    if ((*req_ptr)) {
-        if ((*req_ptr)->dev.datatype_ptr)
-            MPID_Datatype_release((*req_ptr)->dev.datatype_ptr);
-        MPID_Request_release((*req_ptr));
+    if (req) {
+        if (req->dev.datatype_ptr)
+            MPID_Datatype_release(req->dev.datatype_ptr);
+        MPID_Request_release(req);
     }
     (*req_ptr) = NULL;
     goto fn_exit;

http://git.mpich.org/mpich.git/commitdiff/5f54498761a43df407aa4e9bfff9f32c6a47960e

commit 5f54498761a43df407aa4e9bfff9f32c6a47960e
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Fri Jun 26 10:03:39 2015 -0500

    Modify pkt/flag/function name "FLUSH_ACK" to "ACK".
    
    Because we use the same ACK packet type/flag to acknowledge
    FLUSH/UNLOCK/DECR_AT_CNT packets, it is misleading to call
    that packet/flag "FLUSH_ACK". This patch renames the related packet,
    flag and function names from "FLUSH_ACK" to "ACK".
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index ddcc336..c0e5ce0 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -1890,8 +1890,8 @@ int MPIDI_CH3_PktHandler_Unlock( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
                                  MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_Flush( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
                                 MPIDI_msg_sz_t *, MPID_Request ** );
-int MPIDI_CH3_PktHandler_FlushAck( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
-				    MPIDI_msg_sz_t *, MPID_Request ** );
+int MPIDI_CH3_PktHandler_Ack( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
+                              MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_DecrAtCnt( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
                                     MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_FlowCntlUpdate( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index d2ca537..733400f 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -97,7 +97,7 @@ typedef enum {
     MPIDI_CH3_PKT_LOCK_OP_ACK,
     MPIDI_CH3_PKT_UNLOCK,
     MPIDI_CH3_PKT_FLUSH,
-    MPIDI_CH3_PKT_FLUSH_ACK,
+    MPIDI_CH3_PKT_ACK,  /* ACK packet for FLUSH, UNLOCK, DECR_AT_COUNTER */
     MPIDI_CH3_PKT_DECR_AT_COUNTER,
     /* RMA Packets end here */
     MPIDI_CH3_PKT_FLOW_CNTL_UPDATE,     /* FIXME: Unused */
@@ -123,7 +123,7 @@ typedef enum {
     MPIDI_CH3_PKT_FLAG_RMA_REQ_ACK = 16,
     MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER = 32,
     MPIDI_CH3_PKT_FLAG_RMA_NOCHECK = 64,
-    MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK = 128,
+    MPIDI_CH3_PKT_FLAG_RMA_ACK = 128,
     MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED = 256,
     MPIDI_CH3_PKT_FLAG_RMA_LOCK_QUEUED_DATA_QUEUED = 512,
     MPIDI_CH3_PKT_FLAG_RMA_LOCK_QUEUED_DATA_DISCARDED = 1024,
@@ -459,7 +459,7 @@ MPIDI_CH3_PKT_DEFS
            packets (PUT, GET, ACC, GACC, CAS, FOP), RMA operation       \
            response packets (GET_RESP, GACC_RESP, CAS_RESP, FOP_RESP),  \
            RMA control packets (LOCK, UNLOCK, FLUSH), and RMA control   \
-           response packets (LOCK_ACK, LOCK_OP_ACK, FLUSH_ACK). */      \
+           response packets (LOCK_ACK, LOCK_OP_ACK, ACK). */            \
         err_ = MPI_SUCCESS;                                             \
         switch((pkt_).type) {                                           \
         case (MPIDI_CH3_PKT_PUT):                                       \
@@ -485,8 +485,8 @@ MPIDI_CH3_PKT_DEFS
         case (MPIDI_CH3_PKT_LOCK_OP_ACK):                               \
             win_hdl_ = (pkt_).lock_op_ack.source_win_handle;            \
             break;                                                      \
-        case (MPIDI_CH3_PKT_FLUSH_ACK):                                 \
-            win_hdl_ = (pkt_).flush_ack.source_win_handle;              \
+        case (MPIDI_CH3_PKT_ACK):                                       \
+            win_hdl_ = (pkt_).ack.source_win_handle;                    \
             break;                                                      \
         default:                                                        \
             MPIU_ERR_SETANDJUMP1(err_, MPI_ERR_OTHER, "**invalidpkt", "**invalidpkt %d", (pkt_).type); \
@@ -798,12 +798,15 @@ typedef struct MPIDI_CH3_Pkt_lock_op_ack {
     int target_rank;
 } MPIDI_CH3_Pkt_lock_op_ack_t;
 
-typedef struct MPIDI_CH3_Pkt_flush_ack {
+/* This ACK packet is the acknowledgement
+ * for FLUSH, UNLOCK and DECR_AT_COUNTER
+ * packets */
+typedef struct MPIDI_CH3_Pkt_ack {
     MPIDI_CH3_Pkt_type_t type;
     MPI_Win source_win_handle;
     int target_rank;
     MPIDI_CH3_Pkt_flags_t flags;
-} MPIDI_CH3_Pkt_flush_ack_t;
+} MPIDI_CH3_Pkt_ack_t;
 
 typedef struct MPIDI_CH3_Pkt_decr_at_counter {
     MPIDI_CH3_Pkt_type_t type;
@@ -846,7 +849,7 @@ typedef union MPIDI_CH3_Pkt {
     MPIDI_CH3_Pkt_lock_op_ack_t lock_op_ack;
     MPIDI_CH3_Pkt_unlock_t unlock;
     MPIDI_CH3_Pkt_flush_t flush;
-    MPIDI_CH3_Pkt_flush_ack_t flush_ack;
+    MPIDI_CH3_Pkt_ack_t ack;
     MPIDI_CH3_Pkt_decr_at_counter_t decr_at_cnt;
     MPIDI_CH3_Pkt_close_t close;
     MPIDI_CH3_Pkt_cas_t cas;
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index a19cb3c..0441565 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -199,27 +199,27 @@ static inline int MPIDI_CH3I_Send_lock_op_ack_pkt(MPIDI_VC_t * vc, MPID_Win * wi
 
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_Send_flush_ack_pkt
+#define FUNCNAME MPIDI_CH3I_Send_ack_pkt
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int MPIDI_CH3I_Send_flush_ack_pkt(MPIDI_VC_t * vc, MPID_Win * win_ptr,
-                                                MPI_Win source_win_handle)
+static inline int MPIDI_CH3I_Send_ack_pkt(MPIDI_VC_t * vc, MPID_Win * win_ptr,
+                                          MPI_Win source_win_handle)
 {
     MPIDI_CH3_Pkt_t upkt;
-    MPIDI_CH3_Pkt_flush_ack_t *flush_ack_pkt = &upkt.flush_ack;
+    MPIDI_CH3_Pkt_ack_t *ack_pkt = &upkt.ack;
     MPID_Request *req;
     int mpi_errno = MPI_SUCCESS;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SEND_FLUSH_ACK_PKT);
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SEND_ACK_PKT);
 
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SEND_FLUSH_ACK_PKT);
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SEND_ACK_PKT);
 
-    MPIDI_Pkt_init(flush_ack_pkt, MPIDI_CH3_PKT_FLUSH_ACK);
-    flush_ack_pkt->source_win_handle = source_win_handle;
-    flush_ack_pkt->target_rank = win_ptr->comm_ptr->rank;
+    MPIDI_Pkt_init(ack_pkt, MPIDI_CH3_PKT_ACK);
+    ack_pkt->source_win_handle = source_win_handle;
+    ack_pkt->target_rank = win_ptr->comm_ptr->rank;
 
     /* Because this is in a packet handler, it is already within a critical section */
     /* MPIU_THREAD_CS_ENTER(CH3COMM,vc); */
-    mpi_errno = MPIDI_CH3_iStartMsg(vc, flush_ack_pkt, sizeof(*flush_ack_pkt), &req);
+    mpi_errno = MPIDI_CH3_iStartMsg(vc, ack_pkt, sizeof(*ack_pkt), &req);
     /* MPIU_THREAD_CS_EXIT(CH3COMM,vc); */
     if (mpi_errno != MPI_SUCCESS) {
         MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
@@ -230,7 +230,7 @@ static inline int MPIDI_CH3I_Send_flush_ack_pkt(MPIDI_VC_t * vc, MPID_Win * win_
     }
 
   fn_fail:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SEND_FLUSH_ACK_PKT);
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SEND_ACK_PKT);
     return mpi_errno;
 }
 
@@ -788,10 +788,10 @@ static inline int acquire_local_lock(MPID_Win * win_ptr, int lock_type)
 
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_RMA_Handle_flush_ack
+#define FUNCNAME MPIDI_CH3I_RMA_Handle_ack
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int MPIDI_CH3I_RMA_Handle_flush_ack(MPID_Win * win_ptr, int target_rank)
+static inline int MPIDI_CH3I_RMA_Handle_ack(MPID_Win * win_ptr, int target_rank)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_RMA_Target_t *t;
@@ -1013,7 +1013,7 @@ static inline int finish_op_on_target(MPID_Win * win_ptr, MPIDI_VC_t * vc,
             flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
             MPIDI_CH3_Pkt_flags_t pkt_flags = MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
             if ((flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) || (flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
-                pkt_flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+                pkt_flags |= MPIDI_CH3_PKT_FLAG_RMA_ACK;
             MPIU_Assert(source_win_handle != MPI_WIN_NULL);
             mpi_errno = MPIDI_CH3I_Send_lock_op_ack_pkt(vc, win_ptr,
                                                         pkt_flags,
@@ -1027,7 +1027,7 @@ static inline int finish_op_on_target(MPID_Win * win_ptr, MPIDI_VC_t * vc,
                   flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE)) {
                 /* If op is piggybacked with both LOCK and FLUSH,
                  * we only send LOCK ACK back, do not send FLUSH ACK. */
-                mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, source_win_handle);
+                mpi_errno = MPIDI_CH3I_Send_ack_pkt(vc, win_ptr, source_win_handle);
                 if (mpi_errno)
                     MPIU_ERR_POP(mpi_errno);
             }
@@ -1045,7 +1045,7 @@ static inline int finish_op_on_target(MPID_Win * win_ptr, MPIDI_VC_t * vc,
                   flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE)) {
                 /* If op is piggybacked with both LOCK and UNLOCK,
                  * we only send LOCK ACK back, do not send FLUSH (UNLOCK) ACK. */
-                mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, source_win_handle);
+                mpi_errno = MPIDI_CH3I_Send_ack_pkt(vc, win_ptr, source_win_handle);
                 if (mpi_errno)
                     MPIU_ERR_POP(mpi_errno);
             }
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_pkt.c b/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
index 919d097..745c39f 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
@@ -595,8 +595,8 @@ int MPIDI_CH3_PktHandler_Init( MPIDI_CH3_PktHandler_Fcn *pktArray[],
         MPIDI_CH3_PktHandler_Unlock;
     pktArray[MPIDI_CH3_PKT_FLUSH] =
         MPIDI_CH3_PktHandler_Flush;
-    pktArray[MPIDI_CH3_PKT_FLUSH_ACK] =
-	MPIDI_CH3_PktHandler_FlushAck;
+    pktArray[MPIDI_CH3_PKT_ACK] =
+	MPIDI_CH3_PktHandler_Ack;
     pktArray[MPIDI_CH3_PKT_DECR_AT_COUNTER] =
         MPIDI_CH3_PktHandler_DecrAtCnt;
     pktArray[MPIDI_CH3_PKT_CAS_IMMED] =
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 9873646..c9d4531 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -293,7 +293,7 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
         get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
     if ((rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
         (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
-        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_ACK;
 
     /* check if data is contiguous and get true lb */
     MPID_Datatype_is_contig(rreq->dev.datatype, &is_contig);
@@ -481,7 +481,7 @@ int MPIDI_CH3_ReqHandler_FOPRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, i
         fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
     if ((rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
         (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
-        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_ACK;
 
     iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) fop_resp_pkt;
     iov[0].MPID_IOV_LEN = sizeof(*fop_resp_pkt);
@@ -845,7 +845,7 @@ int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete(MPIDI_VC_t * vc,
         get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
     if ((rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
         (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
-        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_ACK;
 
     sreq->dev.segment_ptr = MPID_Segment_alloc();
     MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem",
@@ -1153,7 +1153,7 @@ static inline int perform_get_in_lock_queue(MPID_Win * win_ptr,
         get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
     if ((get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
         (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
-        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_ACK;
     get_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
 
     /* length of target data */
@@ -1328,7 +1328,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
             get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
         if ((get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
             (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
-            get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+            get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_ACK;
         get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
 
 
@@ -1434,7 +1434,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
         get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
     if ((get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
         (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
-        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_ACK;
     get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
 
     iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
@@ -1510,7 +1510,7 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr,
         fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
     if ((fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
         (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
-        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_ACK;
 
     if (fop_pkt->type == MPIDI_CH3_PKT_FOP) {
         resp_req = MPID_Request_create();
@@ -1665,7 +1665,7 @@ static inline int perform_cas_in_lock_queue(MPID_Win * win_ptr,
         cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
     if ((cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
         (cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
-        cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+        cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_ACK;
 
     /* Copy old value into the response packet */
     MPID_Datatype_get_size_macro(cas_pkt->datatype, len);
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 81c1a0d..2c9cb33 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -21,7 +21,7 @@ MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_lock);
 MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_lock_ack);
 MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_unlock);
 MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_flush);
-MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_flush_ack);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_ack);
 MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_decr_at_cnt);
 
 void MPIDI_CH3_RMA_Init_pkthandler_pvars(void)
@@ -155,14 +155,14 @@ void MPIDI_CH3_RMA_Init_pkthandler_pvars(void)
                                       MPIR_T_PVAR_FLAG_READONLY,
                                       "RMA", "RMA:PKTHANDLER for Flush (in seconds)");
 
-    /* rma_rmapkt_flush_ack */
+    /* rma_rmapkt_ack */
     MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
                                       MPI_DOUBLE,
-                                      rma_rmapkt_flush_ack,
+                                      rma_rmapkt_ack,
                                       MPI_T_VERBOSITY_MPIDEV_DETAIL,
                                       MPI_T_BIND_NO_OBJECT,
                                       MPIR_T_PVAR_FLAG_READONLY,
-                                      "RMA", "RMA:PKTHANDLER for Flush-Ack (in seconds)");
+                                      "RMA", "RMA:PKTHANDLER for Ack (in seconds)");
 
     /* rma_rmapkt_decr_at_cnt */
     MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
@@ -431,7 +431,7 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
         if ((get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
             (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
-            get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+            get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_ACK;
         get_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
 
         /* length of target data */
@@ -890,7 +890,7 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
         if ((get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
             (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
-            get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+            get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_ACK;
 
         /* NOTE: 'copy data + ACC' needs to be atomic */
 
@@ -1184,7 +1184,7 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
     if ((cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
         (cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
-        cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+        cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_ACK;
 
     /* Copy old value into the response packet */
     MPID_Datatype_get_size_macro(cas_pkt->datatype, len);
@@ -1279,8 +1279,8 @@ int MPIDI_CH3_PktHandler_CASResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
     }
-    if (cas_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
-        mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+    if (cas_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_ACK) {
+        mpi_errno = MPIDI_CH3I_RMA_Handle_ack(win_ptr, target_rank);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -1353,7 +1353,7 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
         if ((fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
             (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
-            fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+            fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_ACK;
 
         /* NOTE: 'copy data + ACC' needs to be atomic */
 
@@ -1528,8 +1528,8 @@ int MPIDI_CH3_PktHandler_FOPResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
     }
-    if (fop_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
-        mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+    if (fop_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_ACK) {
+        mpi_errno = MPIDI_CH3I_RMA_Handle_ack(win_ptr, target_rank);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -1615,8 +1615,8 @@ int MPIDI_CH3_PktHandler_Get_AccumResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
     }
-    if (get_accum_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
-        mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+    if (get_accum_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_ACK) {
+        mpi_errno = MPIDI_CH3I_RMA_Handle_ack(win_ptr, target_rank);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -1811,8 +1811,8 @@ int MPIDI_CH3_PktHandler_GetResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
     }
-    if (get_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
-        mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+    if (get_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_ACK) {
+        mpi_errno = MPIDI_CH3I_RMA_Handle_ack(win_ptr, target_rank);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -1952,9 +1952,9 @@ int MPIDI_CH3_PktHandler_LockOpAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
-    if (flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
+    if (flags & MPIDI_CH3_PKT_FLAG_RMA_ACK) {
         MPIU_Assert(flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED);
-        mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+        mpi_errno = MPIDI_CH3I_RMA_Handle_ack(win_ptr, target_rank);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -1970,38 +1970,38 @@ int MPIDI_CH3_PktHandler_LockOpAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 }
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_PktHandler_FlushAck
+#define FUNCNAME MPIDI_CH3_PktHandler_Ack
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_PktHandler_FlushAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
-                                  MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
+int MPIDI_CH3_PktHandler_Ack(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
+                             MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
 {
-    MPIDI_CH3_Pkt_flush_ack_t *flush_ack_pkt = &pkt->flush_ack;
+    MPIDI_CH3_Pkt_ack_t *ack_pkt = &pkt->ack;
     MPID_Win *win_ptr = NULL;
     int mpi_errno = MPI_SUCCESS;
-    int target_rank = flush_ack_pkt->target_rank;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_FLUSHACK);
+    int target_rank = ack_pkt->target_rank;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACK);
 
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_FLUSHACK);
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACK);
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received shared lock ops done pkt");
 
-    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_flush_ack);
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_ack);
 
     *buflen = sizeof(MPIDI_CH3_Pkt_t);
 
-    MPID_Win_get_ptr(flush_ack_pkt->source_win_handle, win_ptr);
+    MPID_Win_get_ptr(ack_pkt->source_win_handle, win_ptr);
 
     /* decrement ack_counter on target */
-    mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+    mpi_errno = MPIDI_CH3I_RMA_Handle_ack(win_ptr, target_rank);
     if (mpi_errno)
         MPIU_ERR_POP(mpi_errno);
 
     *rreqp = NULL;
     MPIDI_CH3_Progress_signal_completion();
 
-    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_flush_ack);
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_FLUSHACK);
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_ack);
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACK);
   fn_exit:
     return MPI_SUCCESS;
   fn_fail:
@@ -2035,7 +2035,7 @@ int MPIDI_CH3_PktHandler_DecrAtCnt(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     *rreqp = NULL;
 
     if (decr_at_cnt_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) {
-        mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, decr_at_cnt_pkt->source_win_handle);
+        mpi_errno = MPIDI_CH3I_Send_ack_pkt(vc, win_ptr, decr_at_cnt_pkt->source_win_handle);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -2077,7 +2077,7 @@ int MPIDI_CH3_PktHandler_Unlock(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
 
     if (!(unlock_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_NO_ACK)) {
-        mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, unlock_pkt->source_win_handle);
+        mpi_errno = MPIDI_CH3I_Send_ack_pkt(vc, win_ptr, unlock_pkt->source_win_handle);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -2117,7 +2117,7 @@ int MPIDI_CH3_PktHandler_Flush(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPID_Win_get_ptr(flush_pkt->target_win_handle, win_ptr);
 
-    mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, flush_pkt->source_win_handle);
+    mpi_errno = MPIDI_CH3I_Send_ack_pkt(vc, win_ptr, flush_pkt->source_win_handle);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
@@ -2200,10 +2200,10 @@ int MPIDI_CH3_PktPrint_Lock(FILE * fp, MPIDI_CH3_Pkt_t * pkt)
     return MPI_SUCCESS;
 }
 
-int MPIDI_CH3_PktPrint_FlushAck(FILE * fp, MPIDI_CH3_Pkt_t * pkt)
+int MPIDI_CH3_PktPrint_Ack(FILE * fp, MPIDI_CH3_Pkt_t * pkt)
 {
-    MPIU_DBG_PRINTF((" type ......... MPIDI_CH3_PKT_FLUSH_ACK\n"));
-    MPIU_DBG_PRINTF((" source ....... 0x%08X\n", pkt->lock_accum_unlock.source_win_handle));
+    MPIU_DBG_PRINTF((" type ......... MPIDI_CH3_PKT_ACK\n"));
+    MPIU_DBG_PRINTF((" source ....... 0x%08X\n", pkt->ack.source_win_handle));
     return MPI_SUCCESS;
 }
 
diff --git a/src/mpid/ch3/src/mpidi_printf.c b/src/mpid/ch3/src/mpidi_printf.c
index 6e7e17b..da0ddd8 100644
--- a/src/mpid/ch3/src/mpidi_printf.c
+++ b/src/mpid/ch3/src/mpidi_printf.c
@@ -138,8 +138,8 @@ void MPIDI_DBG_Print_packet(MPIDI_CH3_Pkt_t * pkt)
         case MPIDI_CH3_PKT_LOCK:
             MPIDI_CH3_PktPrint_Lock(stdout, pkt);
             break;
-        case MPIDI_CH3_PKT_FLUSH_ACK:
-            MPIDI_CH3_PktPrint_FlushAck(stdout, pkt);
+        case MPIDI_CH3_PKT_ACK:
+            MPIDI_CH3_PktPrint_Ack(stdout, pkt);
             break;
         case MPIDI_CH3_PKT_LOCK_ACK:
             MPIDI_CH3_PktPrint_LockAck(stdout, pkt);
@@ -310,10 +310,9 @@ const char *MPIDI_Pkt_GetDescString(MPIDI_CH3_Pkt_t * pkt)
     case MPIDI_CH3_PKT_LOCK:
         MPIU_Snprintf(pktmsg, sizeof(pktmsg), "LOCK - %d", pkt->lock.target_win_handle);
         break;
-    case MPIDI_CH3_PKT_FLUSH_ACK:
+    case MPIDI_CH3_PKT_ACK:
         /* There is no rma_done packet type */
-        MPIU_Snprintf(pktmsg, sizeof(pktmsg),
-                      "RMA_DONE - 0x%08X", pkt->flush_ack.source_win_handle);
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg), "RMA_DONE - 0x%08X", pkt->ack.source_win_handle);
         break;
     case MPIDI_CH3_PKT_LOCK_ACK:
         MPIU_Snprintf(pktmsg, sizeof(pktmsg), "LOCK_ACK - 0x%08X", pkt->lock_ack.source_win_handle);

http://git.mpich.org/mpich.git/commitdiff/c377f28d4a3b1c99ccc746534f8aa811cd251424

commit c377f28d4a3b1c99ccc746534f8aa811cd251424
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Thu Jun 25 14:30:19 2015 -0500

    Improvement for IBARRIER when number of processes is 1.
    
    When the number of processes is 1, we do not need to schedule
    the current NBC communication and can just return an MPI_REQUEST_NULL
    request handle. This patch implements this optimization.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpi/coll/ibarrier.c b/src/mpi/coll/ibarrier.c
index 6b36160..2763e95 100644
--- a/src/mpi/coll/ibarrier.c
+++ b/src/mpi/coll/ibarrier.c
@@ -190,19 +190,21 @@ int MPIR_Ibarrier_impl(MPID_Comm *comm_ptr, MPI_Request *request)
         /* --END USEREXTENSION-- */
     }
 
-    mpi_errno = MPID_Sched_next_tag(comm_ptr, &tag);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-    mpi_errno = MPID_Sched_create(&s);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (comm_ptr->local_size != 1) {
+        mpi_errno = MPID_Sched_next_tag(comm_ptr, &tag);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = MPID_Sched_create(&s);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
-    MPIU_Assert(comm_ptr->coll_fns->Ibarrier_sched != NULL);
-    mpi_errno = comm_ptr->coll_fns->Ibarrier_sched(comm_ptr, s);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        MPIU_Assert(comm_ptr->coll_fns->Ibarrier_sched != NULL);
+        mpi_errno = comm_ptr->coll_fns->Ibarrier_sched(comm_ptr, s);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
-    mpi_errno = MPID_Sched_start(&s, comm_ptr, tag, &reqp);
-    if (reqp)
-        *request = reqp->handle;
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = MPID_Sched_start(&s, comm_ptr, tag, &reqp);
+        if (reqp)
+            *request = reqp->handle;
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
 
 fn_exit:
     return mpi_errno;
diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index b641d61..471c6a1 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -537,9 +537,15 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
 
-            /* Set window access state properly. */
-            win_ptr->states.access_state = MPIDI_RMA_FENCE_ISSUED;
-            MPIDI_CH3I_num_active_issued_win++;
+            if (win_ptr->fence_sync_req == MPI_REQUEST_NULL) {
+                /* ibarrier completed immediately. */
+                win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
+            }
+            else {
+                /* Set window access state properly. */
+                win_ptr->states.access_state = MPIDI_RMA_FENCE_ISSUED;
+                MPIDI_CH3I_num_active_issued_win++;
+            }
 
             goto finish_fence;
         }
@@ -644,8 +650,15 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
             mpi_errno = MPIR_Ibarrier_impl(win_ptr->comm_ptr, &(win_ptr->fence_sync_req));
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
-            MPIDI_CH3I_num_active_issued_win++;
-            win_ptr->states.access_state = MPIDI_RMA_FENCE_ISSUED;
+
+            if (win_ptr->fence_sync_req == MPI_REQUEST_NULL) {
+                /* ibarrier completed immediately. */
+                win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
+            }
+            else {
+                MPIDI_CH3I_num_active_issued_win++;
+                win_ptr->states.access_state = MPIDI_RMA_FENCE_ISSUED;
+            }
 
             if (win_ptr->shm_allocated == TRUE) {
                 MPID_Comm *node_comm_ptr = win_ptr->comm_ptr->node_comm;

http://git.mpich.org/mpich.git/commitdiff/b30fe09ac495f92a9fe6a745db809251a87d608a

commit b30fe09ac495f92a9fe6a745db809251a87d608a
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Thu Jun 25 11:16:39 2015 -0500

    Add an assert in Win_flush_local_all to ensure all requests are completed.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index 86ff6a1..b641d61 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -1800,6 +1800,8 @@ int MPIDI_Win_flush_local_all(MPID_Win * win_ptr)
         MPIU_ERR_POP(mpi_errno);
 
   finish_flush_local_all:
+    MPIU_Assert(win_ptr->active_req_cnt == 0);
+
     /* reset upgrade_flush_local flag in target to 0 */
     for (i = 0; i < win_ptr->num_slots; i++) {
         curr_target = win_ptr->slots[i].target_list_head;

http://git.mpich.org/mpich.git/commitdiff/390c449c8fb67f96bca2bdd61b80c8ef6fe01901

commit 390c449c8fb67f96bca2bdd61b80c8ef6fe01901
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Thu Jun 25 11:15:21 2015 -0500

    Add internal flush_local_all and flush_all.
    
    Here we add the internal functions flush_local_all and flush_all,
    so that Win_fence/Win_complete can simply call them internally.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index a7c71df..86ff6a1 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -327,6 +327,145 @@ void MPIDI_CH3_RMA_Init_sync_pvars(void)
 
 #define SYNC_POST_TAG 100
 
+#undef FUNCNAME
+#define FUNCNAME flush_local_all
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int flush_local_all(MPID_Win * win_ptr)
+{
+    int i, made_progress = 0;
+    MPIDI_RMA_Target_t *curr_target = NULL;
+    int local_completed = 0, remote_completed = 0;
+    int total_remote_complete_cnt = 0, total_local_complete_cnt = 0;
+    int curr_remote_complete_cnt = 0, curr_local_complete_cnt = 0;
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_FLUSH_LOCAL_ALL);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_FLUSH_LOCAL_ALL);
+
+    /* Set sync_flag in sync struct. */
+    for (i = 0; i < win_ptr->num_slots; i++) {
+        curr_target = win_ptr->slots[i].target_list_head;
+        while (curr_target != NULL) {
+            if (curr_target->sync.upgrade_flush_local) {
+                if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
+                    curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
+                }
+                total_remote_complete_cnt++;
+            }
+            else {
+                if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL) {
+                    curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;
+                }
+                total_local_complete_cnt++;
+            }
+
+            curr_target = curr_target->next;
+        }
+    }
+
+    /* issue out all operations. */
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+
+    /* wait for remote completion for those targets that disable flush_local,
+     * and wait for local completion for other targets */
+    do {
+        curr_local_complete_cnt = 0, curr_remote_complete_cnt = 0;
+        for (i = 0; i < win_ptr->num_slots; i++) {
+            curr_target = win_ptr->slots[i].target_list_head;
+            while (curr_target != NULL) {
+                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+
+                MPIDI_CH3I_RMA_ops_completion(win_ptr, curr_target, local_completed,
+                                              remote_completed);
+
+                if (curr_target->sync.upgrade_flush_local) {
+                    if (remote_completed) {
+                        curr_remote_complete_cnt++;
+                    }
+                }
+                else {
+                    if (local_completed) {
+                        curr_local_complete_cnt++;
+                    }
+                }
+                curr_target = curr_target->next;
+            }
+        }
+
+        if (curr_remote_complete_cnt < total_remote_complete_cnt ||
+            curr_local_complete_cnt < total_local_complete_cnt) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
+    } while (curr_remote_complete_cnt < total_remote_complete_cnt ||
+             curr_local_complete_cnt < total_local_complete_cnt);
+
+  fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_FLUSH_LOCAL_ALL);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+  fn_fail:
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+#undef FUNCNAME
+#define FUNCNAME flush_all
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int flush_all(MPID_Win * win_ptr)
+{
+    int i, made_progress = 0;
+    int local_completed = 0, remote_completed = 0;
+    MPIDI_RMA_Target_t *curr_target = NULL;
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_FLUSH_ALL);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_FLUSH_ALL);
+
+    /* Set sync_flag in sync struct. */
+    for (i = 0; i < win_ptr->num_slots; i++) {
+        curr_target = win_ptr->slots[i].target_list_head;
+        while (curr_target != NULL) {
+            if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
+                curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
+            }
+
+            curr_target = curr_target->next;
+        }
+    }
+
+    /* Issue out all operations. */
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+
+    /* Wait for remote completion. */
+    do {
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed, &remote_completed);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+        if (!remote_completed) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
+    } while (!remote_completed);
+
+  fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_FLUSH_ALL);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+  fn_fail:
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
 
 /********************************************************************************/
 /* Active Target synchronization (including WIN_FENCE, WIN_POST, WIN_START,     */
@@ -339,8 +478,7 @@ void MPIDI_CH3_RMA_Init_sync_pvars(void)
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
 {
-    int i, made_progress = 0;
-    int local_completed = 0, remote_completed = 0;
+    int i;
     MPIDI_RMA_Target_t *curr_target = NULL;
     mpir_errflag_t errflag = MPIR_ERR_NONE;
     int comm_size = win_ptr->comm_ptr->local_size;
@@ -450,52 +588,26 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
         win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
     }
 
-    /* Set sync_flag in target structs. */
     if (!scalable_fence_enabled) {
         for (i = 0; i < win_ptr->num_slots; i++) {
             curr_target = win_ptr->slots[i].target_list_head;
             while (curr_target != NULL) {
-                if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL) {
-                    curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;
-                }
                 /* flag is set in order to decrement complete counter on target */
                 curr_target->win_complete_flag = 1;
 
                 curr_target = curr_target->next;
             }
         }
+
+        mpi_errno = flush_local_all(win_ptr);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
     else {
-        for (i = 0; i < win_ptr->num_slots; i++) {
-            curr_target = win_ptr->slots[i].target_list_head;
-            while (curr_target != NULL) {
-                /* set sync_flag in sync struct */
-                if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
-                    curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
-                }
-                curr_target = curr_target->next;
-            }
-        }
-    }
-
-    /* Issue out all operations. */
-    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
-    if (mpi_errno != MPI_SUCCESS)
-        MPIU_ERR_POP(mpi_errno);
-
-    /* Wait for local/remote completion. */
-    do {
-        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed, &remote_completed);
+        mpi_errno = flush_all(win_ptr);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
-        if ((scalable_fence_enabled && !remote_completed) ||
-            (!scalable_fence_enabled && !local_completed)) {
-            mpi_errno = wait_progress_engine();
-            if (mpi_errno != MPI_SUCCESS)
-                MPIU_ERR_POP(mpi_errno);
-        }
-    } while ((scalable_fence_enabled && !remote_completed) ||
-             (!scalable_fence_enabled && !local_completed));
+    }
 
     /* Cleanup all targets on window. */
     mpi_errno = MPIDI_CH3I_RMA_Cleanup_targets_win(win_ptr);
@@ -798,10 +910,8 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
     int i, dst, rank = win_ptr->comm_ptr->rank;
-    int local_completed = 0, remote_completed = 0;
     MPID_Comm *win_comm_ptr = win_ptr->comm_ptr;
     MPIDI_RMA_Target_t *curr_target;
-    int made_progress;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_COMPLETE);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_COMPLETE);
@@ -841,10 +951,6 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
         }
 
         if (curr_target != NULL) {
-            /* set sync_flag in sync struct */
-            if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL) {
-                curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;
-            }
             curr_target->win_complete_flag = 1;
         }
         else {
@@ -855,23 +961,10 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
         }
     }
 
-    /* issue out all operations */
-    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
+    mpi_errno = flush_local_all(win_ptr);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
-    /* wait until all slots are empty */
-    do {
-        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed, &remote_completed);
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
-        if (!local_completed) {
-            mpi_errno = wait_progress_engine();
-            if (mpi_errno != MPI_SUCCESS)
-                MPIU_ERR_POP(mpi_errno);
-        }
-    } while (!local_completed);
-
     /* Cleanup all targets on this window. */
     mpi_errno = MPIDI_CH3I_RMA_Cleanup_targets_win(win_ptr);
     if (mpi_errno != MPI_SUCCESS)
@@ -1634,8 +1727,7 @@ int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_Win_flush_all(MPID_Win * win_ptr)
 {
-    int i, made_progress = 0;
-    int local_completed = 0, remote_completed = 0;
+    int i;
     MPIDI_RMA_Target_t *curr_target = NULL;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPIDI_STATE_MPIDI_WIN_FLUSH_ALL);
@@ -1653,35 +1745,10 @@ int MPIDI_Win_flush_all(MPID_Win * win_ptr)
         OPA_read_write_barrier();
     }
 
-    /* Set sync_flag in sync struct. */
-    for (i = 0; i < win_ptr->num_slots; i++) {
-        curr_target = win_ptr->slots[i].target_list_head;
-        while (curr_target != NULL) {
-            if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
-                curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
-            }
-
-            curr_target = curr_target->next;
-        }
-    }
-
-    /* Issue out all operations. */
-    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
+    mpi_errno = flush_all(win_ptr);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
-    /* Wait for remote completion. */
-    do {
-        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed, &remote_completed);
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
-        if (!remote_completed) {
-            mpi_errno = wait_progress_engine();
-            if (mpi_errno != MPI_SUCCESS)
-                MPIU_ERR_POP(mpi_errno);
-        }
-    } while (!remote_completed);
-
   finish_flush_all:
     MPIU_Assert(win_ptr->active_req_cnt == 0);
 
@@ -1710,11 +1777,8 @@ int MPIDI_Win_flush_all(MPID_Win * win_ptr)
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_Win_flush_local_all(MPID_Win * win_ptr)
 {
-    int i, made_progress = 0;
-    int local_completed = 0, remote_completed = 0;
+    int i;
     MPIDI_RMA_Target_t *curr_target = NULL;
-    int enable_flush_local_cnt = 0, upgrade_flush_local_cnt = 0;
-    int remote_completed_cnt = 0, local_completed_cnt = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FLUSH_LOCAL_ALL);
 
@@ -1731,70 +1795,10 @@ int MPIDI_Win_flush_local_all(MPID_Win * win_ptr)
         OPA_read_write_barrier();
     }
 
-    /* Set sync_flag in sync struct. */
-    for (i = 0; i < win_ptr->num_slots; i++) {
-        curr_target = win_ptr->slots[i].target_list_head;
-        while (curr_target != NULL) {
-            if (curr_target->sync.upgrade_flush_local) {
-                if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
-                    curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
-                }
-                upgrade_flush_local_cnt++;
-            }
-            else {
-                if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL) {
-                    curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;
-                }
-                enable_flush_local_cnt++;
-            }
-
-            curr_target = curr_target->next;
-        }
-    }
-
-    /* issue out all operations. */
-    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
+    mpi_errno = flush_local_all(win_ptr);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
-    /* wait for remote completion for those targets that disable flush_local,
-     * and wait for local completion for other targets */
-    do {
-        local_completed_cnt = 0;
-        remote_completed_cnt = 0;
-        for (i = 0; i < win_ptr->num_slots; i++) {
-            curr_target = win_ptr->slots[i].target_list_head;
-            while (curr_target != NULL) {
-                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target);
-                if (mpi_errno != MPI_SUCCESS)
-                    MPIU_ERR_POP(mpi_errno);
-
-                MPIDI_CH3I_RMA_ops_completion(win_ptr, curr_target, local_completed,
-                                              remote_completed);
-
-                if (curr_target->sync.upgrade_flush_local) {
-                    if (remote_completed) {
-                        remote_completed_cnt++;
-                    }
-                }
-                else {
-                    if (local_completed) {
-                        local_completed_cnt++;
-                    }
-                }
-                curr_target = curr_target->next;
-            }
-        }
-
-        if (remote_completed_cnt < upgrade_flush_local_cnt ||
-            local_completed_cnt < enable_flush_local_cnt) {
-            mpi_errno = wait_progress_engine();
-            if (mpi_errno != MPI_SUCCESS)
-                MPIU_ERR_POP(mpi_errno);
-        }
-    } while (remote_completed_cnt < upgrade_flush_local_cnt ||
-             local_completed_cnt < enable_flush_local_cnt);
-
   finish_flush_local_all:
     /* reset upgrade_flush_local flag in target to 0 */
     for (i = 0; i < win_ptr->num_slots; i++) {

http://git.mpich.org/mpich.git/commitdiff/9ad7292426d5d91b453813f758ac6bd6c4968caf

commit 9ad7292426d5d91b453813f758ac6bd6c4968caf
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Wed Jun 24 11:27:09 2015 -0500

    Modify message of decrementing AT counter to send back FLUSH if necessary.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index 046ca96..d2ca537 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -808,6 +808,8 @@ typedef struct MPIDI_CH3_Pkt_flush_ack {
 typedef struct MPIDI_CH3_Pkt_decr_at_counter {
     MPIDI_CH3_Pkt_type_t type;
     MPI_Win target_win_handle;
+    MPI_Win source_win_handle;
+    MPIDI_CH3_Pkt_flags_t flags;
 } MPIDI_CH3_Pkt_decr_at_counter_t;
 
 typedef struct MPIDI_CH3_Pkt_close {
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index a12c206..a19cb3c 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -239,7 +239,7 @@ static inline int MPIDI_CH3I_Send_flush_ack_pkt(MPIDI_VC_t * vc, MPID_Win * win_
 #define FUNCNAME send_decr_at_cnt_msg
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int send_decr_at_cnt_msg(int dst, MPID_Win * win_ptr)
+static inline int send_decr_at_cnt_msg(int dst, MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags)
 {
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_decr_at_counter_t *decr_at_cnt_pkt = &upkt.decr_at_cnt;
@@ -251,6 +251,8 @@ static inline int send_decr_at_cnt_msg(int dst, MPID_Win * win_ptr)
 
     MPIDI_Pkt_init(decr_at_cnt_pkt, MPIDI_CH3_PKT_DECR_AT_COUNTER);
     decr_at_cnt_pkt->target_win_handle = win_ptr->basic_info_table[dst].win_handle;
+    decr_at_cnt_pkt->source_win_handle = win_ptr->handle;
+    decr_at_cnt_pkt->flags = flags;
 
     MPIDI_Comm_get_vc_set_active(win_ptr->comm_ptr, dst, &vc);
 
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index ed0f646..81c1a0d 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -2033,6 +2033,13 @@ int MPIDI_CH3_PktHandler_DecrAtCnt(MPIDI_VC_t * vc ATTRIBUTE((unused)),
 
     *buflen = sizeof(MPIDI_CH3_Pkt_t);
     *rreqp = NULL;
+
+    if (decr_at_cnt_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) {
+        mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, decr_at_cnt_pkt->source_win_handle);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+    }
+
     MPIDI_CH3_Progress_signal_completion();
 
   fn_exit:
diff --git a/src/mpid/ch3/src/ch3u_rma_progress.c b/src/mpid/ch3/src/ch3u_rma_progress.c
index ae24769..5e45be7 100644
--- a/src/mpid/ch3/src/ch3u_rma_progress.c
+++ b/src/mpid/ch3/src/ch3u_rma_progress.c
@@ -226,12 +226,23 @@ static inline int check_and_switch_target_state(MPID_Win * win_ptr, MPIDI_RMA_Ta
     case MPIDI_RMA_NONE:
         if (target->win_complete_flag) {
             if (target->pending_op_list_head == NULL) {
-                mpi_errno = send_decr_at_cnt_msg(target->target_rank, win_ptr);
+                MPIDI_CH3_Pkt_flags_t flags = MPIDI_CH3_PKT_FLAG_NONE;
+                if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH && target->put_acc_issued) {
+                    flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
+                    target->sync.outstanding_acks++;
+                }
+
+                mpi_errno = send_decr_at_cnt_msg(target->target_rank, win_ptr, flags);
                 if (mpi_errno != MPI_SUCCESS)
                     MPIU_ERR_POP(mpi_errno);
+
+                /* We are done with ending synchronization, unset target's sync_flag. */
+                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;
+
+                (*made_progress) = 1;
             }
         }
-        if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
+        else if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
             if (target->pending_op_list_head == NULL) {
                 if (target->target_rank != rank) {
                     if (target->put_acc_issued) {
diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index 5ea3ac1..a7c71df 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -849,7 +849,7 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
         }
         else {
             /* FIXME: do we need to wait for remote completion? */
-            mpi_errno = send_decr_at_cnt_msg(dst, win_ptr);
+            mpi_errno = send_decr_at_cnt_msg(dst, win_ptr, MPIDI_CH3_PKT_FLAG_NONE);
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
         }

http://git.mpich.org/mpich.git/commitdiff/638ad7785e4fa969e47223e60f6bb920d6c60467

commit 638ad7785e4fa969e47223e60f6bb920d6c60467
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Thu Jun 25 11:03:54 2015 -0500

    Set upgrade_flush_local to 0 after Win_flush/Win_flush_all.
    
    After Win_flush_local/Win_flush_local_all/Win_flush/Win_flush_all,
    we should set upgrade_flush_local flag back to 0. Originally we
    forgot to do this in Win_flush/Win_flush_all. Here we add them.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index 36eb478..5ea3ac1 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -1289,6 +1289,11 @@ int MPIDI_Win_flush(int dest, MPID_Win * win_ptr)
             MPIU_ERR_POP(mpi_errno);
     }
 
+    if (target != NULL && target->sync.upgrade_flush_local) {
+        /* reset upgrade_flush_local flag in target to 0 */
+        target->sync.upgrade_flush_local = 0;
+    }
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FLUSH);
     return mpi_errno;
@@ -1680,6 +1685,15 @@ int MPIDI_Win_flush_all(MPID_Win * win_ptr)
   finish_flush_all:
     MPIU_Assert(win_ptr->active_req_cnt == 0);
 
+    /* reset upgrade_flush_local flag in target to 0 */
+    for (i = 0; i < win_ptr->num_slots; i++) {
+        curr_target = win_ptr->slots[i].target_list_head;
+        while (curr_target != NULL) {
+            curr_target->sync.upgrade_flush_local = 0;
+            curr_target = curr_target->next;
+        }
+    }
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPIDI_STATE_MPIDI_WIN_FLUSH_ALL);
     return mpi_errno;

http://git.mpich.org/mpich.git/commitdiff/14cf7757372fae40c210f8831909d75b799781b5

commit 14cf7757372fae40c210f8831909d75b799781b5
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Thu Jun 25 10:08:40 2015 -0500

    Poke progress engine in RMA op/sync routines when state is not satisfied.
    
    Here we modify MPIDI_CH3I_RMA_Make_progress_target function and
    MPIDI_CH3I_RMA_Make_progress_win function so that they will poke
    the progress engine once if the current window/target state is not
    satisfied for issuing operations.
    
    Note that MPIDI_CH3I_RMA_Make_progress_target is only called from
    operation routines (MPI_PUT,MPI_GET,...) and MPIDI_CH3I_RMA_Make_progress_win
    is only called from synchronization routines (MPI_WIN_FENCE,
    MPI_WIN_LOCK,...). They cannot be called from the RMA progress engine.
    
    issue_ops_target(), issue_ops_win(), check_and_switch_target_state(),
    check_and_switch_win_state() are core functions, and they are called
    by MPIDI_CH3I_RMA_Make_progress_target(), MPIDI_CH3I_RMA_Make_progress_win()
    and RMA progress engine.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_progress.c b/src/mpid/ch3/src/ch3u_rma_progress.c
index b5cd4bb..ae24769 100644
--- a/src/mpid/ch3/src/ch3u_rma_progress.c
+++ b/src/mpid/ch3/src/ch3u_rma_progress.c
@@ -51,6 +51,7 @@ static inline int check_and_switch_window_state(MPID_Win * win_ptr, int *is_able
                                                 int *made_progress);
 static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
                                    int *made_progress);
+static inline int issue_ops_win(MPID_Win * win_ptr, int *made_progress);
 
 /* check if we can switch window-wide state: FENCE_ISSUED, PSCW_ISSUED, LOCK_ALL_ISSUED */
 #undef FUNCNAME
@@ -305,18 +306,10 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
 {
     MPIDI_RMA_Op_t *curr_op = NULL;
     MPIDI_CH3_Pkt_flags_t flags;
-    int is_able_to_issue = 0;
     int first_op = 1, mpi_errno = MPI_SUCCESS;
 
     (*made_progress) = 0;
 
-    /* check and try to switch target state */
-    mpi_errno = check_and_switch_target_state(win_ptr, target, &is_able_to_issue, made_progress);
-    if (mpi_errno != MPI_SUCCESS)
-        MPIU_ERR_POP(mpi_errno);
-    if (!is_able_to_issue)
-        goto fn_exit;
-
     if (win_ptr->non_empty_slots == 0 || target == NULL || target->pending_op_list_head == NULL)
         goto fn_exit;
 
@@ -503,6 +496,61 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
 
 
 #undef FUNCNAME
+#define FUNCNAME issue_ops_win
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int issue_ops_win(MPID_Win * win_ptr, int *made_progress)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int start_slot, end_slot, i, idx;
+    int is_able_to_issue = 0;
+    int temp_progress = 0;
+    MPIDI_RMA_Target_t *target = NULL;
+
+    (*made_progress) = 0;
+
+    if (win_ptr->non_empty_slots == 0)
+        goto fn_exit;
+
+    /* FIXME: we should optimize the issuing pattern here. */
+
+    start_slot = win_ptr->comm_ptr->rank % win_ptr->num_slots;
+    end_slot = start_slot + win_ptr->num_slots;
+    for (i = start_slot; i < end_slot; i++) {
+        if (i < win_ptr->num_slots)
+            idx = i;
+        else
+            idx = i - win_ptr->num_slots;
+
+        for (target = win_ptr->slots[idx].target_list_head; target != NULL; target = target->next) {
+            /* check and try to switch target state */
+            mpi_errno = check_and_switch_target_state(win_ptr, target, &is_able_to_issue,
+                                                      &temp_progress);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+            if (temp_progress)
+                (*made_progress) = 1;
+            if (!is_able_to_issue) {
+                continue;
+            }
+
+            /* issue operations to this target */
+            mpi_errno = issue_ops_target(win_ptr, target, &temp_progress);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+            if (temp_progress)
+                (*made_progress) = 1;
+        }
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
 #define FUNCNAME MPIDI_CH3I_RMA_Free_ops_before_completion
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
@@ -779,20 +827,44 @@ int MPIDI_CH3I_RMA_Make_progress_target(MPID_Win * win_ptr, int target_rank, int
 
     (*made_progress) = 0;
 
+    /* NOTE: this function is called from either operation routines (MPI_PUT, MPI_GET...),
+     * or aggressive cleanup functions. It cannot be called from the progress engine.
+     * Here we poke the progress engine if window state is not satisfied (i.e. NBC is not
+     * finished). If it is allowed to be called from progress engine, when RMA progress
+     * is registered / executed before NBC progress, it will cause the progress engine
+     * to re-entrant RMA progress endlessly. */
+
     /* check window state */
     mpi_errno = check_and_switch_window_state(win_ptr, &is_able_to_issue, &temp_progress);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
     if (temp_progress)
         (*made_progress) = 1;
-    if (!is_able_to_issue)
+    if (!is_able_to_issue) {
+        mpi_errno = poke_progress_engine();
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
         goto fn_exit;
+    }
 
     /* find target element */
     mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, target_rank, &target);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
+    /* check and try to switch target state */
+    mpi_errno = check_and_switch_target_state(win_ptr, target, &is_able_to_issue, &temp_progress);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+    if (temp_progress)
+        (*made_progress) = 1;
+    if (!is_able_to_issue) {
+        mpi_errno = poke_progress_engine();
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+        goto fn_exit;
+    }
+
     /* issue operations to this target */
     mpi_errno = issue_ops_target(win_ptr, target, &temp_progress);
     if (mpi_errno)
@@ -814,46 +886,40 @@ int MPIDI_CH3I_RMA_Make_progress_target(MPID_Win * win_ptr, int target_rank, int
 int MPIDI_CH3I_RMA_Make_progress_win(MPID_Win * win_ptr, int *made_progress)
 {
     int mpi_errno = MPI_SUCCESS;
-    int start_slot, end_slot, i, idx;
     int is_able_to_issue = 0;
-    MPIDI_RMA_Target_t *target = NULL;
+    int temp_progress = 0;
 
     (*made_progress) = 0;
 
+    /* NOTE: this function is called from either synchronization routines
+     * (MPI_WIN_FENCE, MPI_WIN_LOCK...), or aggressive cleanup functions.
+     * It cannot be called from the progress engine.
+     * Here we poke the progress engine if window state is not satisfied (i.e. NBC is not
+     * finished). If it is allowed to be called from progress engine, when RMA progress
+     * is registered / executed before NBC progress, it will cause the progress engine
+     * to re-entrant RMA progress endlessly. */
+
     /* check and try to switch window state */
-    mpi_errno = check_and_switch_window_state(win_ptr, &is_able_to_issue, made_progress);
+    mpi_errno = check_and_switch_window_state(win_ptr, &is_able_to_issue, &temp_progress);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
-    if (!is_able_to_issue)
+    if (temp_progress)
+        (*made_progress) = 1;
+    if (!is_able_to_issue) {
+        mpi_errno = poke_progress_engine();
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
         goto fn_exit;
+    }
 
     if (win_ptr->non_empty_slots == 0)
         goto fn_exit;
 
-    /* FIXME: we should optimize the issuing pattern here. */
-
-    start_slot = win_ptr->comm_ptr->rank % win_ptr->num_slots;
-    end_slot = start_slot + win_ptr->num_slots;
-    for (i = start_slot; i < end_slot; i++) {
-        if (i < win_ptr->num_slots)
-            idx = i;
-        else
-            idx = i - win_ptr->num_slots;
-
-        target = win_ptr->slots[idx].target_list_head;
-        while (target != NULL) {
-            int temp_progress = 0;
-
-            /* issue operations to this target */
-            mpi_errno = issue_ops_target(win_ptr, target, &temp_progress);
-            if (mpi_errno != MPI_SUCCESS)
-                MPIU_ERR_POP(mpi_errno);
-            if (temp_progress)
-                (*made_progress) = 1;
-
-            target = target->next;
-        }
-    }
+    mpi_errno = issue_ops_win(win_ptr, &temp_progress);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+    if (temp_progress)
+        (*made_progress) = 1;
 
   fn_exit:
     return mpi_errno;
@@ -878,13 +944,28 @@ int MPIDI_CH3I_RMA_Make_progress_global(int *made_progress)
 
     for (win_elem = MPIDI_RMA_Win_list; win_elem; win_elem = win_elem->next) {
         int temp_progress = 0;
+        int is_able_to_issue = 0;
 
         if (win_elem->win_ptr->states.access_state == MPIDI_RMA_NONE ||
             win_elem->win_ptr->states.access_state == MPIDI_RMA_FENCE_GRANTED ||
             win_elem->win_ptr->states.access_state == MPIDI_RMA_PSCW_GRANTED)
             continue;
 
-        mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_elem->win_ptr, &temp_progress);
+        /* check and try to switch window state */
+        mpi_errno =
+            check_and_switch_window_state(win_elem->win_ptr, &is_able_to_issue, &temp_progress);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+        if (temp_progress)
+            (*made_progress) = 1;
+        if (!is_able_to_issue) {
+            continue;
+        }
+
+        if (win_elem->win_ptr->non_empty_slots == 0)
+            continue;
+
+        mpi_errno = issue_ops_win(win_elem->win_ptr, &temp_progress);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
         if (temp_progress)

http://git.mpich.org/mpich.git/commitdiff/8a6dab58fe8572176004642d8e52661bd7eaf1a1

commit 8a6dab58fe8572176004642d8e52661bd7eaf1a1
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Thu Jun 11 21:32:03 2015 -0500

    Add progress poking and GC progress during issuing out the ops.
    
    Here we add progress poking and GC progress while issuing
    operations, in order to make progress on receiving incoming
    messages while issuing outgoing messages. Otherwise, if all
    processes are busy issuing a large number of operations, no
    process makes progress on receiving, and sending cannot
    finish until the ending epoch is reached.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_progress.c b/src/mpid/ch3/src/ch3u_rma_progress.c
index a862961..b5cd4bb 100644
--- a/src/mpid/ch3/src/ch3u_rma_progress.c
+++ b/src/mpid/ch3/src/ch3u_rma_progress.c
@@ -29,6 +29,19 @@ cvars:
          routines to wait until no. of active requests being
          reduced to this value.
 
+    - name        : MPIR_CVAR_CH3_RMA_POKE_PROGRESS_REQ_THRESHOLD
+      category    : CH3
+      type        : int
+      default     : 128
+      class       : none
+      verbosity   : MPI_T_VERBOSITY_USER_BASIC
+      scope       : MPI_T_SCOPE_ALL_EQ
+      description : >-
+        Threshold at which the RMA implementation attempts to complete requests
+        while completing RMA operations and while using the lazy synchonization
+        approach.  Change this value if programs fail because they run out of
+        requests or other internal resources
+
 === END_MPI_T_CVAR_INFO_BLOCK ===
 */
 
@@ -460,6 +473,22 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
                 MPIDI_CH3I_RMA_Ops_append(&(target->issued_read_op_list_head),
                                           &(target->issued_read_op_list_tail), curr_op);
             }
+
+
+            /* Poke the progress engine when next_op_to_issue is not the current OP, in
+             * order to make sure the issuing function is re-entrant safe. */
+            if (target->next_op_to_issue != curr_op &&
+                win_ptr->active_req_cnt > MPIR_CVAR_CH3_RMA_POKE_PROGRESS_REQ_THRESHOLD) {
+                int local_completed, remote_completed;
+                mpi_errno = poke_progress_engine();
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+
+                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
+                                                           &remote_completed);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+            }
         }
 
         curr_op = target->next_op_to_issue;

http://git.mpich.org/mpich.git/commitdiff/d6f65c81786a448476069d47ea350d654f51d815

commit d6f65c81786a448476069d47ea350d654f51d815
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Wed Jun 17 21:16:46 2015 -0500

    Better judgement on if it is OK to issue operations.
    
    In the check_and_switch_target_state function, we return a
    flag indicating whether the state is satisfied for issuing
    operations. The flag should only reflect the current state
    and should not be mixed with the pending list condition.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_progress.c b/src/mpid/ch3/src/ch3u_rma_progress.c
index 6179410..a862961 100644
--- a/src/mpid/ch3/src/ch3u_rma_progress.c
+++ b/src/mpid/ch3/src/ch3u_rma_progress.c
@@ -272,7 +272,7 @@ static inline int check_and_switch_target_state(MPID_Win * win_ptr, MPIDI_RMA_Ta
         break;
     }   /* end of switch */
 
-    if (target->pending_op_list_head != NULL && target->access_state != MPIDI_RMA_LOCK_ISSUED) {
+    if (target->access_state != MPIDI_RMA_LOCK_ISSUED) {
         (*is_able_to_issue) = 1;
     }
 
@@ -304,7 +304,7 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
     if (!is_able_to_issue)
         goto fn_exit;
 
-    if (win_ptr->non_empty_slots == 0 || target == NULL)
+    if (win_ptr->non_empty_slots == 0 || target == NULL || target->pending_op_list_head == NULL)
         goto fn_exit;
 
     /* Issue out operations in the list. */

http://git.mpich.org/mpich.git/commitdiff/9b1b9241d063901710a465467b36878f284aa53b

commit 9b1b9241d063901710a465467b36878f284aa53b
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Wed Jun 17 11:25:41 2015 -0500

    Add DELAY_ISSUING_FOR_PIGGYBACKING option for RMA synchronization.
    
    Originally in the RMA synchronization, we always tried to piggyback
    LOCK/UNLOCK/FLUSH flags with operations by delaying the issuing of
    some operations. This is good when the number of operations is very
    small, but delaying is not good when the message size is large or
    the number of operations is large.
    
    In this patch, we add a CVAR to turn on/off the piggybacking of
    LOCK/UNLOCK/FLUSH flags. By default it is off, which means we only
    piggyback when there are operations available, but not at the cost
    of delaying the issuing of operations.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_progress.c b/src/mpid/ch3/src/ch3u_rma_progress.c
index 1b69a20..6179410 100644
--- a/src/mpid/ch3/src/ch3u_rma_progress.c
+++ b/src/mpid/ch3/src/ch3u_rma_progress.c
@@ -201,6 +201,7 @@ static inline int check_and_switch_target_state(MPID_Win * win_ptr, MPIDI_RMA_Ta
                 /* if we reach WIN_UNLOCK and there is still operation existing
                  * in pending list, this operation must be the only operation
                  * and it is prepared to piggyback LOCK and UNLOCK. */
+                MPIU_Assert(MPIR_CVAR_CH3_RMA_DELAY_ISSUING_FOR_PIGGYBACKING);
                 MPIU_Assert(target->pending_op_list_head->next == NULL);
                 MPIU_Assert(target->pending_op_list_head->piggyback_lock_candidate);
             }
@@ -209,6 +210,13 @@ static inline int check_and_switch_target_state(MPID_Win * win_ptr, MPIDI_RMA_Ta
 
     case MPIDI_RMA_LOCK_GRANTED:
     case MPIDI_RMA_NONE:
+        if (target->win_complete_flag) {
+            if (target->pending_op_list_head == NULL) {
+                mpi_errno = send_decr_at_cnt_msg(target->target_rank, win_ptr);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+            }
+        }
         if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
             if (target->pending_op_list_head == NULL) {
                 if (target->target_rank != rank) {
@@ -309,11 +317,12 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
             break;
         }
 
-        if (curr_op->next == NULL &&
+        if (MPIR_CVAR_CH3_RMA_DELAY_ISSUING_FOR_PIGGYBACKING && curr_op->next == NULL &&
             target->sync.sync_flag == MPIDI_RMA_SYNC_NONE && curr_op->ureq == NULL) {
-            /* Skip the last OP if sync_flag is NONE since we
+            /* If DELAY_ISSUING_FOR_PIGGYBACKING is turned on,
+             * skip the last OP if sync_flag is NONE since we
              * want to leave it to the ending synchronization
-             * so that we can piggyback LOCK / FLUSH.
+             * so that we can piggyback UNLOCK / FLUSH.
              * However, if it is a request-based RMA, do not
              * skip it (otherwise a wait call before unlock
              * will be blocked). */
diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index d8ba0b5..36eb478 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -230,6 +230,29 @@ cvars:
           is smaller than the value, FENCE will use a basic but fast
           algorithm which requires an O(P) data structure.
 
+    - name        : MPIR_CVAR_CH3_RMA_DELAY_ISSUING_FOR_PIGGYBACKING
+      category    : CH3
+      type        : int
+      default     : 0
+      class       : none
+      verbosity   : MPI_T_VERBOSITY_USER_BASIC
+      scope       : MPI_T_SCOPE_ALL_EQ
+      description : >-
+        Specify if delay issuing of RMA operations for piggybacking
+        LOCK/UNLOCK/FLUSH is enabled. It can be either 0 or 1. When
+        it is set to 1, the issuing of LOCK message is delayed until
+        origin process see the first RMA operation and piggyback
+        LOCK with that operation, and the origin process always keeps
+        the current last operation until the ending synchronization
+        call in order to piggyback UNLOCK/FLUSH with that operation.
+        When it is set to 0, in WIN_LOCK/UNLOCK case, the LOCK message
+        is sent out as early as possible, in WIN_LOCK_ALL/UNLOCK_ALL
+        case, the origin process still tries to piggyback LOCK message
+        with the first operation; for UNLOCK/FLUSH message, the origin
+        process no longer keeps the current last operation but only
+        piggyback UNLOCK/FLUSH if there is an operation avaliable in
+        the ending synchronization call.
+
 === END_MPI_T_CVAR_INFO_BLOCK ===
 */
 
@@ -432,18 +455,12 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
         for (i = 0; i < win_ptr->num_slots; i++) {
             curr_target = win_ptr->slots[i].target_list_head;
             while (curr_target != NULL) {
-                if (curr_target->pending_op_list_head != NULL) {
-                    if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL) {
-                        curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;
-                    }
-                    /* flag is set in order to decrement complete counter on target */
-                    curr_target->win_complete_flag = 1;
-                }
-                else {
-                    mpi_errno = send_decr_at_cnt_msg(curr_target->target_rank, win_ptr);
-                    if (mpi_errno != MPI_SUCCESS)
-                        MPIU_ERR_POP(mpi_errno);
+                if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL) {
+                    curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;
                 }
+                /* flag is set in order to decrement complete counter on target */
+                curr_target->win_complete_flag = 1;
+
                 curr_target = curr_target->next;
             }
         }
@@ -1046,13 +1063,22 @@ int MPIDI_Win_lock(int lock_type, int dest, int assert, MPID_Win * win_ptr)
 
     /* If Destination is myself or process on SHM, acquire the lock,
      * wait until lock is granted. */
-    if (!(assert & MPI_MODE_NOCHECK) && (dest == rank || shm_target)) {
-        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, dest, &made_progress);
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
+    if (!(assert & MPI_MODE_NOCHECK)) {
+        if (dest == rank || shm_target) {
+            mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, dest, &made_progress);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
 
-        while (target->access_state != MPIDI_RMA_LOCK_GRANTED) {
-            mpi_errno = wait_progress_engine();
+            while (target->access_state != MPIDI_RMA_LOCK_GRANTED) {
+                mpi_errno = wait_progress_engine();
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+            }
+        }
+        else if (!MPIR_CVAR_CH3_RMA_DELAY_ISSUING_FOR_PIGGYBACKING) {
+            /* if DELAY_ISSUING_FOR_PIGGYBACKING is turned off, send lock request now
+             * since we do not want to piggyback LOCK with future OP */
+            mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, dest, &made_progress);
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
         }

http://git.mpich.org/mpich.git/commitdiff/93a25439016411d1bd9a0cf473beeb897a40420c

commit 93a25439016411d1bd9a0cf473beeb897a40420c
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Wed Jun 17 22:19:46 2015 -0500

    use ALL_STREAM_UNITS_ISSUED to indicate if all stream units are issued out.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index cd32c7d..5a0430e 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -548,6 +548,7 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     /* --END ERROR HANDLING-- */
 }
 
+#define ALL_STREAM_UNITS_ISSUED (-1)
 
 /* issue_acc_op() send ACC packet header and data. */
 #undef FUNCNAME
@@ -685,7 +686,7 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     }   /* end of for loop */
 
     /* Mark that all stream units have been issued */
-    rma_op->issued_stream_count = -1;
+    rma_op->issued_stream_count = ALL_STREAM_UNITS_ISSUED;
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_ACC_OP);
@@ -933,7 +934,7 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     }   /* end of for loop */
 
     /* Mark that all stream units have been issued */
-    rma_op->issued_stream_count = -1;
+    rma_op->issued_stream_count = ALL_STREAM_UNITS_ISSUED;
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_GET_ACC_OP);
diff --git a/src/mpid/ch3/src/ch3u_rma_progress.c b/src/mpid/ch3/src/ch3u_rma_progress.c
index c56c658..1b69a20 100644
--- a/src/mpid/ch3/src/ch3u_rma_progress.c
+++ b/src/mpid/ch3/src/ch3u_rma_progress.c
@@ -378,7 +378,8 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
         }
 
         if ((curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
-             curr_op->pkt.type == MPIDI_CH3_PKT_GET_ACCUM) && curr_op->issued_stream_count > 0) {
+             curr_op->pkt.type == MPIDI_CH3_PKT_GET_ACCUM) &&
+            curr_op->issued_stream_count != ALL_STREAM_UNITS_ISSUED) {
             /* For ACC-like operations, if not all stream units
              * are issued out, we stick to the current operation,
              * otherwise we move on to the next operation. */

http://git.mpich.org/mpich.git/commitdiff/a9849cd112376455d2459ac08d4dcc7dace043f4

commit a9849cd112376455d2459ac08d4dcc7dace043f4
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Jun 23 11:53:43 2015 -0500

    Bug-fix: always allocate target lock entry pool in win_init.
    
    We should always allocate the target lock entry pool in win_init,
    even if the info key no_locks is set to TRUE during window creation,
    because that info can be set to FALSE by the user after the window
    has been created.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index 6fbca30..016f4a3 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -374,19 +374,17 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
         (*win_ptr)->slots[i].target_list_tail = NULL;
     }
 
-    if (!(*win_ptr)->info_args.no_locks) {
-        MPIU_CHKPMEM_MALLOC((*win_ptr)->target_lock_entry_pool_start,
-                            MPIDI_RMA_Target_lock_entry_t *,
-                            sizeof(MPIDI_RMA_Target_lock_entry_t) *
-                            MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE, mpi_errno,
-                            "RMA lock entry pool");
-        (*win_ptr)->target_lock_entry_pool_head = NULL;
-        (*win_ptr)->target_lock_entry_pool_tail = NULL;
-        for (i = 0; i < MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE; i++) {
-            MPL_LL_APPEND((*win_ptr)->target_lock_entry_pool_head,
-                          (*win_ptr)->target_lock_entry_pool_tail,
-                          &((*win_ptr)->target_lock_entry_pool_start[i]));
-        }
+    MPIU_CHKPMEM_MALLOC((*win_ptr)->target_lock_entry_pool_start,
+                        MPIDI_RMA_Target_lock_entry_t *,
+                        sizeof(MPIDI_RMA_Target_lock_entry_t) *
+                        MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE, mpi_errno,
+                        "RMA lock entry pool");
+    (*win_ptr)->target_lock_entry_pool_head = NULL;
+    (*win_ptr)->target_lock_entry_pool_tail = NULL;
+    for (i = 0; i < MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE; i++) {
+        MPL_LL_APPEND((*win_ptr)->target_lock_entry_pool_head,
+                      (*win_ptr)->target_lock_entry_pool_tail,
+                      &((*win_ptr)->target_lock_entry_pool_start[i]));
     }
 
     /* enqueue window into the global list */
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index 35f5d2b..a48f6f7 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -228,9 +228,8 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
     MPIU_Free((*win_ptr)->op_pool_start);
     MPIU_Free((*win_ptr)->target_pool_start);
     MPIU_Free((*win_ptr)->slots);
-    if ((*win_ptr)->target_lock_entry_pool_start != NULL) {
-        MPIU_Free((*win_ptr)->target_lock_entry_pool_start);
-    }
+    MPIU_Free((*win_ptr)->target_lock_entry_pool_start);
+
     MPIU_Assert((*win_ptr)->current_target_lock_data_bytes == 0);
 
     /* Free the attached buffer for windows created with MPI_Win_allocate() */

http://git.mpich.org/mpich.git/commitdiff/10e3c6447fff99259854b93b294fef4ad4a1e731

commit 10e3c6447fff99259854b93b294fef4ad4a1e731
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Jun 21 10:26:41 2015 -0500

    Add prefix "target_" to lock_entry-related names and suffix "_head" to linked-list head pointers.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_lockqueue.h b/src/mpid/ch3/include/mpid_rma_lockqueue.h
index 3fab2e4..78dda5d 100644
--- a/src/mpid/ch3/include/mpid_rma_lockqueue.h
+++ b/src/mpid/ch3/include/mpid_rma_lockqueue.h
@@ -13,20 +13,23 @@
 MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_lockqueue_alloc);
 MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_winlock_getlocallock);
 
-/* MPIDI_CH3I_Win_lock_entry_alloc(): return a new lock queue entry and
+/* MPIDI_CH3I_Win_target_lock_entry_alloc(): return a new lock queue entry and
  * initialize it. If we cannot get one, return NULL. */
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_Win_lock_entry_alloc
+#define FUNCNAME MPIDI_CH3I_Win_target_lock_entry_alloc
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline MPIDI_RMA_Lock_entry_t *MPIDI_CH3I_Win_lock_entry_alloc(MPID_Win * win_ptr,
-                                                                      MPIDI_CH3_Pkt_t * pkt)
+static inline MPIDI_RMA_Target_lock_entry_t *MPIDI_CH3I_Win_target_lock_entry_alloc(MPID_Win *
+                                                                                    win_ptr,
+                                                                                    MPIDI_CH3_Pkt_t
+                                                                                    * pkt)
 {
-    MPIDI_RMA_Lock_entry_t *new_ptr = NULL;
+    MPIDI_RMA_Target_lock_entry_t *new_ptr = NULL;
 
-    if (win_ptr->lock_entry_pool != NULL) {
-        new_ptr = win_ptr->lock_entry_pool;
-        MPL_LL_DELETE(win_ptr->lock_entry_pool, win_ptr->lock_entry_pool_tail, new_ptr);
+    if (win_ptr->target_lock_entry_pool_head != NULL) {
+        new_ptr = win_ptr->target_lock_entry_pool_head;
+        MPL_LL_DELETE(win_ptr->target_lock_entry_pool_head, win_ptr->target_lock_entry_pool_tail,
+                      new_ptr);
     }
 
     if (new_ptr != NULL) {
@@ -41,25 +44,27 @@ static inline MPIDI_RMA_Lock_entry_t *MPIDI_CH3I_Win_lock_entry_alloc(MPID_Win *
     return new_ptr;
 }
 
-/* MPIDI_CH3I_Win_lock_entry_free(): put a lock queue entry back to
+/* MPIDI_CH3I_Win_target_lock_entry_free(): put a lock queue entry back to
  * the global pool. */
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_Win_lock_entry_free
+#define FUNCNAME MPIDI_CH3I_Win_target_lock_entry_free
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int MPIDI_CH3I_Win_lock_entry_free(MPID_Win * win_ptr,
-                                                 MPIDI_RMA_Lock_entry_t * lock_entry)
+static inline int MPIDI_CH3I_Win_target_lock_entry_free(MPID_Win * win_ptr,
+                                                        MPIDI_RMA_Target_lock_entry_t *
+                                                        target_lock_entry)
 {
     int mpi_errno = MPI_SUCCESS;
 
-    if (lock_entry->data != NULL) {
-        win_ptr->current_lock_data_bytes -= lock_entry->buf_size;
-        MPIU_Free(lock_entry->data);
+    if (target_lock_entry->data != NULL) {
+        win_ptr->current_target_lock_data_bytes -= target_lock_entry->buf_size;
+        MPIU_Free(target_lock_entry->data);
     }
 
     /* use PREPEND when return objects back to the pool
      * in order to improve cache performance */
-    MPL_LL_PREPEND(win_ptr->lock_entry_pool, win_ptr->lock_entry_pool_tail, lock_entry);
+    MPL_LL_PREPEND(win_ptr->target_lock_entry_pool_head, win_ptr->target_lock_entry_pool_tail,
+                   target_lock_entry);
 
     return mpi_errno;
 }
diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 95362f6..5737a98 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -16,8 +16,8 @@ int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPID_Win * win_ptr, MPIDI_RMA_Targe
 int MPIDI_CH3I_RMA_Make_progress_target(MPID_Win * win_ptr, int target_rank, int *made_progress);
 int MPIDI_CH3I_RMA_Make_progress_win(MPID_Win * win_ptr, int *made_progress);
 
-extern MPIDI_RMA_Op_t *global_rma_op_pool, *global_rma_op_pool_tail, *global_rma_op_pool_start;
-extern MPIDI_RMA_Target_t *global_rma_target_pool, *global_rma_target_pool_tail,
+extern MPIDI_RMA_Op_t *global_rma_op_pool_head, *global_rma_op_pool_tail, *global_rma_op_pool_start;
+extern MPIDI_RMA_Target_t *global_rma_target_pool_head, *global_rma_target_pool_tail,
     *global_rma_target_pool_start;
 
 MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_alloc);
@@ -52,18 +52,18 @@ static inline MPIDI_RMA_Op_t *MPIDI_CH3I_Win_op_alloc(MPID_Win * win_ptr)
 {
     MPIDI_RMA_Op_t *e;
 
-    if (win_ptr->op_pool == NULL) {
+    if (win_ptr->op_pool_head == NULL) {
         /* local pool is empty, try to find something in the global pool */
-        if (global_rma_op_pool == NULL)
+        if (global_rma_op_pool_head == NULL)
             return NULL;
         else {
-            e = global_rma_op_pool;
-            MPL_LL_DELETE(global_rma_op_pool, global_rma_op_pool_tail, e);
+            e = global_rma_op_pool_head;
+            MPL_LL_DELETE(global_rma_op_pool_head, global_rma_op_pool_tail, e);
         }
     }
     else {
-        e = win_ptr->op_pool;
-        MPL_LL_DELETE(win_ptr->op_pool, win_ptr->op_pool_tail, e);
+        e = win_ptr->op_pool_head;
+        MPL_LL_DELETE(win_ptr->op_pool_head, win_ptr->op_pool_tail, e);
     }
 
     e->dataloop = NULL;
@@ -101,9 +101,9 @@ static inline int MPIDI_CH3I_Win_op_free(MPID_Win * win_ptr, MPIDI_RMA_Op_t * e)
     /* use PREPEND when return objects back to the pool
      * in order to improve cache performance */
     if (e->pool_type == MPIDI_RMA_POOL_WIN)
-        MPL_LL_PREPEND(win_ptr->op_pool, win_ptr->op_pool_tail, e);
+        MPL_LL_PREPEND(win_ptr->op_pool_head, win_ptr->op_pool_tail, e);
     else
-        MPL_LL_PREPEND(global_rma_op_pool, global_rma_op_pool_tail, e);
+        MPL_LL_PREPEND(global_rma_op_pool_head, global_rma_op_pool_tail, e);
 
     return mpi_errno;
 }
@@ -118,18 +118,18 @@ static inline MPIDI_RMA_Target_t *MPIDI_CH3I_Win_target_alloc(MPID_Win * win_ptr
 {
     MPIDI_RMA_Target_t *e;
 
-    if (win_ptr->target_pool == NULL) {
+    if (win_ptr->target_pool_head == NULL) {
         /* local pool is empty, try to find something in the global pool */
-        if (global_rma_target_pool == NULL)
+        if (global_rma_target_pool_head == NULL)
             return NULL;
         else {
-            e = global_rma_target_pool;
-            MPL_LL_DELETE(global_rma_target_pool, global_rma_target_pool_tail, e);
+            e = global_rma_target_pool_head;
+            MPL_LL_DELETE(global_rma_target_pool_head, global_rma_target_pool_tail, e);
         }
     }
     else {
-        e = win_ptr->target_pool;
-        MPL_LL_DELETE(win_ptr->target_pool, win_ptr->target_pool_tail, e);
+        e = win_ptr->target_pool_head;
+        MPL_LL_DELETE(win_ptr->target_pool_head, win_ptr->target_pool_tail, e);
     }
 
     e->issued_read_op_list_head = e->issued_read_op_list_tail = NULL;
@@ -173,9 +173,9 @@ static inline int MPIDI_CH3I_Win_target_free(MPID_Win * win_ptr, MPIDI_RMA_Targe
     /* use PREPEND when return objects back to the pool
      * in order to improve cache performance */
     if (e->pool_type == MPIDI_RMA_POOL_WIN)
-        MPL_LL_PREPEND(win_ptr->target_pool, win_ptr->target_pool_tail, e);
+        MPL_LL_PREPEND(win_ptr->target_pool_head, win_ptr->target_pool_tail, e);
     else
-        MPL_LL_PREPEND(global_rma_target_pool, global_rma_target_pool_tail, e);
+        MPL_LL_PREPEND(global_rma_target_pool_head, global_rma_target_pool_tail, e);
 
     return mpi_errno;
 }
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index b3e1b30..f2ba4c3 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -129,14 +129,14 @@ typedef struct MPIDI_RMA_Win_list {
 
 extern MPIDI_RMA_Win_list_t *MPIDI_RMA_Win_list, *MPIDI_RMA_Win_list_tail;
 
-typedef struct MPIDI_RMA_Lock_entry {
-    struct MPIDI_RMA_Lock_entry *next;
+typedef struct MPIDI_RMA_Target_lock_entry {
+    struct MPIDI_RMA_Target_lock_entry *next;
     MPIDI_CH3_Pkt_t pkt;        /* all information for this request packet */
     MPIDI_VC_t *vc;
     void *data;                 /* for queued PUTs / ACCs / GACCs, data is copied here */
     int buf_size;
     int all_data_recved;        /* indicate if all data has been received */
-} MPIDI_RMA_Lock_entry_t;
+} MPIDI_RMA_Target_lock_entry_t;
 
 typedef MPIDI_RMA_Op_t *MPIDI_RMA_Ops_list_t;
 
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index 2fd203b..9f57a89 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -308,17 +308,17 @@ typedef struct MPIDI_Win_basic_info {
     volatile int current_lock_type;   /* current lock type on this window (as target)   \
                               * (none, shared, exclusive) */             \
     volatile int shared_lock_ref_cnt;                                    \
-    struct MPIDI_RMA_Lock_entry volatile *lock_queue;  /* list of unsatisfied locks */  \
-    struct MPIDI_RMA_Lock_entry volatile *lock_queue_tail; /* tail of unstaisfied locks. */ \
+    struct MPIDI_RMA_Target_lock_entry volatile *target_lock_queue_head;  /* list of unsatisfied locks */  \
+    struct MPIDI_RMA_Target_lock_entry volatile *target_lock_queue_tail; /* tail of unstaisfied locks. */ \
                                                                          \
     struct MPIDI_Win_info_args info_args;                                \
     int shm_allocated; /* flag: TRUE iff this window has a shared memory \
                           region associated with it */                   \
     struct MPIDI_RMA_Op *op_pool_start; /* start pointer used for freeing */\
-    struct MPIDI_RMA_Op *op_pool;  /* pool of operations */              \
+    struct MPIDI_RMA_Op *op_pool_head;  /* pool of operations */              \
     struct MPIDI_RMA_Op *op_pool_tail; /* tail pointer to pool of operations. */ \
     struct MPIDI_RMA_Target *target_pool_start; /* start pointer used for freeing */\
-    struct MPIDI_RMA_Target *target_pool; /* pool of targets */          \
+    struct MPIDI_RMA_Target *target_pool_head; /* pool of targets */          \
     struct MPIDI_RMA_Target *target_pool_tail; /* tail pointer to pool of targets */\
     struct MPIDI_RMA_Slot *slots;                                        \
     int num_slots;                                                       \
@@ -339,10 +339,10 @@ typedef struct MPIDI_Win_basic_info {
     int outstanding_locks; /* when issuing multiple lock requests in     \
                             MPI_WIN_LOCK_ALL, this counter keeps track   \
                             of number of locks not being granted yet. */ \
-    struct MPIDI_RMA_Lock_entry *lock_entry_pool_start;                  \
-    struct MPIDI_RMA_Lock_entry *lock_entry_pool;                        \
-    struct MPIDI_RMA_Lock_entry *lock_entry_pool_tail;                   \
-    int current_lock_data_bytes;                                         \
+    struct MPIDI_RMA_Target_lock_entry *target_lock_entry_pool_start;   \
+    struct MPIDI_RMA_Target_lock_entry *target_lock_entry_pool_head;    \
+    struct MPIDI_RMA_Target_lock_entry *target_lock_entry_pool_tail;    \
+    int current_target_lock_data_bytes;                                 \
 
 #ifdef MPIDI_CH3_WIN_DECL
 #define MPID_DEV_WIN_DECL \
@@ -433,7 +433,7 @@ typedef struct MPIDI_Request {
     MPI_Win     target_win_handle;
     MPI_Win     source_win_handle;
     MPIDI_CH3_Pkt_flags_t flags; /* flags that were included in the original RMA packet header */
-    struct MPIDI_RMA_Lock_entry *lock_queue_entry;
+    struct MPIDI_RMA_Target_lock_entry *target_lock_queue_entry;
     MPI_Request resp_request_handle; /* Handle for get_accumulate response */
 
     void *ext_hdr_ptr; /* pointer to extended packet header */
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index 09fe27d..a12c206 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -320,7 +320,7 @@ static inline int enqueue_lock_origin(MPID_Win * win_ptr, MPIDI_VC_t * vc,
                                       MPIDI_CH3_Pkt_t * pkt,
                                       MPIDI_msg_sz_t * buflen, MPID_Request ** reqp)
 {
-    MPIDI_RMA_Lock_entry_t *new_ptr = NULL;
+    MPIDI_RMA_Target_lock_entry_t *new_ptr = NULL;
     MPIDI_CH3_Pkt_flags_t flag;
     MPI_Win source_win_handle;
     MPI_Request request_handle;
@@ -329,9 +329,9 @@ static inline int enqueue_lock_origin(MPID_Win * win_ptr, MPIDI_VC_t * vc,
 
     (*reqp) = NULL;
 
-    new_ptr = MPIDI_CH3I_Win_lock_entry_alloc(win_ptr, pkt);
+    new_ptr = MPIDI_CH3I_Win_target_lock_entry_alloc(win_ptr, pkt);
     if (new_ptr != NULL) {
-        MPL_LL_APPEND(win_ptr->lock_queue, win_ptr->lock_queue_tail, new_ptr);
+        MPL_LL_APPEND(win_ptr->target_lock_queue_head, win_ptr->target_lock_queue_tail, new_ptr);
         new_ptr->vc = vc;
     }
     else {
@@ -408,13 +408,14 @@ static inline int enqueue_lock_origin(MPID_Win * win_ptr, MPIDI_VC_t * vc,
         }
 
         if (new_ptr != NULL) {
-            if (win_ptr->current_lock_data_bytes + buf_size < MPIR_CVAR_CH3_RMA_LOCK_DATA_BYTES) {
+            if (win_ptr->current_target_lock_data_bytes + buf_size <
+                MPIR_CVAR_CH3_RMA_TARGET_LOCK_DATA_BYTES) {
                 new_ptr->data = MPIU_Malloc(buf_size);
             }
 
             if (new_ptr->data == NULL) {
                 /* Note that there are two possible reasons to make new_ptr->data to be NULL:
-                 * (1) win_ptr->current_lock_data_bytes + buf_size >= MPIR_CVAR_CH3_RMA_LOCK_DATA_BYTES;
+                 * (1) win_ptr->current_target_lock_data_bytes + buf_size >= MPIR_CVAR_CH3_RMA_TARGET_LOCK_DATA_BYTES;
                  * (2) MPIU_Malloc(buf_size) failed.
                  * In such cases, we cannot allocate memory for lock data, so we give up
                  * buffering lock data, however, we still buffer lock request.
@@ -447,7 +448,7 @@ static inline int enqueue_lock_origin(MPID_Win * win_ptr, MPIDI_VC_t * vc,
                 data_discarded = 1;
             }
             else {
-                win_ptr->current_lock_data_bytes += buf_size;
+                win_ptr->current_target_lock_data_bytes += buf_size;
                 new_ptr->buf_size = buf_size;
             }
         }
@@ -465,7 +466,7 @@ static inline int enqueue_lock_origin(MPID_Win * win_ptr, MPIDI_VC_t * vc,
             req->dev.recv_data_sz = recv_data_sz;
             req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete;
             req->dev.OnFinal = MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete;
-            req->dev.lock_queue_entry = new_ptr;
+            req->dev.target_lock_queue_entry = new_ptr;
 
             data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
             data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
@@ -478,7 +479,7 @@ static inline int enqueue_lock_origin(MPID_Win * win_ptr, MPIDI_VC_t * vc,
             req->dev.recv_data_sz = recv_data_sz;
             req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete;
             req->dev.OnFinal = MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete;
-            req->dev.lock_queue_entry = new_ptr;
+            req->dev.target_lock_queue_entry = new_ptr;
 
             data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
             data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
@@ -746,7 +747,7 @@ static inline int acquire_local_lock(MPID_Win * win_ptr, int lock_type)
         /* Queue the lock information. */
         MPIDI_CH3_Pkt_t pkt;
         MPIDI_CH3_Pkt_lock_t *lock_pkt = &pkt.lock;
-        MPIDI_RMA_Lock_entry_t *new_ptr = NULL;
+        MPIDI_RMA_Target_lock_entry_t *new_ptr = NULL;
         MPIDI_VC_t *my_vc;
 
         MPIDI_Pkt_init(lock_pkt, MPIDI_CH3_PKT_LOCK);
@@ -758,7 +759,7 @@ static inline int acquire_local_lock(MPID_Win * win_ptr, int lock_type)
             lock_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE;
         }
 
-        new_ptr = MPIDI_CH3I_Win_lock_entry_alloc(win_ptr, &pkt);
+        new_ptr = MPIDI_CH3I_Win_target_lock_entry_alloc(win_ptr, &pkt);
         if (new_ptr == NULL) {
             mpi_errno = handle_lock_ack(win_ptr, win_ptr->comm_ptr->rank,
                                         MPIDI_CH3_PKT_FLAG_RMA_LOCK_DISCARDED);
@@ -766,7 +767,7 @@ static inline int acquire_local_lock(MPID_Win * win_ptr, int lock_type)
                 MPIU_ERR_POP(mpi_errno);
             goto fn_exit;
         }
-        MPL_LL_APPEND(win_ptr->lock_queue, win_ptr->lock_queue_tail, new_ptr);
+        MPL_LL_APPEND(win_ptr->target_lock_queue_head, win_ptr->target_lock_queue_tail, new_ptr);
         MPIDI_Comm_get_vc_set_active(win_ptr->comm_ptr, win_ptr->comm_ptr->rank, &my_vc);
         new_ptr->vc = my_vc;
 
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 2f79b48..9873646 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -1061,16 +1061,17 @@ static int create_derived_datatype(MPID_Request * req, MPID_Datatype ** dtp)
 }
 
 
-static inline int perform_put_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_entry_t * lock_entry)
+static inline int perform_put_in_lock_queue(MPID_Win * win_ptr,
+                                            MPIDI_RMA_Target_lock_entry_t * target_lock_entry)
 {
-    MPIDI_CH3_Pkt_put_t *put_pkt = &((lock_entry->pkt).put);
+    MPIDI_CH3_Pkt_put_t *put_pkt = &((target_lock_entry->pkt).put);
     int mpi_errno = MPI_SUCCESS;
 
     /* Piggyback candidate should have basic datatype for target datatype. */
     MPIU_Assert(MPIR_DATATYPE_IS_PREDEFINED(put_pkt->datatype));
 
     /* Make sure that all data is received for this op. */
-    MPIU_Assert(lock_entry->all_data_recved == 1);
+    MPIU_Assert(target_lock_entry->all_data_recved == 1);
 
     if (put_pkt->type == MPIDI_CH3_PKT_PUT_IMMED) {
         /* all data fits in packet header */
@@ -1082,15 +1083,16 @@ static inline int perform_put_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     else {
         MPIU_Assert(put_pkt->type == MPIDI_CH3_PKT_PUT);
 
-        mpi_errno = MPIR_Localcopy(lock_entry->data, put_pkt->count, put_pkt->datatype,
+        mpi_errno = MPIR_Localcopy(target_lock_entry->data, put_pkt->count, put_pkt->datatype,
                                    put_pkt->addr, put_pkt->count, put_pkt->datatype);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
 
     /* do final action */
-    mpi_errno = finish_op_on_target(win_ptr, lock_entry->vc, FALSE /* has no response data */ ,
-                                    put_pkt->flags, put_pkt->source_win_handle);
+    mpi_errno =
+        finish_op_on_target(win_ptr, target_lock_entry->vc, FALSE /* has no response data */ ,
+                            put_pkt->flags, put_pkt->source_win_handle);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
@@ -1100,11 +1102,12 @@ static inline int perform_put_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     goto fn_exit;
 }
 
-static inline int perform_get_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_entry_t * lock_entry)
+static inline int perform_get_in_lock_queue(MPID_Win * win_ptr,
+                                            MPIDI_RMA_Target_lock_entry_t * target_lock_entry)
 {
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_get_resp_t *get_resp_pkt = &upkt.get_resp;
-    MPIDI_CH3_Pkt_get_t *get_pkt = &((lock_entry->pkt).get);
+    MPIDI_CH3_Pkt_get_t *get_pkt = &((target_lock_entry->pkt).get);
     MPID_Request *sreq = NULL;
     MPI_Aint type_size;
     size_t len;
@@ -1117,7 +1120,7 @@ static inline int perform_get_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     MPIU_Assert(MPIR_DATATYPE_IS_PREDEFINED(get_pkt->datatype));
 
     /* Make sure that all data is received for this op. */
-    MPIU_Assert(lock_entry->all_data_recved == 1);
+    MPIU_Assert(target_lock_entry->all_data_recved == 1);
 
     sreq = MPID_Request_create();
     if (sreq == NULL) {
@@ -1170,7 +1173,7 @@ static inline int perform_get_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
         iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
         iovcnt = 1;
 
-        mpi_errno = MPIDI_CH3_iSendv(lock_entry->vc, sreq, iov, iovcnt);
+        mpi_errno = MPIDI_CH3_iSendv(target_lock_entry->vc, sreq, iov, iovcnt);
         if (mpi_errno != MPI_SUCCESS) {
             MPID_Request_release(sreq);
             MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
@@ -1183,7 +1186,7 @@ static inline int perform_get_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
         iov[1].MPID_IOV_LEN = get_pkt->count * type_size;
         iovcnt = 2;
 
-        mpi_errno = MPIDI_CH3_iSendv(lock_entry->vc, sreq, iov, iovcnt);
+        mpi_errno = MPIDI_CH3_iSendv(target_lock_entry->vc, sreq, iov, iovcnt);
         if (mpi_errno != MPI_SUCCESS) {
             MPID_Request_release(sreq);
             MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
@@ -1202,8 +1205,9 @@ static inline int perform_get_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
         sreq->dev.segment_first = 0;
         sreq->dev.segment_size = get_pkt->count * type_size;
 
-        mpi_errno = lock_entry->vc->sendNoncontig_fn(lock_entry->vc, sreq,
-                                                     iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
+        mpi_errno = target_lock_entry->vc->sendNoncontig_fn(target_lock_entry->vc, sreq,
+                                                            iov[0].MPID_IOV_BUF,
+                                                            iov[0].MPID_IOV_LEN);
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
 
@@ -1214,12 +1218,13 @@ static inline int perform_get_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
 }
 
 
-static inline int perform_acc_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_entry_t * lock_entry)
+static inline int perform_acc_in_lock_queue(MPID_Win * win_ptr,
+                                            MPIDI_RMA_Target_lock_entry_t * target_lock_entry)
 {
-    MPIDI_CH3_Pkt_accum_t *acc_pkt = &((lock_entry->pkt).accum);
+    MPIDI_CH3_Pkt_accum_t *acc_pkt = &((target_lock_entry->pkt).accum);
     int mpi_errno = MPI_SUCCESS;
 
-    MPIU_Assert(lock_entry->all_data_recved == 1);
+    MPIU_Assert(target_lock_entry->all_data_recved == 1);
 
     /* Piggyback candidate should have basic datatype for target datatype. */
     MPIU_Assert(MPIR_DATATYPE_IS_PREDEFINED(acc_pkt->datatype));
@@ -1248,7 +1253,7 @@ static inline int perform_acc_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
         /* Note: here stream_offset is 0 because when piggybacking LOCK, we must use
          * the first stream unit. */
         MPIU_Assert(recv_count = (int) recv_count);
-        mpi_errno = do_accumulate_op(lock_entry->data, (int) recv_count, acc_pkt->datatype,
+        mpi_errno = do_accumulate_op(target_lock_entry->data, (int) recv_count, acc_pkt->datatype,
                                      acc_pkt->addr, acc_pkt->count, acc_pkt->datatype,
                                      0, acc_pkt->op);
     }
@@ -1259,8 +1264,9 @@ static inline int perform_acc_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
-    mpi_errno = finish_op_on_target(win_ptr, lock_entry->vc, FALSE /* has no response data */ ,
-                                    acc_pkt->flags, acc_pkt->source_win_handle);
+    mpi_errno =
+        finish_op_on_target(win_ptr, target_lock_entry->vc, FALSE /* has no response data */ ,
+                            acc_pkt->flags, acc_pkt->source_win_handle);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
@@ -1272,11 +1278,11 @@ static inline int perform_acc_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
 
 
 static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
-                                                MPIDI_RMA_Lock_entry_t * lock_entry)
+                                                MPIDI_RMA_Target_lock_entry_t * target_lock_entry)
 {
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_get_accum_resp_t *get_accum_resp_pkt = &upkt.get_accum_resp;
-    MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &((lock_entry->pkt).get_accum);
+    MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &((target_lock_entry->pkt).get_accum);
     MPID_Request *sreq = NULL;
     MPI_Aint type_size;
     size_t len;
@@ -1291,7 +1297,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
     MPIU_Assert(MPIR_DATATYPE_IS_PREDEFINED(get_accum_pkt->datatype));
 
     /* Make sure that all data is received for this op. */
-    MPIU_Assert(lock_entry->all_data_recved == 1);
+    MPIU_Assert(target_lock_entry->all_data_recved == 1);
 
     sreq = MPID_Request_create();
     if (sreq == NULL) {
@@ -1351,7 +1357,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
         iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
         iovcnt = 1;
 
-        mpi_errno = MPIDI_CH3_iSendv(lock_entry->vc, sreq, iov, iovcnt);
+        mpi_errno = MPIDI_CH3_iSendv(target_lock_entry->vc, sreq, iov, iovcnt);
         if (mpi_errno != MPI_SUCCESS) {
             MPID_Request_release(sreq);
             MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
@@ -1437,7 +1443,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
     iov[1].MPID_IOV_LEN = recv_count * type_size;
     iovcnt = 2;
 
-    mpi_errno = MPIDI_CH3_iSendv(lock_entry->vc, sreq, iov, iovcnt);
+    mpi_errno = MPIDI_CH3_iSendv(target_lock_entry->vc, sreq, iov, iovcnt);
     if (mpi_errno != MPI_SUCCESS) {
         MPID_Request_release(sreq);
         MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
@@ -1446,7 +1452,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
     /* Perform ACCUMULATE OP */
 
     MPIU_Assert(recv_count == (int) recv_count);
-    mpi_errno = do_accumulate_op(lock_entry->data, (int) recv_count, get_accum_pkt->datatype,
+    mpi_errno = do_accumulate_op(target_lock_entry->data, (int) recv_count, get_accum_pkt->datatype,
                                  get_accum_pkt->addr, get_accum_pkt->count, get_accum_pkt->datatype,
                                  0, get_accum_pkt->op);
 
@@ -1463,11 +1469,12 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
 }
 
 
-static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_entry_t * lock_entry)
+static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr,
+                                            MPIDI_RMA_Target_lock_entry_t * target_lock_entry)
 {
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_fop_resp_t *fop_resp_pkt = &upkt.fop_resp;
-    MPIDI_CH3_Pkt_fop_t *fop_pkt = &((lock_entry->pkt).fop);
+    MPIDI_CH3_Pkt_fop_t *fop_pkt = &((target_lock_entry->pkt).fop);
     MPID_Request *resp_req = NULL;
     MPI_Aint type_size;
     MPID_IOV iov[MPID_IOV_LIMIT];
@@ -1479,7 +1486,7 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     MPIU_Assert(MPIR_DATATYPE_IS_PREDEFINED(fop_pkt->datatype));
 
     /* Make sure that all data is received for this op. */
-    MPIU_Assert(lock_entry->all_data_recved == 1);
+    MPIU_Assert(target_lock_entry->all_data_recved == 1);
 
     /* FIXME: this function is same with PktHandler_FOP(), should
      * do code refactoring on both of them. */
@@ -1562,10 +1569,11 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
 
     if (fop_pkt->type == MPIDI_CH3_PKT_FOP_IMMED) {
         /* send back the original data */
-        MPIU_THREAD_CS_ENTER(CH3COMM, lock_entry->vc);
+        MPIU_THREAD_CS_ENTER(CH3COMM, target_lock_entry->vc);
         mpi_errno =
-            MPIDI_CH3_iStartMsg(lock_entry->vc, fop_resp_pkt, sizeof(*fop_resp_pkt), &resp_req);
-        MPIU_THREAD_CS_EXIT(CH3COMM, lock_entry->vc);
+            MPIDI_CH3_iStartMsg(target_lock_entry->vc, fop_resp_pkt, sizeof(*fop_resp_pkt),
+                                &resp_req);
+        MPIU_THREAD_CS_EXIT(CH3COMM, target_lock_entry->vc);
         MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
 
         if (resp_req != NULL) {
@@ -1595,7 +1603,7 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
         iov[1].MPID_IOV_LEN = type_size;
         iovcnt = 2;
 
-        mpi_errno = MPIDI_CH3_iSendv(lock_entry->vc, resp_req, iov, iovcnt);
+        mpi_errno = MPIDI_CH3_iSendv(target_lock_entry->vc, resp_req, iov, iovcnt);
         if (mpi_errno != MPI_SUCCESS) {
             MPID_Request_release(resp_req);
             MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
@@ -1609,7 +1617,7 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
                                      fop_pkt->addr, 1, fop_pkt->datatype, 0, fop_pkt->op);
     }
     else {
-        mpi_errno = do_accumulate_op(lock_entry->data, 1, fop_pkt->datatype,
+        mpi_errno = do_accumulate_op(target_lock_entry->data, 1, fop_pkt->datatype,
                                      fop_pkt->addr, 1, fop_pkt->datatype, 0, fop_pkt->op);
     }
 
@@ -1620,7 +1628,7 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
         MPIU_ERR_POP(mpi_errno);
 
     /* do final action */
-    mpi_errno = finish_op_on_target(win_ptr, lock_entry->vc, TRUE /* has response data */ ,
+    mpi_errno = finish_op_on_target(win_ptr, target_lock_entry->vc, TRUE /* has response data */ ,
                                     fop_pkt->flags, MPI_WIN_NULL);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
@@ -1632,11 +1640,12 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
 }
 
 
-static inline int perform_cas_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_entry_t * lock_entry)
+static inline int perform_cas_in_lock_queue(MPID_Win * win_ptr,
+                                            MPIDI_RMA_Target_lock_entry_t * target_lock_entry)
 {
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_cas_resp_t *cas_resp_pkt = &upkt.cas_resp;
-    MPIDI_CH3_Pkt_cas_t *cas_pkt = &((lock_entry->pkt).cas);
+    MPIDI_CH3_Pkt_cas_t *cas_pkt = &((target_lock_entry->pkt).cas);
     MPID_Request *send_req = NULL;
     MPI_Aint len;
     int mpi_errno = MPI_SUCCESS;
@@ -1645,7 +1654,7 @@ static inline int perform_cas_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     MPIU_Assert(MPIR_DATATYPE_IS_PREDEFINED(cas_pkt->datatype));
 
     /* Make sure that all data is received for this op. */
-    MPIU_Assert(lock_entry->all_data_recved == 1);
+    MPIU_Assert(target_lock_entry->all_data_recved == 1);
 
     MPIDI_Pkt_init(cas_resp_pkt, MPIDI_CH3_PKT_CAS_RESP_IMMED);
     cas_resp_pkt->request_handle = cas_pkt->request_handle;
@@ -1668,9 +1677,10 @@ static inline int perform_cas_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     MPIU_Memcpy((void *) &cas_resp_pkt->info.data, cas_pkt->addr, len);
 
     /* Send the response packet */
-    MPIU_THREAD_CS_ENTER(CH3COMM, lock_entry->vc);
-    mpi_errno = MPIDI_CH3_iStartMsg(lock_entry->vc, cas_resp_pkt, sizeof(*cas_resp_pkt), &send_req);
-    MPIU_THREAD_CS_EXIT(CH3COMM, lock_entry->vc);
+    MPIU_THREAD_CS_ENTER(CH3COMM, target_lock_entry->vc);
+    mpi_errno =
+        MPIDI_CH3_iStartMsg(target_lock_entry->vc, cas_resp_pkt, sizeof(*cas_resp_pkt), &send_req);
+    MPIU_THREAD_CS_EXIT(CH3COMM, target_lock_entry->vc);
 
     MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
 
@@ -1702,7 +1712,7 @@ static inline int perform_cas_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     }
 
     /* do final action */
-    mpi_errno = finish_op_on_target(win_ptr, lock_entry->vc, TRUE /* has response data */ ,
+    mpi_errno = finish_op_on_target(win_ptr, target_lock_entry->vc, TRUE /* has response data */ ,
                                     cas_pkt->flags, MPI_WIN_NULL);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
@@ -1714,27 +1724,28 @@ static inline int perform_cas_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
 }
 
 
-static inline int perform_op_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_entry_t * lock_entry)
+static inline int perform_op_in_lock_queue(MPID_Win * win_ptr,
+                                           MPIDI_RMA_Target_lock_entry_t * target_lock_entry)
 {
     int mpi_errno = MPI_SUCCESS;
 
-    if (lock_entry->pkt.type == MPIDI_CH3_PKT_LOCK) {
+    if (target_lock_entry->pkt.type == MPIDI_CH3_PKT_LOCK) {
 
         /* single LOCK request */
 
-        MPIDI_CH3_Pkt_lock_t *lock_pkt = &(lock_entry->pkt.lock);
+        MPIDI_CH3_Pkt_lock_t *lock_pkt = &(target_lock_entry->pkt.lock);
         MPIDI_VC_t *my_vc = NULL;
 
         MPIDI_Comm_get_vc_set_active(win_ptr->comm_ptr, win_ptr->comm_ptr->rank, &my_vc);
 
-        if (lock_entry->vc == my_vc) {
+        if (target_lock_entry->vc == my_vc) {
             mpi_errno = handle_lock_ack(win_ptr, win_ptr->comm_ptr->rank,
                                         MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED);
             if (mpi_errno)
                 MPIU_ERR_POP(mpi_errno);
         }
         else {
-            mpi_errno = MPIDI_CH3I_Send_lock_ack_pkt(lock_entry->vc, win_ptr,
+            mpi_errno = MPIDI_CH3I_Send_lock_ack_pkt(target_lock_entry->vc, win_ptr,
                                                      MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED,
                                                      lock_pkt->source_win_handle,
                                                      lock_pkt->request_handle);
@@ -1744,44 +1755,44 @@ static inline int perform_op_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_en
     }
     else {
         /* LOCK+OP packet */
-        switch (lock_entry->pkt.type) {
+        switch (target_lock_entry->pkt.type) {
         case (MPIDI_CH3_PKT_PUT):
         case (MPIDI_CH3_PKT_PUT_IMMED):
-            mpi_errno = perform_put_in_lock_queue(win_ptr, lock_entry);
+            mpi_errno = perform_put_in_lock_queue(win_ptr, target_lock_entry);
             if (mpi_errno)
                 MPIU_ERR_POP(mpi_errno);
             break;
         case (MPIDI_CH3_PKT_GET):
-            mpi_errno = perform_get_in_lock_queue(win_ptr, lock_entry);
+            mpi_errno = perform_get_in_lock_queue(win_ptr, target_lock_entry);
             if (mpi_errno)
                 MPIU_ERR_POP(mpi_errno);
             break;
         case (MPIDI_CH3_PKT_ACCUMULATE):
         case (MPIDI_CH3_PKT_ACCUMULATE_IMMED):
-            mpi_errno = perform_acc_in_lock_queue(win_ptr, lock_entry);
+            mpi_errno = perform_acc_in_lock_queue(win_ptr, target_lock_entry);
             if (mpi_errno)
                 MPIU_ERR_POP(mpi_errno);
             break;
         case (MPIDI_CH3_PKT_GET_ACCUM):
         case (MPIDI_CH3_PKT_GET_ACCUM_IMMED):
-            mpi_errno = perform_get_acc_in_lock_queue(win_ptr, lock_entry);
+            mpi_errno = perform_get_acc_in_lock_queue(win_ptr, target_lock_entry);
             if (mpi_errno)
                 MPIU_ERR_POP(mpi_errno);
             break;
         case (MPIDI_CH3_PKT_FOP):
         case (MPIDI_CH3_PKT_FOP_IMMED):
-            mpi_errno = perform_fop_in_lock_queue(win_ptr, lock_entry);
+            mpi_errno = perform_fop_in_lock_queue(win_ptr, target_lock_entry);
             if (mpi_errno)
                 MPIU_ERR_POP(mpi_errno);
             break;
         case (MPIDI_CH3_PKT_CAS_IMMED):
-            mpi_errno = perform_cas_in_lock_queue(win_ptr, lock_entry);
+            mpi_errno = perform_cas_in_lock_queue(win_ptr, target_lock_entry);
             if (mpi_errno)
                 MPIU_ERR_POP(mpi_errno);
             break;
         default:
             MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER,
-                                 "**invalidpkt", "**invalidpkt %d", lock_entry->pkt.type);
+                                 "**invalidpkt", "**invalidpkt %d", target_lock_entry->pkt.type);
         }
     }
 
@@ -1803,7 +1814,7 @@ static int entered_count = 0;
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_CH3I_Release_lock(MPID_Win * win_ptr)
 {
-    MPIDI_RMA_Lock_entry_t *lock_entry, *lock_entry_next;
+    MPIDI_RMA_Target_lock_entry_t *target_lock_entry, *target_lock_entry_next;
     int requested_lock, mpi_errno = MPI_SUCCESS, temp_entered_count;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_RELEASE_LOCK);
 
@@ -1848,13 +1859,13 @@ int MPIDI_CH3I_Release_lock(MPID_Win * win_ptr)
              * only that one. */
 
             /* FIXME: MT: All queue accesses need to be made atomic */
-            lock_entry = (MPIDI_RMA_Lock_entry_t *) win_ptr->lock_queue;
-            while (lock_entry) {
-                lock_entry_next = lock_entry->next;
+            target_lock_entry = (MPIDI_RMA_Target_lock_entry_t *) win_ptr->target_lock_queue_head;
+            while (target_lock_entry) {
+                target_lock_entry_next = target_lock_entry->next;
 
-                if (lock_entry->all_data_recved) {
+                if (target_lock_entry->all_data_recved) {
                     MPIDI_CH3_Pkt_flags_t flags;
-                    MPIDI_CH3_PKT_RMA_GET_FLAGS(lock_entry->pkt, flags, mpi_errno);
+                    MPIDI_CH3_PKT_RMA_GET_FLAGS(target_lock_entry->pkt, flags, mpi_errno);
                     if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED)
                         requested_lock = MPI_LOCK_SHARED;
                     else {
@@ -1863,15 +1874,17 @@ int MPIDI_CH3I_Release_lock(MPID_Win * win_ptr)
                     }
                     if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, requested_lock) == 1) {
                         /* dequeue entry from lock queue */
-                        MPL_LL_DELETE(win_ptr->lock_queue, win_ptr->lock_queue_tail, lock_entry);
+                        MPL_LL_DELETE(win_ptr->target_lock_queue_head,
+                                      win_ptr->target_lock_queue_tail, target_lock_entry);
 
                         /* perform this OP */
-                        mpi_errno = perform_op_in_lock_queue(win_ptr, lock_entry);
+                        mpi_errno = perform_op_in_lock_queue(win_ptr, target_lock_entry);
                         if (mpi_errno != MPI_SUCCESS)
                             MPIU_ERR_POP(mpi_errno);
 
                         /* free this entry */
-                        mpi_errno = MPIDI_CH3I_Win_lock_entry_free(win_ptr, lock_entry);
+                        mpi_errno =
+                            MPIDI_CH3I_Win_target_lock_entry_free(win_ptr, target_lock_entry);
                         if (mpi_errno != MPI_SUCCESS)
                             MPIU_ERR_POP(mpi_errno);
 
@@ -1881,7 +1894,7 @@ int MPIDI_CH3I_Release_lock(MPID_Win * win_ptr)
                             break;
                     }
                 }
-                lock_entry = lock_entry_next;
+                target_lock_entry = target_lock_entry_next;
             }
         } while (temp_entered_count != entered_count);
 
@@ -1908,7 +1921,7 @@ int MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete(MPIDI_VC_t * vc,
     MPI_Win target_win_handle;
     MPID_Win *win_ptr = NULL;
     MPIDI_CH3_Pkt_flags_t flags;
-    MPIDI_RMA_Lock_entry_t *lock_queue_entry = rreq->dev.lock_queue_entry;
+    MPIDI_RMA_Target_lock_entry_t *target_lock_queue_entry = rreq->dev.target_lock_queue_entry;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_PIGGYBACKLOCKOPRECVCOMPLETE);
 
@@ -1919,25 +1932,26 @@ int MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete(MPIDI_VC_t * vc,
 
     /* Note that if we decided to drop op data, here we just need to complete this
      * request; otherwise we try to get the lock again in this handler. */
-    if (rreq->dev.lock_queue_entry != NULL) {
+    if (rreq->dev.target_lock_queue_entry != NULL) {
 
         /* Mark all data received in lock queue entry */
-        lock_queue_entry->all_data_recved = 1;
+        target_lock_queue_entry->all_data_recved = 1;
 
         /* try to acquire the lock here */
-        MPIDI_CH3_PKT_RMA_GET_FLAGS(lock_queue_entry->pkt, flags, mpi_errno);
-        MPIDI_CH3_PKT_RMA_GET_TARGET_WIN_HANDLE(lock_queue_entry->pkt, target_win_handle,
+        MPIDI_CH3_PKT_RMA_GET_FLAGS(target_lock_queue_entry->pkt, flags, mpi_errno);
+        MPIDI_CH3_PKT_RMA_GET_TARGET_WIN_HANDLE(target_lock_queue_entry->pkt, target_win_handle,
                                                 mpi_errno);
         MPID_Win_get_ptr(target_win_handle, win_ptr);
 
-        if (flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM && (rreq->dev.lock_queue_entry)->data != NULL) {
+        if (flags & MPIDI_CH3_PKT_FLAG_RMA_STREAM &&
+            (rreq->dev.target_lock_queue_entry)->data != NULL) {
 
-            MPIU_Assert(lock_queue_entry->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
-                        lock_queue_entry->pkt.type == MPIDI_CH3_PKT_GET_ACCUM);
+            MPIU_Assert(target_lock_queue_entry->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
+                        target_lock_queue_entry->pkt.type == MPIDI_CH3_PKT_GET_ACCUM);
 
             int ext_hdr_sz;
 
-            if (lock_queue_entry->pkt.type == MPIDI_CH3_PKT_ACCUMULATE)
+            if (target_lock_queue_entry->pkt.type == MPIDI_CH3_PKT_ACCUMULATE)
                 ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_accum_t);
             else
                 ext_hdr_sz = sizeof(MPIDI_CH3_Ext_pkt_get_accum_t);
@@ -1945,8 +1959,8 @@ int MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete(MPIDI_VC_t * vc,
             /* here we drop the stream_offset received, because the stream unit that piggybacked with
              * LOCK must be the first stream unit, with stream_offset equals to 0. */
             rreq->dev.recv_data_sz -= ext_hdr_sz;
-            memmove((rreq->dev.lock_queue_entry)->data,
-                    (void *) ((char *) ((rreq->dev.lock_queue_entry)->data) + ext_hdr_sz),
+            memmove((rreq->dev.target_lock_queue_entry)->data,
+                    (void *) ((char *) ((rreq->dev.target_lock_queue_entry)->data) + ext_hdr_sz),
                     rreq->dev.recv_data_sz);
         }
 
@@ -1960,15 +1974,16 @@ int MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete(MPIDI_VC_t * vc,
 
         if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, requested_lock) == 1) {
             /* dequeue entry from lock queue */
-            MPL_LL_DELETE(win_ptr->lock_queue, win_ptr->lock_queue_tail, lock_queue_entry);
+            MPL_LL_DELETE(win_ptr->target_lock_queue_head, win_ptr->target_lock_queue_tail,
+                          target_lock_queue_entry);
 
             /* perform this OP */
-            mpi_errno = perform_op_in_lock_queue(win_ptr, lock_queue_entry);
+            mpi_errno = perform_op_in_lock_queue(win_ptr, target_lock_queue_entry);
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
 
             /* free this entry */
-            mpi_errno = MPIDI_CH3I_Win_lock_entry_free(win_ptr, lock_queue_entry);
+            mpi_errno = MPIDI_CH3I_Win_target_lock_entry_free(win_ptr, target_lock_queue_entry);
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
         }
diff --git a/src/mpid/ch3/src/ch3u_request.c b/src/mpid/ch3/src/ch3u_request.c
index b2cd014..d62ae80 100644
--- a/src/mpid/ch3/src/ch3u_request.c
+++ b/src/mpid/ch3/src/ch3u_request.c
@@ -83,7 +83,7 @@ MPID_Request * MPID_Request_create(void)
 	   request for RMA operations */
 	req->dev.target_win_handle = MPI_WIN_NULL;
 	req->dev.source_win_handle = MPI_WIN_NULL;
-        req->dev.lock_queue_entry  = NULL;
+        req->dev.target_lock_queue_entry = NULL;
 	req->dev.dtype_info	   = NULL;
 	req->dev.dataloop	   = NULL;
 	req->dev.iov_offset        = 0;
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index ea10390..6fbca30 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -23,7 +23,7 @@ cvars:
         a linked list of target elements. The distribution of ranks among
         slots follows a round-robin pattern. Requires a positive value.
 
-    - name        : MPIR_CVAR_CH3_RMA_LOCK_DATA_BYTES
+    - name        : MPIR_CVAR_CH3_RMA_TARGET_LOCK_DATA_BYTES
       category    : CH3
       type        : int
       default     : 655360
@@ -309,8 +309,8 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     /* (*win_ptr)->basic_info_table[] is set by caller; */
     (*win_ptr)->current_lock_type = MPID_LOCK_NONE;
     (*win_ptr)->shared_lock_ref_cnt = 0;
-    (*win_ptr)->lock_queue = NULL;
-    (*win_ptr)->lock_queue_tail = NULL;
+    (*win_ptr)->target_lock_queue_head = NULL;
+    (*win_ptr)->target_lock_queue_tail = NULL;
     (*win_ptr)->shm_allocated = FALSE;
     (*win_ptr)->states.access_state = MPIDI_RMA_NONE;
     (*win_ptr)->states.exposure_state = MPIDI_RMA_NONE;
@@ -323,7 +323,7 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->lock_all_assert = 0;
     (*win_ptr)->lock_epoch_count = 0;
     (*win_ptr)->outstanding_locks = 0;
-    (*win_ptr)->current_lock_data_bytes = 0;
+    (*win_ptr)->current_target_lock_data_bytes = 0;
 
     /* Initialize the info flags */
     (*win_ptr)->info_args.no_locks = 0;
@@ -345,11 +345,11 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     MPIU_CHKPMEM_MALLOC((*win_ptr)->op_pool_start, MPIDI_RMA_Op_t *,
                         sizeof(MPIDI_RMA_Op_t) * MPIR_CVAR_CH3_RMA_OP_WIN_POOL_SIZE, mpi_errno,
                         "RMA op pool");
-    (*win_ptr)->op_pool = NULL;
+    (*win_ptr)->op_pool_head = NULL;
     (*win_ptr)->op_pool_tail = NULL;
     for (i = 0; i < MPIR_CVAR_CH3_RMA_OP_WIN_POOL_SIZE; i++) {
         (*win_ptr)->op_pool_start[i].pool_type = MPIDI_RMA_POOL_WIN;
-        MPL_LL_APPEND((*win_ptr)->op_pool, (*win_ptr)->op_pool_tail,
+        MPL_LL_APPEND((*win_ptr)->op_pool_head, (*win_ptr)->op_pool_tail,
                       &((*win_ptr)->op_pool_start[i]));
     }
 
@@ -358,11 +358,11 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     MPIU_CHKPMEM_MALLOC((*win_ptr)->target_pool_start, MPIDI_RMA_Target_t *,
                         sizeof(MPIDI_RMA_Target_t) * win_target_pool_size, mpi_errno,
                         "RMA target pool");
-    (*win_ptr)->target_pool = NULL;
+    (*win_ptr)->target_pool_head = NULL;
     (*win_ptr)->target_pool_tail = NULL;
     for (i = 0; i < win_target_pool_size; i++) {
         (*win_ptr)->target_pool_start[i].pool_type = MPIDI_RMA_POOL_WIN;
-        MPL_LL_APPEND((*win_ptr)->target_pool, (*win_ptr)->target_pool_tail,
+        MPL_LL_APPEND((*win_ptr)->target_pool_head, (*win_ptr)->target_pool_tail,
                       &((*win_ptr)->target_pool_start[i]));
     }
 
@@ -375,15 +375,17 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     }
 
     if (!(*win_ptr)->info_args.no_locks) {
-        MPIU_CHKPMEM_MALLOC((*win_ptr)->lock_entry_pool_start, MPIDI_RMA_Lock_entry_t *,
-                            sizeof(MPIDI_RMA_Lock_entry_t) *
-                            MPIR_CVAR_CH3_RMA_LOCK_ENTRY_WIN_POOL_SIZE, mpi_errno,
+        MPIU_CHKPMEM_MALLOC((*win_ptr)->target_lock_entry_pool_start,
+                            MPIDI_RMA_Target_lock_entry_t *,
+                            sizeof(MPIDI_RMA_Target_lock_entry_t) *
+                            MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE, mpi_errno,
                             "RMA lock entry pool");
-        (*win_ptr)->lock_entry_pool = NULL;
-        (*win_ptr)->lock_entry_pool_tail = NULL;
-        for (i = 0; i < MPIR_CVAR_CH3_RMA_LOCK_ENTRY_WIN_POOL_SIZE; i++) {
-            MPL_LL_APPEND((*win_ptr)->lock_entry_pool, (*win_ptr)->lock_entry_pool_tail,
-                          &((*win_ptr)->lock_entry_pool_start[i]));
+        (*win_ptr)->target_lock_entry_pool_head = NULL;
+        (*win_ptr)->target_lock_entry_pool_tail = NULL;
+        for (i = 0; i < MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE; i++) {
+            MPL_LL_APPEND((*win_ptr)->target_lock_entry_pool_head,
+                          (*win_ptr)->target_lock_entry_pool_tail,
+                          &((*win_ptr)->target_lock_entry_pool_start[i]));
         }
     }
 
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index 10bb7d7..35f5d2b 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -63,7 +63,7 @@ cvars:
         targets) that stores information about RMA targets that
         could not be issued immediatly.  Requires a positive value.
 
-    - name        : MPIR_CVAR_CH3_RMA_LOCK_ENTRY_WIN_POOL_SIZE
+    - name        : MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE
       category    : CH3
       type        : int
       default     : 256
@@ -79,9 +79,9 @@ cvars:
 */
 
 
-MPIDI_RMA_Op_t *global_rma_op_pool = NULL, *global_rma_op_pool_tail =
+MPIDI_RMA_Op_t *global_rma_op_pool_head = NULL, *global_rma_op_pool_tail =
     NULL, *global_rma_op_pool_start = NULL;
-MPIDI_RMA_Target_t *global_rma_target_pool = NULL, *global_rma_target_pool_tail =
+MPIDI_RMA_Target_t *global_rma_target_pool_head = NULL, *global_rma_target_pool_tail =
     NULL, *global_rma_target_pool_start = NULL;
 
 #undef FUNCNAME
@@ -103,7 +103,8 @@ int MPIDI_RMA_init(void)
                         mpi_errno, "RMA op pool");
     for (i = 0; i < MPIR_CVAR_CH3_RMA_OP_GLOBAL_POOL_SIZE; i++) {
         global_rma_op_pool_start[i].pool_type = MPIDI_RMA_POOL_GLOBAL;
-        MPL_LL_APPEND(global_rma_op_pool, global_rma_op_pool_tail, &(global_rma_op_pool_start[i]));
+        MPL_LL_APPEND(global_rma_op_pool_head, global_rma_op_pool_tail,
+                      &(global_rma_op_pool_start[i]));
     }
 
     MPIU_CHKPMEM_MALLOC(global_rma_target_pool_start, MPIDI_RMA_Target_t *,
@@ -111,7 +112,7 @@ int MPIDI_RMA_init(void)
                         mpi_errno, "RMA target pool");
     for (i = 0; i < MPIR_CVAR_CH3_RMA_TARGET_GLOBAL_POOL_SIZE; i++) {
         global_rma_target_pool_start[i].pool_type = MPIDI_RMA_POOL_GLOBAL;
-        MPL_LL_APPEND(global_rma_target_pool, global_rma_target_pool_tail,
+        MPL_LL_APPEND(global_rma_target_pool_head, global_rma_target_pool_tail,
                       &(global_rma_target_pool_start[i]));
     }
 
@@ -189,7 +190,8 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
      * entering Win_free. */
     while ((*win_ptr)->current_lock_type != MPID_LOCK_NONE ||
            (*win_ptr)->at_completion_counter != 0 ||
-           (*win_ptr)->lock_queue != NULL || (*win_ptr)->current_lock_data_bytes != 0) {
+           (*win_ptr)->target_lock_queue_head != NULL ||
+           (*win_ptr)->current_target_lock_data_bytes != 0) {
         mpi_errno = wait_progress_engine();
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
@@ -226,10 +228,10 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
     MPIU_Free((*win_ptr)->op_pool_start);
     MPIU_Free((*win_ptr)->target_pool_start);
     MPIU_Free((*win_ptr)->slots);
-    if ((*win_ptr)->lock_entry_pool_start != NULL) {
-        MPIU_Free((*win_ptr)->lock_entry_pool_start);
+    if ((*win_ptr)->target_lock_entry_pool_start != NULL) {
+        MPIU_Free((*win_ptr)->target_lock_entry_pool_start);
     }
-    MPIU_Assert((*win_ptr)->current_lock_data_bytes == 0);
+    MPIU_Assert((*win_ptr)->current_target_lock_data_bytes == 0);
 
     /* Free the attached buffer for windows created with MPI_Win_allocate() */
     if ((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||

http://git.mpich.org/mpich.git/commitdiff/7c39124f49b33c88557ab8e6a6a4bb5fcdf8ce37

commit 7c39124f49b33c88557ab8e6a6a4bb5fcdf8ce37
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Wed Jun 17 09:49:40 2015 -0500

    Delete a CVAR that is never used.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index 607d0f1..10bb7d7 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -75,18 +75,6 @@ cvars:
         lock entries) that stores information about RMA lock requests that
         could not be satisfied immediatly.  Requires a positive value.
 
-    - name        : MPIR_CVAR_CH3_RMA_LOCK_ENTRY_GLOBAL_POOL_SIZE
-      category    : CH3
-      type        : int
-      default     : 16384
-      class       : none
-      verbosity   : MPI_T_VERBOSITY_USER_BASIC
-      scope       : MPI_T_SCOPE_ALL_EQ
-      description : >-
-        Size of the Global RMA lock entries pool (in number of
-        lock entries) that stores information about RMA lock requests that
-        could not be satisfied immediatly.  Requires a positive value.
-
 === END_MPI_T_CVAR_INFO_BLOCK ===
 */
 

http://git.mpich.org/mpich.git/commitdiff/fb72c9ce66ad60e0dc419390e215eab11b891b2c

commit fb72c9ce66ad60e0dc419390e215eab11b891b2c
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Jun 23 19:52:06 2015 -0500

    Delete CVAR and code that control the batch issuing of RMA operations.
    
    They are removed because batch issuing does not improve performance.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 81eec34..95362f6 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -142,7 +142,6 @@ static inline MPIDI_RMA_Target_t *MPIDI_CH3I_Win_target_alloc(MPID_Win * win_ptr
     e->access_state = MPIDI_RMA_NONE;
     e->lock_type = MPID_LOCK_NONE;
     e->lock_mode = 0;
-    e->accumulated_ops_cnt = 0;
     e->win_complete_flag = 0;
     e->put_acc_issued = 0;
 
@@ -302,10 +301,6 @@ static inline int MPIDI_CH3I_Win_enqueue_op(MPID_Win * win_ptr, MPIDI_RMA_Op_t *
     if (target->next_op_to_issue == NULL)
         target->next_op_to_issue = op;
 
-    /* Increment the counter for accumulated posted operations */
-    target->accumulated_ops_cnt++;
-    win_ptr->accumulated_ops_cnt++;
-
   fn_exit:
     return mpi_errno;
   fn_fail:
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index e8473e3..b3e1b30 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -92,7 +92,6 @@ typedef struct MPIDI_RMA_Target {
     enum MPIDI_RMA_states access_state;
     int lock_type;              /* NONE, SHARED, EXCLUSIVE */
     int lock_mode;              /* e.g., MODE_NO_CHECK */
-    int accumulated_ops_cnt;
     int win_complete_flag;
     int put_acc_issued;         /* indicate if PUT/ACC is issued in this epoch
                                  * after the previous synchronization calls. */
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index 11a32e3..2fd203b 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -327,9 +327,6 @@ typedef struct MPIDI_Win_basic_info {
         enum MPIDI_RMA_states exposure_state;                            \
     } states;                                                            \
     int non_empty_slots;                                                 \
-    int accumulated_ops_cnt; /* keep track of number of accumulated posted RMA operations \
-                            in current epoch to control when to poke     \
-                            progress engine in RMA operation routines. */\
     int active_req_cnt; /* keep track of number of active requests in    \
                            current epoch, i.e., number of issued but     \
                            incomplete RMA operations. */                 \
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 39f9f57..d90492e 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -12,23 +12,6 @@ MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_set);
 === BEGIN_MPI_T_CVAR_INFO_BLOCK ===
 
 cvars:
-    - name        : MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS
-      category    : CH3
-      type        : int
-      default     : 100
-      class       : none
-      verbosity   : MPI_T_VERBOSITY_USER_BASIC
-      scope       : MPI_T_SCOPE_ALL_EQ
-      description : >-
-          Specify the threshold of number of posted operations
-          when starting poking progress in operation routines.
-          When the value is negative, runtime never pokes progress
-          engine in operation routines; when the value is zero,
-          runtime always pokes progress engine in operation
-          routines; when the value is larger than zero, runtime
-          starts to poke progress engine when number of posted
-          operations reaches that value.
-
     - name        : MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE
       category    : CH3
       type        : int
@@ -205,13 +188,6 @@ int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
 
-        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
-            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
-            mpi_errno = poke_progress_engine();
-            if (mpi_errno != MPI_SUCCESS)
-                MPIU_ERR_POP(mpi_errno);
-        }
-
         if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
             win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
             while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
@@ -383,13 +359,6 @@ int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
 
-        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
-            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
-            mpi_errno = poke_progress_engine();
-            if (mpi_errno != MPI_SUCCESS)
-                MPIU_ERR_POP(mpi_errno);
-        }
-
         if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
             win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
             while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
@@ -603,13 +572,6 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
 
-        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
-            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
-            mpi_errno = poke_progress_engine();
-            if (mpi_errno != MPI_SUCCESS)
-                MPIU_ERR_POP(mpi_errno);
-        }
-
         if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
             win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
             while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
@@ -864,13 +826,6 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
 
-        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
-            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
-            mpi_errno = poke_progress_engine();
-            if (mpi_errno != MPI_SUCCESS)
-                MPIU_ERR_POP(mpi_errno);
-        }
-
         if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
             win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
             while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
@@ -1123,13 +1078,6 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
 
-        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
-            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
-            mpi_errno = poke_progress_engine();
-            if (mpi_errno != MPI_SUCCESS)
-                MPIU_ERR_POP(mpi_errno);
-        }
-
         if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
             win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
             while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
@@ -1277,13 +1225,6 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
 
-        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
-            win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
-            mpi_errno = poke_progress_engine();
-            if (mpi_errno != MPI_SUCCESS)
-                MPIU_ERR_POP(mpi_errno);
-        }
-
         if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
             win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
             while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index 6b732b3..d8ba0b5 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -532,15 +532,6 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
     /* Make sure that all targets are freed. */
     MPIU_Assert(win_ptr->non_empty_slots == 0);
 
-    if (assert & MPI_MODE_NOPRECEDE) {
-        /* BEGINNING synchronization: the following counter should be zero. */
-        MPIU_Assert(win_ptr->accumulated_ops_cnt == 0);
-    }
-    else {
-        /* ENDING synchronization: correctly decrement the following counter. */
-        win_ptr->accumulated_ops_cnt = 0;
-    }
-
     MPIU_Assert(win_ptr->active_req_cnt == 0);
 
     /* Ensure ordering of load/store operations. */
@@ -764,9 +755,6 @@ int MPIDI_Win_start(MPID_Group * group_ptr, int assert, MPID_Win * win_ptr)
     win_ptr->states.access_state = MPIDI_RMA_PSCW_ISSUED;
     MPIDI_CH3I_num_active_issued_win++;
 
-    /* BEGINNING synchronization: the following counter should be zero. */
-    MPIU_Assert(win_ptr->accumulated_ops_cnt == 0);
-
     MPIU_Assert(win_ptr->active_req_cnt == 0);
 
     /* Ensure ordering of load/store operations. */
@@ -884,9 +872,6 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
     /* Make sure that all targets are freed. */
     MPIU_Assert(win_ptr->non_empty_slots == 0);
 
-    /* ENDING synchronization: correctly decrement the following counter. */
-    win_ptr->accumulated_ops_cnt = 0;
-
     MPIU_Assert(win_ptr->active_req_cnt == 0);
 
   fn_exit:
@@ -1074,11 +1059,6 @@ int MPIDI_Win_lock(int lock_type, int dest, int assert, MPID_Win * win_ptr)
     }
 
   finish_lock:
-    if (win_ptr->lock_epoch_count == 1) {
-        /* BEGINNING synchronization: the following counter should be zero. */
-        MPIU_Assert(win_ptr->accumulated_ops_cnt == 0);
-    }
-
     /* Ensure ordering of load/store operations. */
     if (win_ptr->shm_allocated == TRUE) {
         OPA_read_write_barrier();
@@ -1182,12 +1162,6 @@ int MPIDI_Win_unlock(int dest, MPID_Win * win_ptr)
     }
 
     if (target != NULL) {
-        /* ENDING synchronization: correctly decrement the following counter. */
-        win_ptr->accumulated_ops_cnt -= target->accumulated_ops_cnt;
-        if (win_ptr->lock_epoch_count == 0) {
-            MPIU_Assert(win_ptr->accumulated_ops_cnt == 0);
-        }
-
         /* Cleanup the target. */
         mpi_errno = MPIDI_CH3I_Win_target_dequeue_and_free(win_ptr, target);
         if (mpi_errno != MPI_SUCCESS)
@@ -1289,12 +1263,6 @@ int MPIDI_Win_flush(int dest, MPID_Win * win_ptr)
             MPIU_ERR_POP(mpi_errno);
     }
 
-    if (target != NULL) {
-        /* ENDING synchronization: correctly decrement the following counters. */
-        win_ptr->accumulated_ops_cnt -= target->accumulated_ops_cnt;
-        target->accumulated_ops_cnt = 0;
-    }
-
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FLUSH);
     return mpi_errno;
@@ -1388,10 +1356,6 @@ int MPIDI_Win_flush_local(int dest, MPID_Win * win_ptr)
     if (target != NULL) {
         /* reset upgrade_flush_local flag in target to 0 */
         target->sync.upgrade_flush_local = 0;
-
-        /* ENDING synchronization: correctly decrement the following counters. */
-        win_ptr->accumulated_ops_cnt -= target->accumulated_ops_cnt;
-        target->accumulated_ops_cnt = 0;
     }
 
   fn_exit:
@@ -1470,9 +1434,6 @@ int MPIDI_Win_lock_all(int assert, MPID_Win * win_ptr)
     }
 
   finish_lock_all:
-    /* BEGINNING synchronization: the following counter should be zero. */
-    MPIU_Assert(win_ptr->accumulated_ops_cnt == 0);
-
     MPIU_Assert(win_ptr->active_req_cnt == 0);
 
     /* Ensure ordering of load/store operations. */
@@ -1624,9 +1585,6 @@ int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
     /* Make sure that all targets are freed. */
     MPIU_Assert(win_ptr->non_empty_slots == 0);
 
-    /* ENDING synchronization: correctly decrement the following counter. */
-    win_ptr->accumulated_ops_cnt = 0;
-
     MPIU_Assert(win_ptr->active_req_cnt == 0);
 
   fn_exit:
@@ -1672,9 +1630,6 @@ int MPIDI_Win_flush_all(MPID_Win * win_ptr)
                 curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
             }
 
-            /* ENDING synchronization: correctly decrement the following counters. */
-            curr_target->accumulated_ops_cnt = 0;
-
             curr_target = curr_target->next;
         }
     }
@@ -1697,9 +1652,6 @@ int MPIDI_Win_flush_all(MPID_Win * win_ptr)
     } while (!remote_completed);
 
   finish_flush_all:
-    /* ENDING synchronization: correctly decrement the following counter. */
-    win_ptr->accumulated_ops_cnt = 0;
-
     MPIU_Assert(win_ptr->active_req_cnt == 0);
 
   fn_exit:
@@ -1756,9 +1708,6 @@ int MPIDI_Win_flush_local_all(MPID_Win * win_ptr)
                 enable_flush_local_cnt++;
             }
 
-            /* ENDING synchronization: correctly decrement the following counters. */
-            curr_target->accumulated_ops_cnt = 0;
-
             curr_target = curr_target->next;
         }
     }
@@ -1816,9 +1765,6 @@ int MPIDI_Win_flush_local_all(MPID_Win * win_ptr)
         }
     }
 
-    /* ENDING synchronization: correctly decrement the following counter. */
-    win_ptr->accumulated_ops_cnt = 0;
-
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FLUSH_LOCAL_ALL);
     return mpi_errno;
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index 0727994..ea10390 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -315,7 +315,6 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->states.access_state = MPIDI_RMA_NONE;
     (*win_ptr)->states.exposure_state = MPIDI_RMA_NONE;
     (*win_ptr)->non_empty_slots = 0;
-    (*win_ptr)->accumulated_ops_cnt = 0;
     (*win_ptr)->active_req_cnt = 0;
     (*win_ptr)->fence_sync_req = MPI_REQUEST_NULL;
     (*win_ptr)->start_req = NULL;

http://git.mpich.org/mpich.git/commitdiff/3fdf2c073255c55d09e914e2aebca08e4be88e31

commit 3fdf2c073255c55d09e914e2aebca08e4be88e31
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Thu Jun 11 15:53:26 2015 -0500

    Perf-tuning: avoid dynamic allocation of requests in RMA.
    
    Originally we always dynamically allocated a request array
    for the current RMA operation, since the current operation
    might be streamed and need multiple requests to track
    each stream unit. However, in most cases, where streaming is
    not happening, we only need one request for each operation
    and do not need to dynamically allocate it. This patch
    optimizes that case.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index b13ed88..cd32c7d 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -505,7 +505,6 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     MPID_Comm *comm_ptr = win_ptr->comm_ptr;
     MPIDI_CH3_Pkt_put_t *put_pkt = &rma_op->pkt.put;
     MPID_Request *curr_req = NULL;
-    int i, curr_req_index = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_PUT_OP);
 
@@ -534,11 +533,7 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     if (curr_req != NULL) {
         rma_op->reqs_size = 1;
 
-        rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
-        for (i = 0; i < rma_op->reqs_size; i++)
-            rma_op->reqs[i] = NULL;
-
-        rma_op->reqs[curr_req_index] = curr_req;
+        rma_op->single_req = curr_req;
         win_ptr->active_req_cnt++;
     }
 
@@ -547,10 +542,7 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
   fn_fail:
-    if (rma_op->reqs != NULL) {
-        MPIU_Free(rma_op->reqs);
-    }
-    rma_op->reqs = NULL;
+    rma_op->single_req = NULL;
     rma_op->reqs_size = 0;
     goto fn_exit;
     /* --END ERROR HANDLING-- */
@@ -593,16 +585,10 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
 
         if (curr_req != NULL) {
-            MPIU_Assert(rma_op->reqs_size == 0 && rma_op->reqs == NULL);
+            MPIU_Assert(rma_op->reqs_size == 0 && rma_op->single_req == NULL);
 
             rma_op->reqs_size = 1;
-
-            rma_op->reqs =
-                (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
-            for (i = 0; i < rma_op->reqs_size; i++)
-                rma_op->reqs[i] = NULL;
-
-            rma_op->reqs[0] = curr_req;
+            rma_op->single_req = curr_req;
             win_ptr->active_req_cnt++;
         }
         goto fn_exit;
@@ -669,16 +655,21 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
 
         if (curr_req != NULL) {
             if (rma_op->reqs_size == 0) {
-                MPIU_Assert(rma_op->reqs == NULL);
+                MPIU_Assert(rma_op->single_req == NULL && rma_op->multi_reqs == NULL);
                 rma_op->reqs_size = stream_unit_count;
 
-                rma_op->reqs =
-                    (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
-                for (i = 0; i < rma_op->reqs_size; i++)
-                    rma_op->reqs[i] = NULL;
+                if (stream_unit_count > 1) {
+                    rma_op->multi_reqs =
+                        (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
+                    for (i = 0; i < rma_op->reqs_size; i++)
+                        rma_op->multi_reqs[i] = NULL;
+                }
             }
 
-            rma_op->reqs[j] = curr_req;
+            if (rma_op->reqs_size == 1)
+                rma_op->single_req = curr_req;
+            else
+                rma_op->multi_reqs[j] = curr_req;
             win_ptr->active_req_cnt++;
         }
 
@@ -700,10 +691,15 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_ACC_OP);
     return mpi_errno;
   fn_fail:
-    if (rma_op->reqs != NULL) {
-        MPIU_Free(rma_op->reqs);
+    if (rma_op->reqs_size == 1) {
+        rma_op->single_req = NULL;
+    }
+    else if (rma_op->reqs_size > 1) {
+        if (rma_op->multi_reqs != NULL) {
+            MPIU_Free(rma_op->multi_reqs);
+            rma_op->multi_reqs = NULL;
+        }
     }
-    rma_op->reqs = NULL;
     rma_op->reqs_size = 0;
     goto fn_exit;
 }
@@ -741,10 +737,6 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
 
         rma_op->reqs_size = 1;
 
-        rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
-        for (i = 0; i < rma_op->reqs_size; i++)
-            rma_op->reqs[i] = NULL;
-
         /* Create a request for the GACC response.  Store the response buf, count, and
          * datatype in it, and pass the request's handle in the GACC packet. When the
          * response comes from the target, it will contain the request handle. */
@@ -779,7 +771,7 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         /* For error checking */
         resp_req = NULL;
 
-        rma_op->reqs[0] = curr_req;
+        rma_op->single_req = curr_req;
         win_ptr->active_req_cnt++;
 
         goto fn_exit;
@@ -819,9 +811,12 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
 
     rma_op->reqs_size = stream_unit_count;
 
-    rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
-    for (i = 0; i < rma_op->reqs_size; i++)
-        rma_op->reqs[i] = NULL;
+    if (rma_op->reqs_size > 1) {
+        rma_op->multi_reqs =
+            (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
+        for (i = 0; i < rma_op->reqs_size; i++)
+            rma_op->multi_reqs[i] = NULL;
+    }
 
     MPIU_Assert(rma_op->issued_stream_count >= 0);
 
@@ -919,7 +914,11 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         /* For error checking */
         resp_req = NULL;
 
-        rma_op->reqs[j] = curr_req;
+        if (rma_op->reqs_size == 1)
+            rma_op->single_req = curr_req;
+        else
+            rma_op->multi_reqs[j] = curr_req;
+
         win_ptr->active_req_cnt++;
 
         rma_op->issued_stream_count++;
@@ -941,15 +940,19 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
   fn_fail:
-    for (i = 0; i < rma_op->reqs_size; i++) {
-        if (rma_op->reqs[i] != NULL) {
-            MPIDI_CH3_Request_destroy(rma_op->reqs[i]);
-        }
+    if (rma_op->reqs_size == 1) {
+        MPIDI_CH3_Request_destroy(rma_op->single_req);
+        rma_op->single_req = NULL;
     }
-    if (rma_op->reqs != NULL) {
-        MPIU_Free(rma_op->reqs);
+    else if (rma_op->reqs_size > 1) {
+        for (i = 0; i < rma_op->reqs_size; i++) {
+            if (rma_op->multi_reqs[i] != NULL) {
+                MPIDI_CH3_Request_destroy(rma_op->multi_reqs[i]);
+            }
+        }
+        MPIU_Free(rma_op->multi_reqs);
+        rma_op->multi_reqs = NULL;
     }
-    rma_op->reqs = NULL;
     rma_op->reqs_size = 0;
     goto fn_exit;
     /* --END ERROR HANDLING-- */
@@ -971,7 +974,6 @@ static int issue_get_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     MPI_Datatype target_datatype;
     MPID_Request *req = NULL;
     MPID_Request *curr_req = NULL;
-    int i, curr_req_index = 0;
     MPID_IOV iov[MPID_IOV_LIMIT];
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_GET_OP);
 
@@ -979,10 +981,6 @@ static int issue_get_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
 
     rma_op->reqs_size = 1;
 
-    rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
-    for (i = 0; i < rma_op->reqs_size; i++)
-        rma_op->reqs[i] = NULL;
-
     /* create a request, store the origin buf, cnt, datatype in it,
      * and pass a handle to it in the get packet. When the get
      * response comes from the target, it will contain the request
@@ -1056,7 +1054,7 @@ static int issue_get_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         MPID_Request_release(req);
     }
 
-    rma_op->reqs[curr_req_index] = curr_req;
+    rma_op->single_req = curr_req;
     win_ptr->active_req_cnt++;
 
   fn_exit:
@@ -1064,15 +1062,7 @@ static int issue_get_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
   fn_fail:
-    for (i = 0; i < rma_op->reqs_size; i++) {
-        if (rma_op->reqs[i] != NULL) {
-            MPIDI_CH3_Request_destroy(rma_op->reqs[i]);
-        }
-    }
-    if (rma_op->reqs != NULL) {
-        MPIU_Free(rma_op->reqs);
-    }
-    rma_op->reqs = NULL;
+    rma_op->single_req = NULL;
     rma_op->reqs_size = 0;
     goto fn_exit;
     /* --END ERROR HANDLING-- */
@@ -1092,7 +1082,6 @@ static int issue_cas_op(MPIDI_RMA_Op_t * rma_op,
     MPIDI_CH3_Pkt_cas_t *cas_pkt = &rma_op->pkt.cas;
     MPID_Request *rmw_req = NULL;
     MPID_Request *curr_req = NULL;
-    int i, curr_req_index = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_CAS_OP);
 
@@ -1100,10 +1089,6 @@ static int issue_cas_op(MPIDI_RMA_Op_t * rma_op,
 
     rma_op->reqs_size = 1;
 
-    rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
-    for (i = 0; i < rma_op->reqs_size; i++)
-        rma_op->reqs[i] = NULL;
-
     /* Create a request for the RMW response.  Store the origin buf, count, and
      * datatype in it, and pass the request's handle RMW packet. When the
      * response comes from the target, it will contain the request handle. */
@@ -1133,7 +1118,7 @@ static int issue_cas_op(MPIDI_RMA_Op_t * rma_op,
         MPID_Request_release(rmw_req);
     }
 
-    rma_op->reqs[curr_req_index] = curr_req;
+    rma_op->single_req = curr_req;
     win_ptr->active_req_cnt++;
 
   fn_exit:
@@ -1141,15 +1126,7 @@ static int issue_cas_op(MPIDI_RMA_Op_t * rma_op,
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
   fn_fail:
-    for (i = 0; i < rma_op->reqs_size; i++) {
-        if (rma_op->reqs[i] != NULL) {
-            MPIDI_CH3_Request_destroy(rma_op->reqs[i]);
-        }
-    }
-    if (rma_op->reqs != NULL) {
-        MPIU_Free(rma_op->reqs);
-    }
-    rma_op->reqs = NULL;
+    rma_op->single_req = NULL;
     rma_op->reqs_size = 0;
     goto fn_exit;
     /* --END ERROR HANDLING-- */
@@ -1169,7 +1146,6 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
     MPIDI_CH3_Pkt_fop_t *fop_pkt = &rma_op->pkt.fop;
     MPID_Request *resp_req = NULL;
     MPID_Request *curr_req = NULL;
-    int i, curr_req_index = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_FOP_OP);
 
@@ -1177,10 +1153,6 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
 
     rma_op->reqs_size = 1;
 
-    rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
-    for (i = 0; i < rma_op->reqs_size; i++)
-        rma_op->reqs[i] = NULL;
-
     /* Create a request for the GACC response.  Store the response buf, count, and
      * datatype in it, and pass the request's handle in the GACC packet. When the
      * response comes from the target, it will contain the request handle. */
@@ -1243,7 +1215,7 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
     /* For error checking */
     resp_req = NULL;
 
-    rma_op->reqs[curr_req_index] = curr_req;
+    rma_op->single_req = curr_req;
     win_ptr->active_req_cnt++;
 
   fn_exit:
@@ -1251,15 +1223,7 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
   fn_fail:
-    for (i = 0; i < rma_op->reqs_size; i++) {
-        if (rma_op->reqs[i] != NULL) {
-            MPIDI_CH3_Request_destroy(rma_op->reqs[i]);
-        }
-    }
-    if (rma_op->reqs != NULL) {
-        MPIU_Free(rma_op->reqs);
-    }
-    rma_op->reqs = NULL;
+    rma_op->single_req = NULL;
     rma_op->reqs_size = 0;
     goto fn_exit;
     /* --END ERROR HANDLING-- */
@@ -1335,7 +1299,7 @@ static inline int set_user_req_after_issuing_op(MPIDI_RMA_Op_t * op)
         goto fn_exit;
 
     if (op->reqs_size == 0) {
-        MPIU_Assert(op->reqs == NULL);
+        MPIU_Assert(op->single_req == NULL && op->multi_reqs == NULL);
         /* Sending is completed immediately, complete user request
          * and release ch3 ref. */
 
@@ -1343,17 +1307,24 @@ static inline int set_user_req_after_issuing_op(MPIDI_RMA_Op_t * op)
         MPIDI_CH3U_Request_complete(op->ureq);
     }
     else {
+        MPID_Request **req_ptr = NULL;
+
         /* Sending is not completed immediately. */
 
+        if (op->reqs_size == 1)
+            req_ptr = &(op->single_req);
+        else
+            req_ptr = op->multi_reqs;
+
         for (i = 0; i < op->reqs_size; i++) {
-            if (op->reqs[i] == NULL || MPID_Request_is_complete(op->reqs[i]))
+            if (req_ptr[i] == NULL || MPID_Request_is_complete(req_ptr[i]))
                 continue;
 
             /* Setup user request info in order to be completed following send request. */
             incomplete_req_cnt++;
             MPID_cc_set(&(op->ureq->cc), incomplete_req_cnt);   /* increment CC counter */
 
-            op->reqs[i]->dev.request_handle = op->ureq->handle;
+            req_ptr[i]->dev.request_handle = op->ureq->handle;
 
             /* Setup user request completion handler.
              *
@@ -1372,10 +1343,10 @@ static inline int set_user_req_after_issuing_op(MPIDI_RMA_Op_t * op)
              * last segment, so it is also correct for us.
              *
              * TODO: implement stack for overriding functions*/
-            if (op->reqs[i]->dev.OnDataAvail == NULL) {
-                op->reqs[i]->dev.OnDataAvail = MPIDI_CH3_ReqHandler_ReqOpsComplete;
+            if (req_ptr[i]->dev.OnDataAvail == NULL) {
+                req_ptr[i]->dev.OnDataAvail = MPIDI_CH3_ReqHandler_ReqOpsComplete;
             }
-            op->reqs[i]->dev.OnFinal = MPIDI_CH3_ReqHandler_ReqOpsComplete;
+            req_ptr[i]->dev.OnFinal = MPIDI_CH3_ReqHandler_ReqOpsComplete;
         }       /* end of for loop */
 
         if (incomplete_req_cnt) {
diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 4cfebf2..81eec34 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -67,7 +67,8 @@ static inline MPIDI_RMA_Op_t *MPIDI_CH3I_Win_op_alloc(MPID_Win * win_ptr)
     }
 
     e->dataloop = NULL;
-    e->reqs = NULL;
+    e->single_req = NULL;
+    e->multi_reqs = NULL;
     e->reqs_size = 0;
     e->ureq = NULL;
     e->piggyback_lock_candidate = 0;
@@ -383,27 +384,56 @@ static inline int MPIDI_CH3I_RMA_Cleanup_ops_target(MPID_Win * win_ptr, MPIDI_RM
     curr_op = *op_list_head;
     while (1) {
         if (curr_op != NULL) {
-            for (i = 0; i < curr_op->reqs_size; i++) {
-                if (curr_op->reqs[i] == NULL)
-                    continue;
+            int completed = 0;
 
-                if (MPID_Request_is_complete(curr_op->reqs[i])) {
+            MPIU_Assert(curr_op->reqs_size > 0);
+            if (curr_op->reqs_size == 1) {
+                /* single_req is used */
+
+                if (MPID_Request_is_complete(curr_op->single_req)) {
                     /* If there's an error, return it */
-                    mpi_errno = curr_op->reqs[i]->status.MPI_ERROR;
+                    mpi_errno = curr_op->single_req->status.MPI_ERROR;
                     MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
 
                     /* No errors, free the request */
-                    MPID_Request_release(curr_op->reqs[i]);
+                    MPID_Request_release(curr_op->single_req);
 
-                    curr_op->reqs[i] = NULL;
+                    curr_op->single_req = NULL;
 
                     win_ptr->active_req_cnt--;
+
+                    completed = 1;
                 }
                 else
                     break;
             }
+            else {
+                /* multi_reqs is used */
+                for (i = 0; i < curr_op->reqs_size; i++) {
+                    if (curr_op->multi_reqs[i] == NULL)
+                        continue;
+
+                    if (MPID_Request_is_complete(curr_op->multi_reqs[i])) {
+                        /* If there's an error, return it */
+                        mpi_errno = curr_op->multi_reqs[i]->status.MPI_ERROR;
+                        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
+
+                        /* No errors, free the request */
+                        MPID_Request_release(curr_op->multi_reqs[i]);
+
+                        curr_op->multi_reqs[i] = NULL;
+
+                        win_ptr->active_req_cnt--;
+                    }
+                    else
+                        break;
+                }
+
+                if (i == curr_op->reqs_size)
+                    completed = 1;
+            }
 
-            if (i == curr_op->reqs_size) {
+            if (completed) {
                 /* Release user request */
                 if (curr_op->ureq) {
                     /* User request must be completed by progress engine */
@@ -413,9 +443,14 @@ static inline int MPIDI_CH3I_RMA_Cleanup_ops_target(MPID_Win * win_ptr, MPIDI_RM
                     MPID_Request_release(curr_op->ureq);
                 }
 
-                /* free request array in op struct */
-                MPIU_Free(curr_op->reqs);
-                curr_op->reqs = NULL;
+                if (curr_op->reqs_size == 1) {
+                    curr_op->single_req = NULL;
+                }
+                else {
+                    /* free request array in op struct */
+                    MPIU_Free(curr_op->multi_reqs);
+                    curr_op->multi_reqs = NULL;
+                }
                 curr_op->reqs_size = 0;
 
                 /* dequeue the operation and free it */
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index a456d2f..e8473e3 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -60,8 +60,11 @@ typedef struct MPIDI_RMA_Op {
     int result_count;
     MPI_Datatype result_datatype;
 
-    struct MPID_Request **reqs;
-    MPI_Aint reqs_size;
+    struct MPID_Request *single_req;    /* used for unstreamed RMA ops */
+    struct MPID_Request **multi_reqs;   /* used for streamed RMA ops */
+    MPI_Aint reqs_size;         /* when reqs_size == 0, neither single_req nor multi_reqs is used;
+                                 * when reqs_size == 1, single_req is used;
+                                 * when reqs_size > 1, multi_reqs is used. */
 
     MPIDI_RMA_dtype_info dtype_info;
     void *dataloop;
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index bfaa7a7..09fe27d 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -639,7 +639,7 @@ static inline int adjust_op_piggybacked_with_lock(MPID_Win * win_ptr,
             }
 
             if (op->reqs_size == 0) {
-                MPIU_Assert(op->reqs == NULL);
+                MPIU_Assert(op->single_req == NULL && op->multi_reqs == NULL);
                 MPIDI_CH3I_RMA_Ops_free_elem(win_ptr, &(target->pending_op_list_head),
                                              &(target->pending_op_list_tail), op);
             }
@@ -683,18 +683,25 @@ static inline int adjust_op_piggybacked_with_lock(MPID_Win * win_ptr,
             /* We need to re-transmit this operation, so we destroy
              * the internal request and erase all flags in current
              * operation. */
-            if (op->reqs_size > 0) {
-                MPIU_Assert(op->reqs != NULL);
+            if (op->reqs_size == 1) {
+                MPIU_Assert(op->single_req != NULL);
+                MPIDI_CH3_Request_destroy(op->single_req);
+                op->single_req = NULL;
+                win_ptr->active_req_cnt--;
+                op->reqs_size = 0;
+            }
+            else if (op->reqs_size > 1) {
+                MPIU_Assert(op->multi_reqs != NULL);
                 for (i = 0; i < op->reqs_size; i++) {
-                    if (op->reqs[i] != NULL) {
-                        MPIDI_CH3_Request_destroy(op->reqs[i]);
-                        op->reqs[i] = NULL;
+                    if (op->multi_reqs[i] != NULL) {
+                        MPIDI_CH3_Request_destroy(op->multi_reqs[i]);
+                        op->multi_reqs[i] = NULL;
                         win_ptr->active_req_cnt--;
                     }
                 }
                 /* free req array in this op */
-                MPIU_Free(op->reqs);
-                op->reqs = NULL;
+                MPIU_Free(op->multi_reqs);
+                op->multi_reqs = NULL;
                 op->reqs_size = 0;
             }
             MPIDI_CH3_PKT_RMA_ERASE_FLAGS(op->pkt, mpi_errno);
diff --git a/src/mpid/ch3/src/ch3u_rma_progress.c b/src/mpid/ch3/src/ch3u_rma_progress.c
index a9daf45..c56c658 100644
--- a/src/mpid/ch3/src/ch3u_rma_progress.c
+++ b/src/mpid/ch3/src/ch3u_rma_progress.c
@@ -410,7 +410,7 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
         }
 
         if (curr_op->reqs_size == 0) {
-            MPIU_Assert(curr_op->reqs == NULL);
+            MPIU_Assert(curr_op->single_req == NULL && curr_op->multi_reqs == NULL);
             /* Sending is completed immediately. */
             MPIDI_CH3I_RMA_Ops_free_elem(win_ptr, &(target->pending_op_list_head),
                                          &(target->pending_op_list_tail), curr_op);
@@ -530,19 +530,26 @@ int MPIDI_CH3I_RMA_Free_ops_before_completion(MPID_Win * win_ptr)
     /* free all ops in the list since we do not need to maintain them anymore */
     while (1) {
         if (curr_op != NULL) {
-            if (curr_op->reqs_size > 0) {
-                MPIU_Assert(curr_op->reqs != NULL);
+            if (curr_op->reqs_size == 1) {
+                MPIU_Assert(curr_op->single_req != NULL);
+                MPID_Request_release(curr_op->single_req);
+                curr_op->single_req = NULL;
+                win_ptr->active_req_cnt--;
+                curr_op->reqs_size = 0;
+            }
+            else if (curr_op->reqs_size > 1) {
+                MPIU_Assert(curr_op->multi_reqs != NULL);
                 for (i = 0; i < curr_op->reqs_size; i++) {
-                    if (curr_op->reqs[i] != NULL) {
-                        MPID_Request_release(curr_op->reqs[i]);
-                        curr_op->reqs[i] = NULL;
+                    if (curr_op->multi_reqs[i] != NULL) {
+                        MPID_Request_release(curr_op->multi_reqs[i]);
+                        curr_op->multi_reqs[i] = NULL;
                         win_ptr->active_req_cnt--;
                     }
                 }
 
                 /* free req array in this op */
-                MPIU_Free(curr_op->reqs);
-                curr_op->reqs = NULL;
+                MPIU_Free(curr_op->multi_reqs);
+                curr_op->multi_reqs = NULL;
                 curr_op->reqs_size = 0;
             }
             MPIDI_CH3I_RMA_Ops_free_elem(win_ptr, op_list_head, op_list_tail, curr_op);

http://git.mpich.org/mpich.git/commitdiff/ede41471c48cd9b5cd8f30e875b507503a090731

commit ede41471c48cd9b5cd8f30e875b507503a090731
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Thu Jun 11 15:45:49 2015 -0500

    Bug-fix on setting MPIDI_CH3I_num_active_issued_win.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index 72dbdb3..6b732b3 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -359,6 +359,8 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
                 MPID_Request_release(req_ptr);
                 win_ptr->fence_sync_req = MPI_REQUEST_NULL;
                 win_ptr->states.access_state = MPIDI_RMA_NONE;
+                MPIDI_CH3I_num_active_issued_win--;
+                MPIU_Assert(MPIDI_CH3I_num_active_issued_win >= 0);
             }
 
             if (win_ptr->shm_allocated == TRUE) {
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index d047a29..607d0f1 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -178,6 +178,8 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
         MPID_Request_release(req_ptr);
         (*win_ptr)->fence_sync_req = MPI_REQUEST_NULL;
         (*win_ptr)->states.access_state = MPIDI_RMA_NONE;
+        MPIDI_CH3I_num_active_issued_win--;
+        MPIU_Assert(MPIDI_CH3I_num_active_issued_win >= 0);
     }
 
     if ((*win_ptr)->states.access_state == MPIDI_RMA_FENCE_GRANTED)

http://git.mpich.org/mpich.git/commitdiff/9042b82822947297c021f281e10b394a265c1bf0

commit 9042b82822947297c021f281e10b394a265c1bf0
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Fri Jun 12 22:51:18 2015 -0500

    Delete poke_progress_engine in RMA sync calls and only keep self lock cases.
    
    Originally we poked the progress engine at the end of RMA sync calls
    if the progress engine had not already been poked during the call. The
    purpose of this was to prevent a possible deadlock. However, the deadlock
    should only happen in self-lock cases; if the target is not myself, the
    poke adds unnecessary overhead to RMA sync calls. In this patch, we delete
    those progress pokes and keep only the ones where the target is myself.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index 3e92d74..72dbdb3 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -320,7 +320,6 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
     int local_completed = 0, remote_completed = 0;
     MPIDI_RMA_Target_t *curr_target = NULL;
     mpir_errflag_t errflag = MPIR_ERR_NONE;
-    int progress_engine_triggered = 0;
     int comm_size = win_ptr->comm_ptr->local_size;
     int scalable_fence_enabled = 0;
     int *rma_target_marks = NULL;
@@ -369,10 +368,6 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
                 if (mpi_errno != MPI_SUCCESS)
                     MPIU_ERR_POP(mpi_errno);
                 MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
-
-                /* Mark that we triggered the progress engine
-                 * in this function call. */
-                progress_engine_triggered = 1;
             }
 
             mpi_errno = MPIR_Ibarrier_impl(win_ptr->comm_ptr, &(win_ptr->fence_sync_req));
@@ -479,10 +474,6 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
-
-            /* Mark that we triggered the progress engine
-             * in this function call. */
-            progress_engine_triggered = 1;
         }
     } while ((scalable_fence_enabled && !remote_completed) ||
              (!scalable_fence_enabled && !local_completed));
@@ -498,10 +489,6 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
             MPIU_ERR_POP(mpi_errno);
         MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
-        /* Mark that we triggered the progress engine
-         * in this function call. */
-        progress_engine_triggered = 1;
-
         /* Set window access state properly. */
         if (assert & MPI_MODE_NOSUCCEED) {
             win_ptr->states.access_state = MPIDI_RMA_NONE;
@@ -516,10 +503,6 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
-
-            /* Mark that we triggered the progress engine
-             * in this function call. */
-            progress_engine_triggered = 1;
         }
 
         if (assert & MPI_MODE_NOSUCCEED) {
@@ -539,10 +522,6 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
                 if (mpi_errno != MPI_SUCCESS)
                     MPIU_ERR_POP(mpi_errno);
                 MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
-
-                /* Mark that we triggered the progress engine
-                 * in this function call. */
-                progress_engine_triggered = 1;
             }
         }
     }
@@ -562,21 +541,6 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
 
     MPIU_Assert(win_ptr->active_req_cnt == 0);
 
-    if (!(assert & MPI_MODE_NOPRECEDE)) {
-        if (!progress_engine_triggered) {
-            /* In some cases (e.g. target is myself, or process on SHM),
-             * this function call does not go through the progress engine.
-             * Therefore, it is possible that this process never process
-             * events coming from other processes. This may cause deadlock in
-             * applications where the program execution on this process depends
-             * on the happening of events from other processes. Here we poke
-             * the progress engine once to avoid such issue.  */
-            mpi_errno = poke_progress_engine();
-            if (mpi_errno != MPI_SUCCESS)
-                MPIU_ERR_POP(mpi_errno);
-        }
-    }
-
     /* Ensure ordering of load/store operations. */
     if (win_ptr->shm_allocated == TRUE) {
         OPA_read_write_barrier();
@@ -830,7 +794,6 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
     int local_completed = 0, remote_completed = 0;
     MPID_Comm *win_comm_ptr = win_ptr->comm_ptr;
     MPIDI_RMA_Target_t *curr_target;
-    int progress_engine_triggered = 0;
     int made_progress;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_COMPLETE);
 
@@ -851,10 +814,6 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
-
-            /* Mark that we triggered the progress engine
-             * in this function call. */
-            progress_engine_triggered = 1;
         }
     }
 
@@ -903,10 +862,6 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
-
-            /* Mark that we triggered the progress engine
-             * in this function call. */
-            progress_engine_triggered = 1;
         }
     } while (!local_completed);
 
@@ -932,19 +887,6 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
 
     MPIU_Assert(win_ptr->active_req_cnt == 0);
 
-    if (!progress_engine_triggered) {
-        /* In some cases (e.g. target is myself, or process on SHM),
-         * this function call does not go through the progress engine.
-         * Therefore, it is possible that this process never process
-         * events coming from other processes. This may cause deadlock in
-         * applications where the program execution on this process depends
-         * on the happening of events from other processes. Here we poke
-         * the progress engine once to avoid such issue.  */
-        mpi_errno = poke_progress_engine();
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
-    }
-
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_COMPLETE);
     return mpi_errno;
@@ -962,7 +904,6 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_Win_wait(MPID_Win * win_ptr)
 {
-    int progress_engine_triggered = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_WAIT);
 
@@ -976,29 +917,12 @@ int MPIDI_Win_wait(MPID_Win * win_ptr)
         mpi_errno = wait_progress_engine();
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
-
-        /* Mark that we triggered the progress engine
-         * in this function call. */
-        progress_engine_triggered = 1;
     }
 
   finish_wait:
     /* Set window exposure state properly. */
     win_ptr->states.exposure_state = MPIDI_RMA_NONE;
 
-    if (!progress_engine_triggered) {
-        /* In some cases (e.g. target is myself, or process on SHM),
-         * this function call does not go through the progress engine.
-         * Therefore, it is possible that this process never process
-         * events coming from other processes. This may cause deadlock in
-         * applications where the program execution on this process depends
-         * on the happening of events from other processes. Here we poke
-         * the progress engine once to avoid such issue.  */
-        mpi_errno = poke_progress_engine();
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
-    }
-
     /* Ensure ordering of load/store operations. */
     if (win_ptr->shm_allocated == TRUE) {
         OPA_read_write_barrier();
@@ -1177,7 +1101,6 @@ int MPIDI_Win_unlock(int dest, MPID_Win * win_ptr)
     int local_completed ATTRIBUTE((unused)) = 0, remote_completed = 0;
     MPIDI_RMA_Target_t *target = NULL;
     enum MPIDI_RMA_sync_types sync_flag;
-    int progress_engine_triggered = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_UNLOCK);
 
@@ -1231,14 +1154,23 @@ int MPIDI_Win_unlock(int dest, MPID_Win * win_ptr)
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
-
-            /* Mark that we triggered the progress engine
-             * in this function call. */
-            progress_engine_triggered = 1;
         }
     } while (!remote_completed);
 
   finish_unlock:
+    if (win_ptr->comm_ptr->rank == dest) {
+        /* In some cases (e.g. target is myself),
+         * this function call does not go through the progress engine.
+         * Therefore, it is possible that this process never process
+         * events coming from other processes. This may cause deadlock in
+         * applications where the program execution on this process depends
+         * on the happening of events from other processes. Here we poke
+         * the progress engine once to avoid such issue.  */
+        mpi_errno = poke_progress_engine();
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+    }
+
     win_ptr->lock_epoch_count--;
     if (win_ptr->lock_epoch_count == 0) {
         /* Set window access state properly. */
@@ -1260,19 +1192,6 @@ int MPIDI_Win_unlock(int dest, MPID_Win * win_ptr)
             MPIU_ERR_POP(mpi_errno);
     }
 
-    if (!progress_engine_triggered) {
-        /* In some cases (e.g. target is myself, or process on SHM),
-         * this function call does not go through the progress engine.
-         * Therefore, it is possible that this process never process
-         * events coming from other processes. This may cause deadlock in
-         * applications where the program execution on this process depends
-         * on the happening of events from other processes. Here we poke
-         * the progress engine once to avoid such issue.  */
-        mpi_errno = poke_progress_engine();
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
-    }
-
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_UNLOCK);
     return mpi_errno;
@@ -1293,7 +1212,6 @@ int MPIDI_Win_flush(int dest, MPID_Win * win_ptr)
     int local_completed ATTRIBUTE((unused)) = 0, remote_completed = 0;
     int rank = win_ptr->comm_ptr->rank;
     MPIDI_RMA_Target_t *target = NULL;
-    int progress_engine_triggered = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FLUSH);
 
@@ -1352,22 +1270,12 @@ int MPIDI_Win_flush(int dest, MPID_Win * win_ptr)
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
-
-            /* Mark that we triggered the progress engine
-             * in this function call. */
-            progress_engine_triggered = 1;
         }
     } while (!remote_completed);
 
   finish_flush:
-    if (target != NULL) {
-        /* ENDING synchronization: correctly decrement the following counters. */
-        win_ptr->accumulated_ops_cnt -= target->accumulated_ops_cnt;
-        target->accumulated_ops_cnt = 0;
-    }
-
-    if (!progress_engine_triggered) {
-        /* In some cases (e.g. target is myself, or process on SHM),
+    if (win_ptr->comm_ptr->rank == dest) {
+        /* In some cases (e.g. target is myself),
          * this function call does not go through the progress engine.
          * Therefore, it is possible that this process never process
          * events coming from other processes. This may cause deadlock in
@@ -1379,6 +1287,12 @@ int MPIDI_Win_flush(int dest, MPID_Win * win_ptr)
             MPIU_ERR_POP(mpi_errno);
     }
 
+    if (target != NULL) {
+        /* ENDING synchronization: correctly decrement the following counters. */
+        win_ptr->accumulated_ops_cnt -= target->accumulated_ops_cnt;
+        target->accumulated_ops_cnt = 0;
+    }
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FLUSH);
     return mpi_errno;
@@ -1399,7 +1313,6 @@ int MPIDI_Win_flush_local(int dest, MPID_Win * win_ptr)
     int local_completed = 0, remote_completed = 0;
     int rank = win_ptr->comm_ptr->rank;
     MPIDI_RMA_Target_t *target = NULL;
-    int progress_engine_triggered = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FLUSH_LOCAL);
 
@@ -1465,10 +1378,6 @@ int MPIDI_Win_flush_local(int dest, MPID_Win * win_ptr)
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
-
-            /* Mark that we triggered the progress engine
-             * in this function call. */
-            progress_engine_triggered = 1;
         }
     } while ((target->sync.upgrade_flush_local && !remote_completed) ||
              (!target->sync.upgrade_flush_local && !local_completed));
@@ -1483,19 +1392,6 @@ int MPIDI_Win_flush_local(int dest, MPID_Win * win_ptr)
         target->accumulated_ops_cnt = 0;
     }
 
-    if (!progress_engine_triggered) {
-        /* In some cases (e.g. target is myself, or process on SHM),
-         * this function call does not go through the progress engine.
-         * Therefore, it is possible that this process never process
-         * events coming from other processes. This may cause deadlock in
-         * applications where the program execution on this process depends
-         * on the happening of events from other processes. Here we poke
-         * the progress engine once to avoid such issue.  */
-        mpi_errno = poke_progress_engine();
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
-    }
-
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FLUSH_LOCAL);
     return mpi_errno;
@@ -1602,7 +1498,6 @@ int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
     int local_completed = 0, remote_completed = 0;
     int rank = win_ptr->comm_ptr->rank;
     MPIDI_RMA_Target_t *curr_target = NULL;
-    int progress_engine_triggered = 0;
     enum MPIDI_RMA_sync_types sync_flag;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_UNLOCK_ALL);
@@ -1707,10 +1602,6 @@ int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
-
-            /* Mark that we triggered the progress engine
-             * in this function call. */
-            progress_engine_triggered = 1;
         }
     } while (!remote_completed);
 
@@ -1736,19 +1627,6 @@ int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
 
     MPIU_Assert(win_ptr->active_req_cnt == 0);
 
-    if (!progress_engine_triggered) {
-        /* In some cases (e.g. target is myself, or process on SHM),
-         * this function call does not go through the progress engine.
-         * Therefore, it is possible that this process never process
-         * events coming from other processes. This may cause deadlock in
-         * applications where the program execution on this process depends
-         * on the happening of events from other processes. Here we poke
-         * the progress engine once to avoid such issue.  */
-        mpi_errno = poke_progress_engine();
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
-    }
-
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_UNLOCK_ALL);
     return mpi_errno;
@@ -1768,7 +1646,6 @@ int MPIDI_Win_flush_all(MPID_Win * win_ptr)
     int i, made_progress = 0;
     int local_completed = 0, remote_completed = 0;
     MPIDI_RMA_Target_t *curr_target = NULL;
-    int progress_engine_triggered = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPIDI_STATE_MPIDI_WIN_FLUSH_ALL);
 
@@ -1814,10 +1691,6 @@ int MPIDI_Win_flush_all(MPID_Win * win_ptr)
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
-
-            /* Mark that we triggered the progress engine
-             * in this function call. */
-            progress_engine_triggered = 1;
         }
     } while (!remote_completed);
 
@@ -1827,19 +1700,6 @@ int MPIDI_Win_flush_all(MPID_Win * win_ptr)
 
     MPIU_Assert(win_ptr->active_req_cnt == 0);
 
-    if (!progress_engine_triggered) {
-        /* In some cases (e.g. target is myself, or process on SHM),
-         * this function call does not go through the progress engine.
-         * Therefore, it is possible that this process never process
-         * events coming from other processes. This may cause deadlock in
-         * applications where the program execution on this process depends
-         * on the happening of events from other processes. Here we poke
-         * the progress engine once to avoid such issue.  */
-        mpi_errno = poke_progress_engine();
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
-    }
-
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPIDI_STATE_MPIDI_WIN_FLUSH_ALL);
     return mpi_errno;
@@ -1861,7 +1721,6 @@ int MPIDI_Win_flush_local_all(MPID_Win * win_ptr)
     MPIDI_RMA_Target_t *curr_target = NULL;
     int enable_flush_local_cnt = 0, upgrade_flush_local_cnt = 0;
     int remote_completed_cnt = 0, local_completed_cnt = 0;
-    int progress_engine_triggered = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FLUSH_LOCAL_ALL);
 
@@ -1941,10 +1800,6 @@ int MPIDI_Win_flush_local_all(MPID_Win * win_ptr)
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
-
-            /* Mark that we triggered the progress engine
-             * in this function call. */
-            progress_engine_triggered = 1;
         }
     } while (remote_completed_cnt < upgrade_flush_local_cnt ||
              local_completed_cnt < enable_flush_local_cnt);
@@ -1962,19 +1817,6 @@ int MPIDI_Win_flush_local_all(MPID_Win * win_ptr)
     /* ENDING synchronization: correctly decrement the following counter. */
     win_ptr->accumulated_ops_cnt = 0;
 
-    if (!progress_engine_triggered) {
-        /* In some cases (e.g. target is myself, or process on SHM),
-         * this function call does not go through the progress engine.
-         * Therefore, it is possible that this process never process
-         * events coming from other processes. This may cause deadlock in
-         * applications where the program execution on this process depends
-         * on the happening of events from other processes. Here we poke
-         * the progress engine once to avoid such issue.  */
-        mpi_errno = poke_progress_engine();
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
-    }
-
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FLUSH_LOCAL_ALL);
     return mpi_errno;

http://git.mpich.org/mpich.git/commitdiff/97569a1ac64cbc23bcae3e3440f3dfaf58d61b83

commit 97569a1ac64cbc23bcae3e3440f3dfaf58d61b83
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Jun 15 09:09:56 2015 -0500

    Adding reduce-scatter based algorithm in MPI_Win_fence.
    
    In this patch, we add a reduce-scatter based algorithm to
    MPI_Win_fence, which is used when the number of processes
    is small or medium. When this algorithm is used, memory
    usage is O(P), but the ending FENCE only needs to wait for
    local completion and does not need to wait for remote
    completion. When the number of processes is large, we
    switch FENCE to the original barrier based algorithm, which
    has O(1) memory usage but needs to wait for remote
    completion in the ending FENCE.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index 6067a63..3e92d74 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -209,6 +209,30 @@
          PROC_SYNC with origin will see the latest data.
 */
 
+/*
+=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
+
+cvars:
+    - name        : MPIR_CVAR_CH3_RMA_SCALABLE_FENCE_PROCESS_NUM
+      category    : CH3
+      type        : int
+      default     : 1024
+      class       : none
+      verbosity   : MPI_T_VERBOSITY_USER_BASIC
+      scope       : MPI_T_SCOPE_ALL_EQ
+      description : >-
+          Specify the threshold of switching the algorithm used in
+          FENCE from the basic algorithm to the scalable algorithm.
+          The value can be negative, zero or positive.
+          When the number of processes is larger than or equal to
+          this value, FENCE will use a scalable algorithm which do
+          not use O(P) data structure; when the number of processes
+          is smaller than the value, FENCE will use a basic but fast
+          algorithm which requires an O(P) data structure.
+
+=== END_MPI_T_CVAR_INFO_BLOCK ===
+*/
+
 MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_lockqueue_alloc);
 MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_winlock_getlocallock);
 MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_wincreate_allgather);
@@ -297,7 +321,11 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
     MPIDI_RMA_Target_t *curr_target = NULL;
     mpir_errflag_t errflag = MPIR_ERR_NONE;
     int progress_engine_triggered = 0;
+    int comm_size = win_ptr->comm_ptr->local_size;
+    int scalable_fence_enabled = 0;
+    int *rma_target_marks = NULL;
     int mpi_errno = MPI_SUCCESS;
+    MPIU_CHKLMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FENCE);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FENCE);
@@ -308,6 +336,11 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
                         win_ptr->states.exposure_state != MPIDI_RMA_NONE,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
+    /* Judge if we should switch to scalable FENCE algorithm */
+    if (comm_size >= MPIR_CVAR_CH3_RMA_SCALABLE_FENCE_PROCESS_NUM) {
+        scalable_fence_enabled = 1;
+    }
+
     /* Ensure ordering of load/store operations. */
     if (win_ptr->shm_allocated == TRUE) {
         OPA_read_write_barrier();
@@ -354,16 +387,80 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
         }
     }
 
-    /* Set sync_flag in target structs. */
-    for (i = 0; i < win_ptr->num_slots; i++) {
-        curr_target = win_ptr->slots[i].target_list_head;
-        while (curr_target != NULL) {
+    /* Perform basic algorithm by calling reduce-scatter */
+    if (!scalable_fence_enabled) {
+        /* If the IBARRIER is not completed, do not need to wait for
+         * it since we are going to call reduce-scatter */
+        if (win_ptr->fence_sync_req != MPI_REQUEST_NULL) {
+            MPID_Request *req_ptr;
+            MPID_Request_get_ptr(win_ptr->fence_sync_req, req_ptr);
+            MPID_Request_release(req_ptr);
+            win_ptr->fence_sync_req = MPI_REQUEST_NULL;
+            MPIDI_CH3I_num_active_issued_win--;
+            MPIU_Assert(MPIDI_CH3I_num_active_issued_win >= 0);
 
-            /* set sync_flag in sync struct */
-            if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
-                curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
+            win_ptr->states.access_state = MPIDI_RMA_NONE;
+        }
+        MPIU_CHKLMEM_MALLOC(rma_target_marks, int *, comm_size * sizeof(int),
+                            mpi_errno, "rma_target_marks");
+        for (i = 0; i < comm_size; i++)
+            rma_target_marks[i] = 0;
+
+        for (i = 0; i < win_ptr->num_slots; i++) {
+            curr_target = win_ptr->slots[i].target_list_head;
+            while (curr_target != NULL) {
+                rma_target_marks[curr_target->target_rank] = 1;
+                curr_target = curr_target->next;
+            }
+        }
+
+        win_ptr->at_completion_counter += comm_size;
+
+        mpi_errno = MPIR_Reduce_scatter_block_impl(MPI_IN_PLACE, rma_target_marks, 1,
+                                                   MPI_INT, MPI_SUM, win_ptr->comm_ptr, &errflag);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+
+        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+
+        win_ptr->at_completion_counter -= comm_size;
+        win_ptr->at_completion_counter += rma_target_marks[0];
+        MPIU_Assert(win_ptr->at_completion_counter >= 0);
+
+        win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
+    }
+
+    /* Set sync_flag in target structs. */
+    if (!scalable_fence_enabled) {
+        for (i = 0; i < win_ptr->num_slots; i++) {
+            curr_target = win_ptr->slots[i].target_list_head;
+            while (curr_target != NULL) {
+                if (curr_target->pending_op_list_head != NULL) {
+                    if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL) {
+                        curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;
+                    }
+                    /* flag is set in order to decrement complete counter on target */
+                    curr_target->win_complete_flag = 1;
+                }
+                else {
+                    mpi_errno = send_decr_at_cnt_msg(curr_target->target_rank, win_ptr);
+                    if (mpi_errno != MPI_SUCCESS)
+                        MPIU_ERR_POP(mpi_errno);
+                }
+                curr_target = curr_target->next;
+            }
+        }
+    }
+    else {
+        for (i = 0; i < win_ptr->num_slots; i++) {
+            curr_target = win_ptr->slots[i].target_list_head;
+            while (curr_target != NULL) {
+                /* set sync_flag in sync struct */
+                if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
+                    curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
+                }
+                curr_target = curr_target->next;
             }
-            curr_target = curr_target->next;
         }
     }
 
@@ -372,12 +469,13 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
-    /* Wait for remote completion. */
+    /* Wait for local/remote completion. */
     do {
         mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed, &remote_completed);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
-        if (!remote_completed) {
+        if ((scalable_fence_enabled && !remote_completed) ||
+            (!scalable_fence_enabled && !local_completed)) {
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
@@ -386,28 +484,67 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
              * in this function call. */
             progress_engine_triggered = 1;
         }
-    } while (!remote_completed);
+    } while ((scalable_fence_enabled && !remote_completed) ||
+             (!scalable_fence_enabled && !local_completed));
 
     /* Cleanup all targets on window. */
     mpi_errno = MPIDI_CH3I_RMA_Cleanup_targets_win(win_ptr);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
-    mpi_errno = MPIR_Barrier_impl(win_ptr->comm_ptr, &errflag);
-    if (mpi_errno != MPI_SUCCESS)
-        MPIU_ERR_POP(mpi_errno);
-    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+    if (scalable_fence_enabled) {
+        mpi_errno = MPIR_Barrier_impl(win_ptr->comm_ptr, &errflag);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
-    /* Mark that we triggered the progress engine
-     * in this function call. */
-    progress_engine_triggered = 1;
+        /* Mark that we triggered the progress engine
+         * in this function call. */
+        progress_engine_triggered = 1;
 
-    /* Set window access state properly. */
-    if (assert & MPI_MODE_NOSUCCEED) {
-        win_ptr->states.access_state = MPIDI_RMA_NONE;
+        /* Set window access state properly. */
+        if (assert & MPI_MODE_NOSUCCEED) {
+            win_ptr->states.access_state = MPIDI_RMA_NONE;
+        }
+        else {
+            win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
+        }
     }
     else {
-        win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
+        /* Waiting for all operations targeting at me to be finished. */
+        while (win_ptr->at_completion_counter) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+
+            /* Mark that we triggered the progress engine
+             * in this function call. */
+            progress_engine_triggered = 1;
+        }
+
+        if (assert & MPI_MODE_NOSUCCEED) {
+            win_ptr->states.access_state = MPIDI_RMA_NONE;
+        }
+        else {
+            /* Prepare for the next possible epoch */
+            mpi_errno = MPIR_Ibarrier_impl(win_ptr->comm_ptr, &(win_ptr->fence_sync_req));
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+            MPIDI_CH3I_num_active_issued_win++;
+            win_ptr->states.access_state = MPIDI_RMA_FENCE_ISSUED;
+
+            if (win_ptr->shm_allocated == TRUE) {
+                MPID_Comm *node_comm_ptr = win_ptr->comm_ptr->node_comm;
+                mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+                MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+
+                /* Mark that we triggered the progress engine
+                 * in this function call. */
+                progress_engine_triggered = 1;
+            }
+        }
     }
 
   finish_fence:
@@ -446,6 +583,7 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
     }
 
   fn_exit:
+    MPIU_CHKLMEM_FREEALL();
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FENCE);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */

http://git.mpich.org/mpich.git/commitdiff/bf7f2f7f67408098f74a411aee3aeb76acde5f04

commit bf7f2f7f67408098f74a411aee3aeb76acde5f04
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Thu Jun 11 13:28:52 2015 -0500

    Delete unnecessary progress in MPI_Win_fence.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index 69e8100..6067a63 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -354,18 +354,6 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
         }
     }
 
-    if (win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED) {
-        while (win_ptr->states.access_state != MPIDI_RMA_FENCE_GRANTED) {
-            mpi_errno = wait_progress_engine();
-            if (mpi_errno != MPI_SUCCESS)
-                MPIU_ERR_POP(mpi_errno);
-
-            /* Mark that we triggered the progress engine
-             * in this function call. */
-            progress_engine_triggered = 1;
-        }
-    }
-
     /* Set sync_flag in target structs. */
     for (i = 0; i < win_ptr->num_slots; i++) {
         curr_target = win_ptr->slots[i].target_list_head;

http://git.mpich.org/mpich.git/commitdiff/e9891eaeaa35d5ab5e5f407fcfbc11a3348e61cf

commit e9891eaeaa35d5ab5e5f407fcfbc11a3348e61cf
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Jun 15 09:07:37 2015 -0500

    Modify MPI_Win_complete to only wait for local completion.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_progress.c b/src/mpid/ch3/src/ch3u_rma_progress.c
index df180bf..a9daf45 100644
--- a/src/mpid/ch3/src/ch3u_rma_progress.c
+++ b/src/mpid/ch3/src/ch3u_rma_progress.c
@@ -345,8 +345,6 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
             /* piggyback on last OP. */
             if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
                 flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
-                if (target->win_complete_flag)
-                    flags |= MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER;
             }
             else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
                 flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK;
@@ -355,6 +353,8 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
                  * operation on out-of-order network). */
                 flags &= ~MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
             }
+            if (target->win_complete_flag)
+                flags |= MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER;
         }
 
         /* only increase ack counter when FLUSH or UNLOCK flag is set,
diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index c0f5689..69e8100 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -750,8 +750,8 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
 
         if (curr_target != NULL) {
             /* set sync_flag in sync struct */
-            if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
-                curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
+            if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL) {
+                curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;
             }
             curr_target->win_complete_flag = 1;
         }
@@ -773,7 +773,7 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
         mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed, &remote_completed);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
-        if (!remote_completed) {
+        if (!local_completed) {
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
@@ -782,7 +782,7 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
              * in this function call. */
             progress_engine_triggered = 1;
         }
-    } while (!remote_completed);
+    } while (!local_completed);
 
     /* Cleanup all targets on this window. */
     mpi_errno = MPIDI_CH3I_RMA_Cleanup_targets_win(win_ptr);

http://git.mpich.org/mpich.git/commitdiff/06dbf44b8fe8cc313b5bab53e2cfd59b8085f718

commit 06dbf44b8fe8cc313b5bab53e2cfd59b8085f718
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Jun 21 10:22:05 2015 -0500

    Perf-tuning: overlapping issuing data and computation on target side in GACC/FOP.
    
    On target side, after we receive the GACC/FOP packet, we should
    first start sending back the data, then perform ACC computation.
    By doing this, issuing data and computation can be overlapped.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index d214a0c..2f79b48 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -336,18 +336,6 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
         MPID_Segment_free(seg);
     }
 
-    /* accumulate data from tmp_buf into user_buf */
-    MPIU_Assert(predef_count == (int) predef_count);
-    mpi_errno = do_accumulate_op(rreq->dev.user_buf, (int) predef_count, basic_type,
-                                 rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype,
-                                 stream_offset, rreq->dev.op);
-
-    if (win_ptr->shm_allocated == TRUE)
-        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-
-    if (mpi_errno)
-        MPIU_ERR_POP(mpi_errno);
-
     resp_req->dev.OnFinal = MPIDI_CH3_ReqHandler_GaccumSendComplete;
     resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumSendComplete;
     resp_req->dev.target_win_handle = rreq->dev.target_win_handle;
@@ -369,6 +357,18 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
 
     MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
 
+    /* accumulate data from tmp_buf into user_buf */
+    MPIU_Assert(predef_count == (int) predef_count);
+    mpi_errno = do_accumulate_op(rreq->dev.user_buf, (int) predef_count, basic_type,
+                                 rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype,
+                                 stream_offset, rreq->dev.op);
+
+    if (win_ptr->shm_allocated == TRUE)
+        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
+
     /* Mark get portion as handled */
     rreq->dev.resp_request_handle = MPI_REQUEST_NULL;
 
@@ -471,16 +471,6 @@ int MPIDI_CH3_ReqHandler_FOPRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, i
         MPID_Segment_free(seg);
     }
 
-    /* Perform accumulate computation */
-    mpi_errno = do_accumulate_op(rreq->dev.user_buf, 1, rreq->dev.datatype,
-                                 rreq->dev.real_user_buf, 1, rreq->dev.datatype, 0, rreq->dev.op);
-
-    if (win_ptr->shm_allocated == TRUE)
-        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-
-    if (mpi_errno)
-        MPIU_ERR_POP(mpi_errno);
-
     /* Send back data */
     MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP);
     fop_resp_pkt->request_handle = rreq->dev.resp_request_handle;
@@ -505,6 +495,16 @@ int MPIDI_CH3_ReqHandler_FOPRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, i
 
     MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
 
+    /* Perform accumulate computation */
+    mpi_errno = do_accumulate_op(rreq->dev.user_buf, 1, rreq->dev.datatype,
+                                 rreq->dev.real_user_buf, 1, rreq->dev.datatype, 0, rreq->dev.op);
+
+    if (win_ptr->shm_allocated == TRUE)
+        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
+
     if (is_empty_origin == FALSE) {
         /* free the temporary buffer.
          * When origin data is zero, there
@@ -1342,20 +1342,6 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
             MPIU_ERR_POP(mpi_errno);
         }
 
-        /* Perform ACCUMULATE OP */
-
-        /* All data fits in packet header */
-        mpi_errno =
-            do_accumulate_op(get_accum_pkt->info.data, get_accum_pkt->count,
-                             get_accum_pkt->datatype, get_accum_pkt->addr, get_accum_pkt->count,
-                             get_accum_pkt->datatype, 0, get_accum_pkt->op);
-
-        if (win_ptr->shm_allocated == TRUE)
-            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
-
         /* here we increment the Active Target counter to guarantee the GET-like
          * operation are completed when counter reaches zero. */
         win_ptr->at_completion_counter++;
@@ -1371,6 +1357,20 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
             MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
         }
 
+        /* Perform ACCUMULATE OP */
+
+        /* All data fits in packet header */
+        mpi_errno =
+            do_accumulate_op(get_accum_pkt->info.data, get_accum_pkt->count,
+                             get_accum_pkt->datatype, get_accum_pkt->addr, get_accum_pkt->count,
+                             get_accum_pkt->datatype, 0, get_accum_pkt->op);
+
+        if (win_ptr->shm_allocated == TRUE)
+            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+
         goto fn_exit;
     }
 
@@ -1416,19 +1416,6 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
         MPID_Segment_free(seg);
     }
 
-    /* Perform ACCUMULATE OP */
-
-    MPIU_Assert(recv_count == (int) recv_count);
-    mpi_errno = do_accumulate_op(lock_entry->data, (int) recv_count, get_accum_pkt->datatype,
-                                 get_accum_pkt->addr, get_accum_pkt->count, get_accum_pkt->datatype,
-                                 0, get_accum_pkt->op);
-
-    if (win_ptr->shm_allocated == TRUE)
-        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-
-    if (mpi_errno != MPI_SUCCESS)
-        MPIU_ERR_POP(mpi_errno);
-
     /* here we increment the Active Target counter to guarantee the GET-like
      * operation are completed when counter reaches zero. */
     win_ptr->at_completion_counter++;
@@ -1456,6 +1443,19 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
         MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
 
+    /* Perform ACCUMULATE OP */
+
+    MPIU_Assert(recv_count == (int) recv_count);
+    mpi_errno = do_accumulate_op(lock_entry->data, (int) recv_count, get_accum_pkt->datatype,
+                                 get_accum_pkt->addr, get_accum_pkt->count, get_accum_pkt->datatype,
+                                 0, get_accum_pkt->op);
+
+    if (win_ptr->shm_allocated == TRUE)
+        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+
   fn_exit:
     return mpi_errno;
   fn_fail:
@@ -1560,22 +1560,6 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
         MPID_Segment_free(seg);
     }
 
-    /* Apply the op */
-    if (fop_pkt->type == MPIDI_CH3_PKT_FOP_IMMED) {
-        mpi_errno = do_accumulate_op(fop_pkt->info.data, 1, fop_pkt->datatype,
-                                     fop_pkt->addr, 1, fop_pkt->datatype, 0, fop_pkt->op);
-    }
-    else {
-        mpi_errno = do_accumulate_op(lock_entry->data, 1, fop_pkt->datatype,
-                                     fop_pkt->addr, 1, fop_pkt->datatype, 0, fop_pkt->op);
-    }
-
-    if (win_ptr->shm_allocated == TRUE)
-        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-
-    if (mpi_errno != MPI_SUCCESS)
-        MPIU_ERR_POP(mpi_errno);
-
     if (fop_pkt->type == MPIDI_CH3_PKT_FOP_IMMED) {
         /* send back the original data */
         MPIU_THREAD_CS_ENTER(CH3COMM, lock_entry->vc);
@@ -1619,6 +1603,22 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
         goto fn_exit;
     }
 
+    /* Apply the op */
+    if (fop_pkt->type == MPIDI_CH3_PKT_FOP_IMMED) {
+        mpi_errno = do_accumulate_op(fop_pkt->info.data, 1, fop_pkt->datatype,
+                                     fop_pkt->addr, 1, fop_pkt->datatype, 0, fop_pkt->op);
+    }
+    else {
+        mpi_errno = do_accumulate_op(lock_entry->data, 1, fop_pkt->datatype,
+                                     fop_pkt->addr, 1, fop_pkt->datatype, 0, fop_pkt->op);
+    }
+
+    if (win_ptr->shm_allocated == TRUE)
+        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+
     /* do final action */
     mpi_errno = finish_op_on_target(win_ptr, lock_entry->vc, TRUE /* has response data */ ,
                                     fop_pkt->flags, MPI_WIN_NULL);
@@ -1667,6 +1667,13 @@ static inline int perform_cas_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
 
     MPIU_Memcpy((void *) &cas_resp_pkt->info.data, cas_pkt->addr, len);
 
+    /* Send the response packet */
+    MPIU_THREAD_CS_ENTER(CH3COMM, lock_entry->vc);
+    mpi_errno = MPIDI_CH3_iStartMsg(lock_entry->vc, cas_resp_pkt, sizeof(*cas_resp_pkt), &send_req);
+    MPIU_THREAD_CS_EXIT(CH3COMM, lock_entry->vc);
+
+    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
     /* Compare and replace if equal */
     if (MPIR_Compare_equal(&cas_pkt->compare_data, cas_pkt->addr, cas_pkt->datatype)) {
         MPIU_Memcpy(cas_pkt->addr, &cas_pkt->origin_data, len);
@@ -1675,13 +1682,6 @@ static inline int perform_cas_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
 
-    /* Send the response packet */
-    MPIU_THREAD_CS_ENTER(CH3COMM, lock_entry->vc);
-    mpi_errno = MPIDI_CH3_iStartMsg(lock_entry->vc, cas_resp_pkt, sizeof(*cas_resp_pkt), &send_req);
-    MPIU_THREAD_CS_EXIT(CH3COMM, lock_entry->vc);
-
-    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-
     if (send_req != NULL) {
         if (!MPID_Request_is_complete(send_req)) {
             /* sending process is not completed, set proper OnDataAvail
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 8dbabd1..ed0f646 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -906,18 +906,6 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             MPIU_ERR_POP(mpi_errno);
         }
 
-        /* perform accumulate operation. */
-        mpi_errno =
-            do_accumulate_op(get_accum_pkt->info.data, get_accum_pkt->count,
-                             get_accum_pkt->datatype, get_accum_pkt->addr, get_accum_pkt->count,
-                             get_accum_pkt->datatype, 0, get_accum_pkt->op);
-
-        if (win_ptr->shm_allocated == TRUE)
-            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-
-        if (mpi_errno)
-            MPIU_ERR_POP(mpi_errno);
-
         iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
         iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
         iovcnt = 1;
@@ -932,6 +920,18 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
         }
         /* --END ERROR HANDLING-- */
+
+        /* perform accumulate operation. */
+        mpi_errno =
+            do_accumulate_op(get_accum_pkt->info.data, get_accum_pkt->count,
+                             get_accum_pkt->datatype, get_accum_pkt->addr, get_accum_pkt->count,
+                             get_accum_pkt->datatype, 0, get_accum_pkt->op);
+
+        if (win_ptr->shm_allocated == TRUE)
+            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
     }
     else {
         MPIU_Assert(pkt->type == MPIDI_CH3_PKT_GET_ACCUM);
@@ -1195,6 +1195,13 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_Memcpy((void *) &cas_resp_pkt->info.data, cas_pkt->addr, len);
 
+    /* Send the response packet */
+    MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+    mpi_errno = MPIDI_CH3_iStartMsg(vc, cas_resp_pkt, sizeof(*cas_resp_pkt), &req);
+    MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+
+    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
     /* Compare and replace if equal */
     if (MPIR_Compare_equal(&cas_pkt->compare_data, cas_pkt->addr, cas_pkt->datatype)) {
         MPIU_Memcpy(cas_pkt->addr, &cas_pkt->origin_data, len);
@@ -1203,13 +1210,6 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
 
-    /* Send the response packet */
-    MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-    mpi_errno = MPIDI_CH3_iStartMsg(vc, cas_resp_pkt, sizeof(*cas_resp_pkt), &req);
-    MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-
-    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-
     if (req != NULL) {
         if (!MPID_Request_is_complete(req)) {
             /* sending process is not completed, set proper OnDataAvail
@@ -1369,6 +1369,12 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             MPIU_ERR_POP(mpi_errno);
         }
 
+        /* send back the original data */
+        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+        mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_resp_pkt, sizeof(*fop_resp_pkt), &resp_req);
+        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+        MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
         /* Apply the op */
         mpi_errno = do_accumulate_op(fop_pkt->info.data, 1, fop_pkt->datatype,
                                      fop_pkt->addr, 1, fop_pkt->datatype, 0, fop_pkt->op);
@@ -1379,12 +1385,6 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
 
-        /* send back the original data */
-        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_resp_pkt, sizeof(*fop_resp_pkt), &resp_req);
-        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-        MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-
         if (resp_req != NULL) {
             if (!MPID_Request_is_complete(resp_req)) {
                 /* sending process is not completed, set proper OnDataAvail

http://git.mpich.org/mpich.git/commitdiff/139d85d502f89d81de4d88632b2cb315fc600769

commit 139d85d502f89d81de4d88632b2cb315fc600769
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Jun 21 17:45:55 2015 -0500

    Only make RMA progress when ISSUED active win or passive win exist.
    
    This optimization was missed in 7189bcde4875091fc35bfbec7faf2bb3cc78ee42.
    Here we add this back so that when there is no ISSUED active win
    or passive win, we ignore the while loop in RMA progress.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_progress.c b/src/mpid/ch3/src/ch3u_rma_progress.c
index 206ad50..df180bf 100644
--- a/src/mpid/ch3/src/ch3u_rma_progress.c
+++ b/src/mpid/ch3/src/ch3u_rma_progress.c
@@ -827,6 +827,9 @@ int MPIDI_CH3I_RMA_Make_progress_global(int *made_progress)
 
     (*made_progress) = 0;
 
+    if (MPIDI_CH3I_num_active_issued_win == 0 && MPIDI_CH3I_num_passive_win == 0)
+        goto fn_exit;
+
     for (win_elem = MPIDI_RMA_Win_list; win_elem; win_elem = win_elem->next) {
         int temp_progress = 0;
 

http://git.mpich.org/mpich.git/commitdiff/03b4a2034195c1d47be1b29a55513742915e763a

commit 03b4a2034195c1d47be1b29a55513742915e763a
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Wed Jun 24 23:26:55 2015 -0500

    Bug-fix on mpi/rma/manyget.c test.
    
    Originally the arguments passed to MPI_Win_create in this test
    were wrong. This patch fixes this issue.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/test/mpi/rma/manyget.c b/test/mpi/rma/manyget.c
index 176cc18..bf482e5 100644
--- a/test/mpi/rma/manyget.c
+++ b/test/mpi/rma/manyget.c
@@ -34,7 +34,7 @@ int main(int argc, char *argv[])
     }
 
     if (rank == 0)
-        MPI_Win_create(buf, sizeof(int), BUFSIZE / sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD,
+        MPI_Win_create(buf, BUFSIZE, sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD,
                        &win);
     else
         MPI_Win_create(MPI_BOTTOM, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win);

-----------------------------------------------------------------------

Summary of changes:
 src/mpi/coll/ibarrier.c                            |   24 +-
 .../ch3/channels/nemesis/include/mpid_nem_inline.h |    3 +
 src/mpid/ch3/include/mpid_rma_issue.h              |  692 ++++++++-----------
 src/mpid/ch3/include/mpid_rma_lockqueue.h          |   37 +-
 src/mpid/ch3/include/mpid_rma_oplist.h             |  106 ++--
 src/mpid/ch3/include/mpid_rma_types.h              |   34 +-
 src/mpid/ch3/include/mpidimpl.h                    |    4 +-
 src/mpid/ch3/include/mpidpkt.h                     |   78 ++-
 src/mpid/ch3/include/mpidpre.h                     |   28 +-
 src/mpid/ch3/include/mpidrma.h                     |  116 ++--
 src/mpid/ch3/src/ch3u_handle_recv_pkt.c            |    4 +-
 src/mpid/ch3/src/ch3u_handle_recv_req.c            |  439 +++++++------
 src/mpid/ch3/src/ch3u_request.c                    |    3 +-
 src/mpid/ch3/src/ch3u_rma_ops.c                    |   59 --
 src/mpid/ch3/src/ch3u_rma_pkthandler.c             |  425 ++++++------
 src/mpid/ch3/src/ch3u_rma_progress.c               |  251 ++++++--
 src/mpid/ch3/src/ch3u_rma_sync.c                   |  733 ++++++++++----------
 src/mpid/ch3/src/mpid_rma.c                        |   39 +-
 src/mpid/ch3/src/mpidi_printf.c                    |    9 +-
 src/mpid/ch3/src/mpidi_rma.c                       |   35 +-
 test/mpi/rma/manyget.c                             |    2 +-
 21 files changed, 1621 insertions(+), 1500 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list