[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.1.3-111-ged20cd3

Service Account noreply at mpich.org
Mon Nov 3 23:06:29 CST 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch master has been updated
       via  ed20cd3738d66c7baa65ccb15534559bb0598220 (commit)
       via  b682ec0e59f4e16d0212b4c2b8a1689d0b52ab2a (commit)
       via  cc63b3675fc0773cfe682570e69ba69d4bbbdaf0 (commit)
       via  0542e30407f86bda999cc307d894886bd1c563e2 (commit)
       via  7fbe72dd24c859380c77b5c58b63581f78188ec2 (commit)
       via  be3e5bdd9bb8ff1f2aac42074f5b361c3025f50a (commit)
       via  a42b916df8b9ee3b06cfdf05d252334e474abaa4 (commit)
       via  52c2fc112fe62c0280785c0c40f0885310875e41 (commit)
       via  e8d4c6d575ee69bcc8e342b7375ce8c061d3dc5b (commit)
       via  1c638a123cef60a81e9f7a91a8dbaa6968a9854c (commit)
       via  d129eed393874a2076dadf3d28d541b66a3af142 (commit)
       via  b73778ea104768f493165133ea4dc196c54c5d0c (commit)
       via  6578785d10e4e10fb0a46cf3f1f7ba731112f591 (commit)
       via  fe15ea266ef2e8a2d53565b88ab3ce4d4b9e02b6 (commit)
       via  c0094faadcede19c63a6d700e9230ead67780f06 (commit)
       via  bfbb10489eeff8fd251da576b613d153ffa33fe4 (commit)
       via  38b20e57086e7a0b87aa3fd5a4b29f793c0b789d (commit)
       via  257faca27b708b23716da17b4b799a268733f156 (commit)
       via  33d96690f53ce6bfa091ad9a36e299f1393f32aa (commit)
       via  5dd5515429c5f5fd9b7426f9f0956f16143e8aad (commit)
       via  7c1e12f0cfb0d4370375becf6c3cf4db9607323e (commit)
       via  41a365ec8806b5f53666d7693f5d50745e2458bf (commit)
       via  ab058906941ac3342deb1c7407ab9db88f6b7dfe (commit)
       via  ebee0b71021afeef13504a2ce38f76e2a555344b (commit)
       via  f91d46333ed6a414535780aeaee2075cf83c7ab3 (commit)
       via  f076f3febb5890cda36d2f1f06264c14495f4c4b (commit)
       via  7eac974fa6cd23ddf63457b0b964b3724fefa1f2 (commit)
       via  1d873639602a045120b35a960d9068ecc5b4647b (commit)
       via  079a516bf9233f5c8b560a99479e5bfb73f64a52 (commit)
       via  0f596c489a435d175af333f024195ca661b9b2d1 (commit)
       via  5dd8a0a4244d17cebebf7ddff4c687f87e13556d (commit)
      from  f4253c3872f18b226db416f11ed3db71ff66c620 (commit)

The revisions listed above that are new to this repository have
not appeared in any other notification email, so they are listed
in full below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/ed20cd3738d66c7baa65ccb15534559bb0598220

commit ed20cd3738d66c7baa65ccb15534559bb0598220
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Nov 3 20:31:05 2014 -0600

    add original RMA PVARs back.
    
    Add some original RMA PVARs back to the new
    RMA infrastructure, including timing of packet
    handlers, op allocation and setting, window
    creation, etc.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
index b28d50c..c786dcb 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
@@ -14,6 +14,8 @@
 #define MPIDI_CH3_PAGESIZE_MASK (~(MPIDI_CH3_PAGESIZE-1))
 #define MPIDI_CH3_ROUND_UP_PAGESIZE(x) ((((MPI_Aint)x)+(~MPIDI_CH3_PAGESIZE_MASK)) & MPIDI_CH3_PAGESIZE_MASK)
 
+MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_wincreate_allgather);
+
 MPIDI_SHM_Wins_list_t shm_wins_list;
 
 static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *info, MPID_Comm *comm_ptr,
@@ -315,6 +317,7 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
     node_size = node_comm_ptr->local_size;
     node_rank = node_comm_ptr->rank;
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
     /* allocate memory for the base addresses, disp_units, and
        completion counters of all processes */
     MPIU_CHKPMEM_MALLOC((*win_ptr)->base_addrs, void **,
@@ -347,6 +350,7 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
     mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                     tmp_buf, 3 * sizeof(MPI_Aint), MPI_BYTE,
                                     (*win_ptr)->comm_ptr, &errflag);
+    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 6186377..d73273f 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -21,6 +21,8 @@ extern struct MPIDI_RMA_Target *global_rma_target_pool, *global_rma_target_pool_
 extern int num_active_issued_win;
 extern int num_passive_win;
 
+MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_alloc);
+
 /* MPIDI_CH3I_Win_op_alloc(): get a new op element from op pool and
  * initialize it. If we cannot get one, return NULL. */
 #undef FUNCNAME
@@ -534,7 +536,9 @@ static inline int MPIDI_CH3I_Win_get_op(MPID_Win * win_ptr, MPIDI_RMA_Op_t **e)
     int mpi_errno = MPI_SUCCESS;
 
     while (1) {
+        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_alloc);
         new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
+        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_alloc);
         if (new_ptr != NULL) break;
 
         mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr,
@@ -542,7 +546,9 @@ static inline int MPIDI_CH3I_Win_get_op(MPID_Win * win_ptr, MPIDI_RMA_Op_t **e)
                                                    &remote_completed);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
+        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_alloc);
         new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
+        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_alloc);
         if (new_ptr != NULL) break;
 
         if (MPIDI_RMA_Pkt_orderings->flush_remote) {
@@ -550,7 +556,9 @@ static inline int MPIDI_CH3I_Win_get_op(MPID_Win * win_ptr, MPIDI_RMA_Op_t **e)
             if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
         }
 
+        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_alloc);
         new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
+        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_alloc);
         if (new_ptr != NULL) break;
 
         mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr);
diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index 3ff14e9..ff3cb17 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -1225,7 +1225,8 @@ void *MPIDI_Alloc_mem(size_t size, MPID_Info *info_ptr);
 int MPIDI_Free_mem(void *ptr);
 
 /* Pvars */
-void MPIDI_CH3_RMA_Init_Pvars(void);
+void MPIDI_CH3_RMA_Init_sync_pvars(void);
+void MPIDI_CH3_RMA_Init_pkthandler_pvars(void);
 
 /* internal */
 int MPIDI_CH3I_Release_lock(MPID_Win * win_ptr);
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index 7848b79..a939ed5 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -12,6 +12,8 @@
 #include "mpid_rma_shm.h"
 #include "mpid_rma_issue.h"
 
+MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_lockqueue_alloc);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_winlock_getlocallock);
 
 #undef FUNCNAME
 #define FUNCNAME send_lock_msg
@@ -226,7 +228,9 @@ static inline int enqueue_lock_origin(MPID_Win *win_ptr, MPIDI_CH3_Pkt_t *pkt)
     MPIDI_Win_lock_queue *new_ptr = NULL;
     int mpi_errno = MPI_SUCCESS;
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_lockqueue_alloc);
     new_ptr = (MPIDI_Win_lock_queue *) MPIU_Malloc(sizeof(MPIDI_Win_lock_queue));
+    MPIR_T_PVAR_TIMER_END(RMA, rma_lockqueue_alloc);
     if (!new_ptr) {
         MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
                              "MPIDI_Win_lock_queue");
@@ -278,6 +282,8 @@ static inline int acquire_local_lock(MPID_Win * win_ptr, int lock_type)
     MPIDI_STATE_DECL(MPID_STATE_ACQUIRE_LOCAL_LOCK);
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_ACQUIRE_LOCAL_LOCK);
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_winlock_getlocallock);
+
     if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, lock_type) == 1) {
         mpi_errno = set_lock_sync_counter(win_ptr, win_ptr->comm_ptr->rank);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
@@ -296,6 +302,7 @@ static inline int acquire_local_lock(MPID_Win * win_ptr, int lock_type)
     }
 
   fn_exit:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_winlock_getlocallock);
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ACQUIRE_LOCAL_LOCK);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index f915180..eb832cf 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -6,6 +6,8 @@
 
 #include "mpidrma.h"
 
+MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_set);
+
 #define MPIDI_PASSIVE_TARGET_DONE_TAG  348297
 #define MPIDI_PASSIVE_TARGET_RMA_TAG 563924
 
@@ -98,6 +100,8 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
+        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
+
         put_pkt = &(new_ptr->pkt.put);
         MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
         put_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
@@ -116,10 +120,14 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
         new_ptr->origin_datatype = origin_datatype;
         new_ptr->target_rank = target_rank;
 
+        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
+
         mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
 
+        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
+
         /* if source or target datatypes are derived, increment their
          * reference counts */
         if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
@@ -160,6 +168,8 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
             }
         }
 
+        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
+
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
@@ -263,6 +273,8 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
+        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
+
         get_pkt = &(new_ptr->pkt.get);
         MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
         get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
@@ -280,10 +292,14 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
         new_ptr->origin_datatype = origin_datatype;
         new_ptr->target_rank = target_rank;
 
+        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
+
         mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
 
+        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
+
         /* if source or target datatypes are derived, increment their
          * reference counts */
         if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
@@ -301,6 +317,8 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
             new_ptr->piggyback_lock_candidate = 1;
         }
 
+        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
+
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
@@ -405,6 +423,8 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
+        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
+
         accum_pkt = &(new_ptr->pkt.accum);
 
         MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
@@ -424,10 +444,14 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
         new_ptr->origin_datatype = origin_datatype;
         new_ptr->target_rank = target_rank;
 
+        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
+
         mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
 
+        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
+
         /* if source or target datatypes are derived, increment their
          * reference counts */
         if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
@@ -468,6 +492,8 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
             }
         }
 
+        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
+
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
@@ -576,6 +602,8 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
 
         /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */
 
+        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
+
         if (op == MPI_NO_OP) {
             /* Convert GAcc to a Get */
             MPIDI_CH3_Pkt_get_t *get_pkt = &(new_ptr->pkt.get);
@@ -678,6 +706,8 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
             }
         }
 
+        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
+
         mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
@@ -781,6 +811,8 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
+        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
+
         cas_pkt = &(new_ptr->pkt.cas);
         MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS);
         cas_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
@@ -800,6 +832,8 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
         new_ptr->target_rank = target_rank;
         new_ptr->piggyback_lock_candidate = 1; /* CAS is always able to piggyback LOCK */
 
+        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
+
         mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
@@ -900,6 +934,8 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
+        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
+
         if (op == MPI_NO_OP) {
             /* Convert FOP to a Get */
             MPIDI_CH3_Pkt_get_t *get_pkt = &(new_ptr->pkt.get);
@@ -958,6 +994,8 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             }
         }
 
+        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
+
         mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index f292980..31caba3 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -7,6 +7,169 @@
 #include "mpidimpl.h"
 #include "mpidrma.h"
 
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_put);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_get);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_acc);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_get_accum);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_cas);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_fop);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_get_resp);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_get_accum_resp);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_cas_resp);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_fop_resp);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_lock);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_lock_granted);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_unlock);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_flush);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_flush_ack);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmapkt_decr_at_cnt);
+
+void MPIDI_CH3_RMA_Init_pkthandler_pvars(void)
+{
+    /* rma_rmapkt_put */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_put,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Put (in seconds)");
+
+    /* rma_rmapkt_get */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_get,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Get (in seconds)");
+
+    /* rma_rmapkt_acc */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_acc,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Accumulate (in seconds)");
+
+    /* rma_rmapkt_get_accum */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_get_accum,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Get-Accumulate (in seconds)");
+
+    /* rma_rmapkt_cas */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_cas,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Compare-and-swap (in seconds)");
+
+    /* rma_rmapkt_fop */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_fop,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Fetch-and-op (in seconds)");
+
+    /* rma_rmapkt_get_resp */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_get_resp,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Get response (in seconds)");
+
+    /* rma_rmapkt_get_accum_resp */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_get_accum_resp,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Get-Accumulate response (in seconds)");
+
+    /* rma_rmapkt_cas_resp */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_cas_resp,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Compare-and-Swap response (in seconds)");
+
+    /* rma_rmapkt_fop_resp */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_fop_resp,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Fetch-and-op response (in seconds)");
+
+    /* rma_rmapkt_lock */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_lock,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Lock (in seconds)");
+
+    /* rma_rmapkt_lock_granted */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_lock_granted,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Lock-Granted (in seconds)");
+
+    /* rma_rmapkt_unlock */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_unlock,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Unlock (in seconds)");
+
+    /* rma_rmapkt_flush */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_flush,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Flush (in seconds)");
+
+    /* rma_rmapkt_flush_ack */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_flush_ack,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Flush-Ack (in seconds)");
+
+    /* rma_rmapkt_decr_at_cnt */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmapkt_decr_at_cnt,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "RMA:PKTHANDLER for Decr-At-Cnt (in seconds)");
+}
 
 /* ------------------------------------------------------------------------ */
 /*
@@ -37,6 +200,8 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received put pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_put);
+
     MPIU_Assert(put_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(put_pkt->target_win_handle, win_ptr);
 
@@ -174,6 +339,7 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
 
   fn_exit:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_put);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_PUT);
     return mpi_errno;
   fn_fail:
@@ -203,6 +369,8 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received get pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_get);
+
     MPIU_Assert(get_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(get_pkt->target_win_handle, win_ptr);
 
@@ -326,6 +494,7 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     }
   fn_exit:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_get);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_GET);
     return mpi_errno;
   fn_fail:
@@ -356,6 +525,8 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received accumulate pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_acc);
+
     MPIU_Assert(accum_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(accum_pkt->target_win_handle, win_ptr);
 
@@ -500,6 +671,7 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     }
 
   fn_exit:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_acc);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE);
     return mpi_errno;
   fn_fail:
@@ -531,6 +703,8 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received accumulate pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_get_accum);
+
     MPIU_Assert(get_accum_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(get_accum_pkt->target_win_handle, win_ptr);
 
@@ -677,6 +851,7 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     }
 
   fn_exit:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_get_accum);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_GETACCUMULATE);
     return mpi_errno;
   fn_fail:
@@ -706,6 +881,8 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received CAS pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_cas);
+
     MPIU_Assert(cas_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(cas_pkt->target_win_handle, win_ptr);
 
@@ -789,6 +966,7 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     }
 
   fn_exit:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_cas);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_CAS);
     return mpi_errno;
   fn_fail:
@@ -817,6 +995,8 @@ int MPIDI_CH3_PktHandler_CASResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received CAS response pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_cas_resp);
+
     MPID_Win_get_ptr(cas_resp_pkt->source_win_handle, win_ptr);
 
     /* decrement ack_counter on this target */
@@ -843,6 +1023,7 @@ int MPIDI_CH3_PktHandler_CASResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     *rreqp = NULL;
 
   fn_exit:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_cas_resp);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_CASRESP);
     return mpi_errno;
   fn_fail:
@@ -870,6 +1051,8 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received FOP pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_fop);
+
     MPID_Win_get_ptr(fop_pkt->target_win_handle, win_ptr);
 
     (*buflen) = sizeof(MPIDI_CH3_Pkt_t);
@@ -951,6 +1134,7 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     }
 
   fn_exit:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_fop);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_FOP);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
@@ -979,6 +1163,8 @@ int MPIDI_CH3_PktHandler_FOPResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received FOP response pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_fop_resp);
+
     MPID_Win_get_ptr(fop_resp_pkt->source_win_handle, win_ptr);
 
     /* Copy data to result buffer on orgin */
@@ -1004,6 +1190,7 @@ int MPIDI_CH3_PktHandler_FOPResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     *rreqp = NULL;
 
   fn_exit:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_fop_resp);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_FOPRESP);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
@@ -1035,6 +1222,8 @@ int MPIDI_CH3_PktHandler_Get_AccumResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received Get-Accumulate response pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_get_accum_resp);
+
     MPID_Win_get_ptr(get_accum_resp_pkt->source_win_handle, win_ptr);
 
     /* decrement ack_counter on target */
@@ -1071,6 +1260,7 @@ int MPIDI_CH3_PktHandler_Get_AccumResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
 
   fn_exit:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_get_accum_resp);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_GET_ACCUM_RESP);
     return mpi_errno;
   fn_fail:
@@ -1094,6 +1284,8 @@ int MPIDI_CH3_PktHandler_Lock(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received lock pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_lock);
+
     *buflen = sizeof(MPIDI_CH3_Pkt_t);
 
     MPID_Win_get_ptr(lock_pkt->target_win_handle, win_ptr);
@@ -1111,6 +1303,7 @@ int MPIDI_CH3_PktHandler_Lock(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     *rreqp = NULL;
   fn_fail:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_lock);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCK);
     return mpi_errno;
 }
@@ -1139,6 +1332,8 @@ int MPIDI_CH3_PktHandler_GetResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received get response pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_get_resp);
+
     MPID_Win_get_ptr(get_resp_pkt->source_win_handle, win_ptr);
 
     /* decrement ack_counter on target */
@@ -1175,6 +1370,7 @@ int MPIDI_CH3_PktHandler_GetResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
 
   fn_exit:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_get_resp);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_GETRESP);
     return mpi_errno;
   fn_fail:
@@ -1198,6 +1394,8 @@ int MPIDI_CH3_PktHandler_LockGranted(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received lock granted pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_lock_granted);
+
     *buflen = sizeof(MPIDI_CH3_Pkt_t);
 
     MPID_Win_get_ptr(lock_granted_pkt->source_win_handle, win_ptr);
@@ -1208,6 +1406,7 @@ int MPIDI_CH3_PktHandler_LockGranted(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     *rreqp = NULL;
     MPIDI_CH3_Progress_signal_completion();
 
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_lock_granted);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKGRANTED);
  fn_exit:
     return MPI_SUCCESS;
@@ -1232,6 +1431,8 @@ int MPIDI_CH3_PktHandler_FlushAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received shared lock ops done pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_flush_ack);
+
     *buflen = sizeof(MPIDI_CH3_Pkt_t);
 
     MPID_Win_get_ptr(flush_ack_pkt->source_win_handle, win_ptr);
@@ -1248,6 +1449,7 @@ int MPIDI_CH3_PktHandler_FlushAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     *rreqp = NULL;
     MPIDI_CH3_Progress_signal_completion();
 
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_flush_ack);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_FLUSHACK);
  fn_exit:
     return MPI_SUCCESS;
@@ -1271,6 +1473,8 @@ int MPIDI_CH3_PktHandler_DecrAtCnt(MPIDI_VC_t * vc ATTRIBUTE((unused)),
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_DECRATCNT);
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_decr_at_cnt);
+
     MPID_Win_get_ptr(decr_at_cnt_pkt->target_win_handle, win_ptr);
 
     win_ptr->at_completion_counter--;
@@ -1281,6 +1485,7 @@ int MPIDI_CH3_PktHandler_DecrAtCnt(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     MPIDI_CH3_Progress_signal_completion();
 
  fn_exit:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_decr_at_cnt);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_DECRATCNT);
     return mpi_errno;
    fn_fail:
@@ -1304,6 +1509,8 @@ int MPIDI_CH3_PktHandler_Unlock(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_UNLOCK);
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received unlock pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_unlock);
+
     *buflen = sizeof(MPIDI_CH3_Pkt_t);
     *rreqp = NULL;
 
@@ -1318,6 +1525,7 @@ int MPIDI_CH3_PktHandler_Unlock(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     MPIDI_CH3_Progress_signal_completion();
 
   fn_exit:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_unlock);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_UNLOCK);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
@@ -1342,6 +1550,8 @@ int MPIDI_CH3_PktHandler_Flush(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_FLUSH);
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received flush pkt");
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_rmapkt_flush);
+
     *buflen = sizeof(MPIDI_CH3_Pkt_t);
     *rreqp = NULL;
 
@@ -1352,6 +1562,7 @@ int MPIDI_CH3_PktHandler_Flush(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
   fn_exit:
+    MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_flush);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_FLUSH);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index 3a5a9f0..75a5e66 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -209,8 +209,59 @@
          PROC_SYNC with origin will see the latest data.
 */
 
-void MPIDI_CH3_RMA_Init_Pvars(void)
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_lockqueue_alloc);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_winlock_getlocallock);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_wincreate_allgather);
+
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmaqueue_alloc);
+MPIR_T_PVAR_DOUBLE_TIMER_DECL(RMA, rma_rmaqueue_set);
+
+void MPIDI_CH3_RMA_Init_sync_pvars(void)
 {
+    /* rma_lockqueue_alloc */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_lockqueue_alloc,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "Allocate Lock Queue element (in seconds)");
+
+    /* rma_winlock_getlocallock */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_winlock_getlocallock,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "WIN_LOCK:Get local lock (in seconds)");
+
+    /* rma_wincreate_allgather */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_wincreate_allgather,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "WIN_CREATE:Allgather (in seconds)");
+
+    /* rma_rmaqueue_alloc */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmaqueue_alloc,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "Allocate RMA Queue element (in seconds)");
+
+    /* rma_rmaqueue_set */
+    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
+                                      MPI_DOUBLE,
+                                      rma_rmaqueue_set,
+                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
+                                      MPI_T_BIND_NO_OBJECT,
+                                      MPIR_T_PVAR_FLAG_READONLY,
+                                      "RMA", "Set fields in RMA Queue element (in seconds)");
 }
 
 /* These are used to use a common routine to complete lists of RMA
diff --git a/src/mpid/ch3/src/ch3u_win_fns.c b/src/mpid/ch3/src/ch3u_win_fns.c
index eb9398b..7beaafe 100644
--- a/src/mpid/ch3/src/ch3u_win_fns.c
+++ b/src/mpid/ch3/src/ch3u_win_fns.c
@@ -8,6 +8,8 @@
 #include "mpiinfo.h"
 #include "mpidrma.h"
 
+MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_wincreate_allgather);
+
 #undef FUNCNAME
 #define FUNCNAME MPIDI_Win_fns_init
 #undef FCNAME
@@ -52,6 +54,7 @@ int MPIDI_CH3U_Win_create_gather( void *base, MPI_Aint size, int disp_unit,
     /* RMA handlers should be set before calling this function */
     mpi_errno = (*win_ptr)->RMAFns.Win_set_info(*win_ptr, info);
 
+    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
     /* allocate memory for the base addresses, disp_units, and
        completion counters of all processes */
     MPIU_CHKPMEM_MALLOC((*win_ptr)->base_addrs, void **,
@@ -84,6 +87,7 @@ int MPIDI_CH3U_Win_create_gather( void *base, MPI_Aint size, int disp_unit,
     mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                     tmp_buf, 4, MPI_AINT,
                                     (*win_ptr)->comm_ptr, &errflag);
+    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
     if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
     MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index 8846284..210be92 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -290,7 +290,8 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     if (initRMAoptions) {
         MPIU_THREADSAFE_INIT_BLOCK_BEGIN(initRMAoptions);
 
-        MPIDI_CH3_RMA_Init_Pvars();
+        MPIDI_CH3_RMA_Init_sync_pvars();
+        MPIDI_CH3_RMA_Init_pkthandler_pvars();
 
         MPIU_THREADSAFE_INIT_CLEAR(initRMAoptions);
         MPIU_THREADSAFE_INIT_BLOCK_END(initRMAoptions);

http://git.mpich.org/mpich.git/commitdiff/b682ec0e59f4e16d0212b4c2b8a1689d0b52ab2a

commit b682ec0e59f4e16d0212b4c2b8a1689d0b52ab2a
Author: Pavan Balaji <balaji at anl.gov>
Date:   Sun Jul 20 15:28:10 2014 -0500

    Remove namespacing for static functions and types.
    
    Names of static functions and types need not have
    namespacing. Here we remove the MPIDI_CH3I_ prefix from
    those functions and types.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_reqops.c b/src/mpid/ch3/src/ch3u_rma_reqops.c
index b6942a6..7d6904a 100644
--- a/src/mpid/ch3/src/ch3u_rma_reqops.c
+++ b/src/mpid/ch3/src/ch3u_rma_reqops.c
@@ -16,17 +16,17 @@ typedef struct {
     MPID_Request *request;
     MPID_Win *win_ptr;
     int target_rank;
-} MPIDI_CH3I_Rma_req_state_t;
+} req_state_t;
 
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_Rma_req_poll
+#define FUNCNAME req_poll
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int MPIDI_CH3I_Rma_req_poll(void *state, MPI_Status * status)
+static int req_poll(void *state, MPI_Status * status)
 {
     int mpi_errno = MPI_SUCCESS;
-    MPIDI_CH3I_Rma_req_state_t *req_state = (MPIDI_CH3I_Rma_req_state_t *) state;
+    req_state_t *req_state = (req_state_t *) state;
 
     MPIU_UNREFERENCED_ARG(status);
 
@@ -56,17 +56,17 @@ static int MPIDI_CH3I_Rma_req_poll(void *state, MPI_Status * status)
 
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_Rma_req_wait
+#define FUNCNAME req_wait
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int MPIDI_CH3I_Rma_req_wait(int count, void **states, double timeout, MPI_Status * status)
+static int req_wait(int count, void **states, double timeout, MPI_Status * status)
 {
     int mpi_errno = MPI_SUCCESS;
     int i;
 
     for (i = 0; i < count; i++) {
         /* Call poll to complete the operation */
-        mpi_errno = MPIDI_CH3I_Rma_req_poll(states[i], status);
+        mpi_errno = req_poll(states[i], status);
         if (mpi_errno != MPI_SUCCESS) {
             MPIU_ERR_POP(mpi_errno);
         }
@@ -80,10 +80,10 @@ static int MPIDI_CH3I_Rma_req_wait(int count, void **states, double timeout, MPI
 
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_Rma_req_query
+#define FUNCNAME req_query
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int MPIDI_CH3I_Rma_req_query(void *state, MPI_Status * status)
+static int req_query(void *state, MPI_Status * status)
 {
     int mpi_errno = MPI_SUCCESS;
 
@@ -104,10 +104,10 @@ static int MPIDI_CH3I_Rma_req_query(void *state, MPI_Status * status)
 
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_Rma_req_free
+#define FUNCNAME req_free
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int MPIDI_CH3I_Rma_req_free(void *state)
+static int req_free(void *state)
 {
     MPIU_Free(state);
 
@@ -116,10 +116,10 @@ static int MPIDI_CH3I_Rma_req_free(void *state)
 
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_Rma_req_cancel
+#define FUNCNAME req_cancel
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int MPIDI_CH3I_Rma_req_cancel(void *state, int complete)
+static int req_cancel(void *state, int complete)
 {
     int mpi_errno = MPI_SUCCESS;
 
@@ -151,7 +151,7 @@ int MPIDI_Rput(const void *origin_addr, int origin_count,
     MPID_Datatype *dtp;
     MPI_Aint dt_true_lb ATTRIBUTE((unused));
     MPIDI_msg_sz_t data_sz;
-    MPIDI_CH3I_Rma_req_state_t *req_state;
+    req_state_t *req_state;
     MPIDI_VC_t *orig_vc, *target_vc;
     MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_RPUT);
@@ -165,8 +165,8 @@ int MPIDI_Rput(const void *origin_addr, int origin_count,
                         win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_GRANTED,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    MPIU_CHKPMEM_MALLOC(req_state, MPIDI_CH3I_Rma_req_state_t *,
-                        sizeof(MPIDI_CH3I_Rma_req_state_t), mpi_errno, "req-based RMA state");
+    MPIU_CHKPMEM_MALLOC(req_state, req_state_t *,
+                        sizeof(req_state_t), mpi_errno, "req-based RMA state");
 
     req_state->win_ptr = win_ptr;
     req_state->target_rank = target_rank;
@@ -193,10 +193,8 @@ int MPIDI_Rput(const void *origin_addr, int origin_count,
     if (target_rank == MPI_PROC_NULL || target_rank == win_ptr->comm_ptr->rank ||
         (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id) || data_sz == 0)
     {
-        mpi_errno = MPIR_Grequest_start_impl(MPIDI_CH3I_Rma_req_query,
-                                             MPIDI_CH3I_Rma_req_free,
-                                             MPIDI_CH3I_Rma_req_cancel,
-                                             req_state, &req_state->request);
+        mpi_errno = MPIR_Grequest_start_impl(req_query,
+                                             req_free, req_cancel, req_state, &req_state->request);
         if (mpi_errno != MPI_SUCCESS) {
             MPIU_ERR_POP(mpi_errno);
         }
@@ -204,12 +202,10 @@ int MPIDI_Rput(const void *origin_addr, int origin_count,
         MPIR_Grequest_complete_impl(req_state->request);
     }
     else {
-        mpi_errno = MPIX_Grequest_start_impl(MPIDI_CH3I_Rma_req_query,
-                                             MPIDI_CH3I_Rma_req_free,
-                                             MPIDI_CH3I_Rma_req_cancel,
-                                             MPIDI_CH3I_Rma_req_poll,
-                                             MPIDI_CH3I_Rma_req_wait,
-                                             req_state, &req_state->request);
+        mpi_errno = MPIX_Grequest_start_impl(req_query,
+                                             req_free,
+                                             req_cancel,
+                                             req_poll, req_wait, req_state, &req_state->request);
 
         if (mpi_errno != MPI_SUCCESS) {
             MPIU_ERR_POP(mpi_errno);
@@ -241,7 +237,7 @@ int MPIDI_Rget(void *origin_addr, int origin_count,
     MPID_Datatype *dtp;
     MPI_Aint dt_true_lb ATTRIBUTE((unused));
     MPIDI_msg_sz_t data_sz;
-    MPIDI_CH3I_Rma_req_state_t *req_state;
+    req_state_t *req_state;
     MPIDI_VC_t *orig_vc, *target_vc;
     MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_RGET);
@@ -255,8 +251,8 @@ int MPIDI_Rget(void *origin_addr, int origin_count,
                         win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_GRANTED,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    MPIU_CHKPMEM_MALLOC(req_state, MPIDI_CH3I_Rma_req_state_t *,
-                        sizeof(MPIDI_CH3I_Rma_req_state_t), mpi_errno, "req-based RMA state");
+    MPIU_CHKPMEM_MALLOC(req_state, req_state_t *,
+                        sizeof(req_state_t), mpi_errno, "req-based RMA state");
 
     req_state->win_ptr = win_ptr;
     req_state->target_rank = target_rank;
@@ -283,10 +279,8 @@ int MPIDI_Rget(void *origin_addr, int origin_count,
     if (target_rank == MPI_PROC_NULL || target_rank == win_ptr->comm_ptr->rank ||
         (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id) || data_sz == 0)
     {
-        mpi_errno = MPIR_Grequest_start_impl(MPIDI_CH3I_Rma_req_query,
-                                             MPIDI_CH3I_Rma_req_free,
-                                             MPIDI_CH3I_Rma_req_cancel,
-                                             req_state, &req_state->request);
+        mpi_errno = MPIR_Grequest_start_impl(req_query,
+                                             req_free, req_cancel, req_state, &req_state->request);
         if (mpi_errno != MPI_SUCCESS) {
             MPIU_ERR_POP(mpi_errno);
         }
@@ -294,12 +288,10 @@ int MPIDI_Rget(void *origin_addr, int origin_count,
         MPIR_Grequest_complete_impl(req_state->request);
     }
     else {
-        mpi_errno = MPIX_Grequest_start_impl(MPIDI_CH3I_Rma_req_query,
-                                             MPIDI_CH3I_Rma_req_free,
-                                             MPIDI_CH3I_Rma_req_cancel,
-                                             MPIDI_CH3I_Rma_req_poll,
-                                             MPIDI_CH3I_Rma_req_wait,
-                                             req_state, &req_state->request);
+        mpi_errno = MPIX_Grequest_start_impl(req_query,
+                                             req_free,
+                                             req_cancel,
+                                             req_poll, req_wait, req_state, &req_state->request);
 
         if (mpi_errno != MPI_SUCCESS) {
             MPIU_ERR_POP(mpi_errno);
@@ -331,7 +323,7 @@ int MPIDI_Raccumulate(const void *origin_addr, int origin_count,
     MPID_Datatype *dtp;
     MPI_Aint dt_true_lb ATTRIBUTE((unused));
     MPIDI_msg_sz_t data_sz;
-    MPIDI_CH3I_Rma_req_state_t *req_state;
+    req_state_t *req_state;
     MPIDI_VC_t *orig_vc, *target_vc;
     MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_RACCUMULATE);
@@ -345,8 +337,8 @@ int MPIDI_Raccumulate(const void *origin_addr, int origin_count,
                         win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_GRANTED,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    MPIU_CHKPMEM_MALLOC(req_state, MPIDI_CH3I_Rma_req_state_t *,
-                        sizeof(MPIDI_CH3I_Rma_req_state_t), mpi_errno, "req-based RMA state");
+    MPIU_CHKPMEM_MALLOC(req_state, req_state_t *,
+                        sizeof(req_state_t), mpi_errno, "req-based RMA state");
 
     req_state->win_ptr = win_ptr;
     req_state->target_rank = target_rank;
@@ -373,10 +365,8 @@ int MPIDI_Raccumulate(const void *origin_addr, int origin_count,
     if (target_rank == MPI_PROC_NULL || target_rank == win_ptr->comm_ptr->rank ||
         (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id) || data_sz == 0)
     {
-        mpi_errno = MPIR_Grequest_start_impl(MPIDI_CH3I_Rma_req_query,
-                                             MPIDI_CH3I_Rma_req_free,
-                                             MPIDI_CH3I_Rma_req_cancel,
-                                             req_state, &req_state->request);
+        mpi_errno = MPIR_Grequest_start_impl(req_query,
+                                             req_free, req_cancel, req_state, &req_state->request);
         if (mpi_errno != MPI_SUCCESS) {
             MPIU_ERR_POP(mpi_errno);
         }
@@ -384,12 +374,10 @@ int MPIDI_Raccumulate(const void *origin_addr, int origin_count,
         MPIR_Grequest_complete_impl(req_state->request);
     }
     else {
-        mpi_errno = MPIX_Grequest_start_impl(MPIDI_CH3I_Rma_req_query,
-                                             MPIDI_CH3I_Rma_req_free,
-                                             MPIDI_CH3I_Rma_req_cancel,
-                                             MPIDI_CH3I_Rma_req_poll,
-                                             MPIDI_CH3I_Rma_req_wait,
-                                             req_state, &req_state->request);
+        mpi_errno = MPIX_Grequest_start_impl(req_query,
+                                             req_free,
+                                             req_cancel,
+                                             req_poll, req_wait, req_state, &req_state->request);
 
         if (mpi_errno != MPI_SUCCESS) {
             MPIU_ERR_POP(mpi_errno);
@@ -422,7 +410,7 @@ int MPIDI_Rget_accumulate(const void *origin_addr, int origin_count,
     MPID_Datatype *dtp;
     MPI_Aint dt_true_lb ATTRIBUTE((unused));
     MPIDI_msg_sz_t data_sz, trg_data_sz;
-    MPIDI_CH3I_Rma_req_state_t *req_state;
+    req_state_t *req_state;
     MPIDI_VC_t *orig_vc, *target_vc;
     MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_RGET_ACCUMULATE);
@@ -436,8 +424,8 @@ int MPIDI_Rget_accumulate(const void *origin_addr, int origin_count,
                         win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_GRANTED,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    MPIU_CHKPMEM_MALLOC(req_state, MPIDI_CH3I_Rma_req_state_t *,
-                        sizeof(MPIDI_CH3I_Rma_req_state_t), mpi_errno, "req-based RMA state");
+    MPIU_CHKPMEM_MALLOC(req_state, req_state_t *,
+                        sizeof(req_state_t), mpi_errno, "req-based RMA state");
 
     req_state->win_ptr = win_ptr;
     req_state->target_rank = target_rank;
@@ -467,10 +455,8 @@ int MPIDI_Rget_accumulate(const void *origin_addr, int origin_count,
     if (target_rank == MPI_PROC_NULL || target_rank == win_ptr->comm_ptr->rank ||
         (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id) ||
         (data_sz == 0 && trg_data_sz == 0)) {
-        mpi_errno = MPIR_Grequest_start_impl(MPIDI_CH3I_Rma_req_query,
-                                             MPIDI_CH3I_Rma_req_free,
-                                             MPIDI_CH3I_Rma_req_cancel,
-                                             req_state, &req_state->request);
+        mpi_errno = MPIR_Grequest_start_impl(req_query,
+                                             req_free, req_cancel, req_state, &req_state->request);
         if (mpi_errno != MPI_SUCCESS) {
             MPIU_ERR_POP(mpi_errno);
         }
@@ -478,12 +464,10 @@ int MPIDI_Rget_accumulate(const void *origin_addr, int origin_count,
         MPIR_Grequest_complete_impl(req_state->request);
     }
     else {
-        mpi_errno = MPIX_Grequest_start_impl(MPIDI_CH3I_Rma_req_query,
-                                             MPIDI_CH3I_Rma_req_free,
-                                             MPIDI_CH3I_Rma_req_cancel,
-                                             MPIDI_CH3I_Rma_req_poll,
-                                             MPIDI_CH3I_Rma_req_wait,
-                                             req_state, &req_state->request);
+        mpi_errno = MPIX_Grequest_start_impl(req_query,
+                                             req_free,
+                                             req_cancel,
+                                             req_poll, req_wait, req_state, &req_state->request);
 
         if (mpi_errno != MPI_SUCCESS) {
             MPIU_ERR_POP(mpi_errno);

http://git.mpich.org/mpich.git/commitdiff/cc63b3675fc0773cfe682570e69ba69d4bbbdaf0

commit cc63b3675fc0773cfe682570e69ba69d4bbbdaf0
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Nov 2 22:51:21 2014 -0600

    Delete no longer needed code.
    
    We made a huge change to the RMA infrastructure, so
    a lot of old code can be dropped, including separate
    handlers for lock-op-unlock, ACCUM_IMMED-specific
    code, O(p) data structure code, lazy-issuing code,
    etc.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index e8869da..6186377 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -566,42 +566,6 @@ static inline int MPIDI_CH3I_Win_get_op(MPID_Win * win_ptr, MPIDI_RMA_Op_t **e)
 }
 
 
-/* Return nonzero if the RMA operations list is empty.
- */
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_RMA_Ops_isempty
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int MPIDI_CH3I_RMA_Ops_isempty(MPIDI_RMA_Ops_list_t * list)
-{
-    return *list == NULL;
-}
-
-
-/* Return a pointer to the first element in the list.
- */
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_RMA_Ops_head
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline MPIDI_RMA_Op_t *MPIDI_CH3I_RMA_Ops_head(MPIDI_RMA_Ops_list_t * list)
-{
-    return *list;
-}
-
-
-/* Return a pointer to the last element in the list.
- */
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_RMA_Ops_tail
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline MPIDI_RMA_Op_t *MPIDI_CH3I_RMA_Ops_tail(MPIDI_RMA_Ops_list_t * list_tail)
-{
-    return (*list_tail);
-}
-
-
 /* Append an element to the tail of the RMA ops list
  *
  * @param IN    list      Pointer to the RMA ops list
@@ -618,38 +582,6 @@ static inline void MPIDI_CH3I_RMA_Ops_append(MPIDI_RMA_Ops_list_t * list, MPIDI_
 }
 
 
-/* Allocate a new element on the tail of the RMA operations list.
- *
- * @param IN    list      Pointer to the RMA ops list
- * @param OUT   new_ptr   Pointer to the element that was allocated
- * @return                MPI error class
- */
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_RMA_Ops_alloc_tail
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int MPIDI_CH3I_RMA_Ops_alloc_tail(MPID_Win * win_ptr, MPIDI_RMA_Ops_list_t * list,
-                                                MPIDI_RMA_Ops_list_t * list_tail,
-                                                MPIDI_RMA_Op_t ** new_elem)
-{
-    int mpi_errno = MPI_SUCCESS;
-    MPIDI_RMA_Op_t *tmp_ptr;
-
-    mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &tmp_ptr);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
-    MPL_LL_APPEND(*list, *list_tail, tmp_ptr);
-
-    *new_elem = tmp_ptr;
-
-  fn_exit:
-    return mpi_errno;
-  fn_fail:
-    *new_elem = NULL;
-    goto fn_exit;
-}
-
-
 /* Unlink an element from the RMA ops list
  *
  * @param IN    list      Pointer to the RMA ops list
@@ -689,78 +621,4 @@ static inline void MPIDI_CH3I_RMA_Ops_free_elem(MPID_Win * win_ptr, MPIDI_RMA_Op
 }
 
 
-/* Free an element in the RMA operations list.
- *
- * @param IN    list      Pointer to the RMA ops list
- * @param INOUT curr_ptr  Pointer to the element to be freed.  Will be updated
- *                        to point to the element following the element that
- *                        was freed.
- */
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_RMA_Ops_free_and_next
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline void MPIDI_CH3I_RMA_Ops_free_and_next(MPID_Win * win_ptr, MPIDI_RMA_Ops_list_t * list,
-                                                    MPIDI_RMA_Ops_list_t * list_tail,
-                                                    MPIDI_RMA_Op_t ** curr_ptr)
-{
-    MPIDI_RMA_Op_t *next_ptr = (*curr_ptr)->next;
-
-    MPIDI_CH3I_RMA_Ops_free_elem(win_ptr, list, list_tail, *curr_ptr);
-    *curr_ptr = next_ptr;
-}
-
-
-/* Free the entire RMA operations list.
- */
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_RMA_Ops_free
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline void MPIDI_CH3I_RMA_Ops_free(MPID_Win * win_ptr, MPIDI_RMA_Ops_list_t * list,
-                                           MPIDI_RMA_Ops_list_t * list_tail)
-{
-    MPIDI_RMA_Op_t *curr_ptr, *tmp_ptr;
-
-    MPL_LL_FOREACH_SAFE(*list, curr_ptr, tmp_ptr) {
-        MPIDI_CH3I_RMA_Ops_free_elem(win_ptr, list, list_tail, curr_ptr);
-    }
-}
-
-
-/* Retrieve the RMA ops list pointer from the window.  This routine detects
- * whether we are in an active or passive target epoch and returns the correct
- * ops list; we use a shared list for active target and separate per-target
- * lists for passive target.
- */
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_RMA_Get_ops_list
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline MPIDI_RMA_Ops_list_t *MPIDI_CH3I_RMA_Get_ops_list(MPID_Win * win_ptr, int target)
-{
-    if (win_ptr->epoch_state == MPIDI_EPOCH_FENCE ||
-        win_ptr->epoch_state == MPIDI_EPOCH_START || win_ptr->epoch_state == MPIDI_EPOCH_PSCW) {
-        return &win_ptr->at_rma_ops_list;
-    }
-    else {
-        return &win_ptr->targets[target].rma_ops_list;
-    }
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3I_RMA_Get_ops_list
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline MPIDI_RMA_Ops_list_t *MPIDI_CH3I_RMA_Get_ops_list_tail(MPID_Win * win_ptr, int target)
-{
-    if (win_ptr->epoch_state == MPIDI_EPOCH_FENCE ||
-        win_ptr->epoch_state == MPIDI_EPOCH_START || win_ptr->epoch_state == MPIDI_EPOCH_PSCW) {
-        return &win_ptr->at_rma_ops_list_tail;
-    }
-    else {
-        return &win_ptr->targets[target].rma_ops_list_tail;
-    }
-}
-
 #endif /* MPID_RMA_OPLIST_H_INCLUDED */
diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index 2755266..3ff14e9 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -1230,9 +1230,6 @@ void MPIDI_CH3_RMA_Init_Pvars(void);
 /* internal */
 int MPIDI_CH3I_Release_lock(MPID_Win * win_ptr);
 int MPIDI_CH3I_Try_acquire_win_lock(MPID_Win * win_ptr, int requested_lock);
-int MPIDI_CH3_Start_rma_op_target(MPID_Win *win_ptr, MPIDI_CH3_Pkt_flags_t flags);
-int MPIDI_CH3_Finish_rma_op_target(MPIDI_VC_t *vc, MPID_Win *win_ptr, int is_rma_update,
-                                   MPIDI_CH3_Pkt_flags_t flags, MPI_Win source_win_handle);
 
 int MPIDI_CH3I_Progress_finalize(void);
 
@@ -1789,8 +1786,6 @@ int MPIDI_CH3_PktHandler_Accumulate( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
 				     MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_GetAccumulate( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
                                         MPIDI_msg_sz_t *, MPID_Request ** );
-int MPIDI_CH3_PktHandler_Accumulate_Immed( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
-				     MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_CAS( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
                               MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_CASResp( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
@@ -1817,12 +1812,6 @@ int MPIDI_CH3_PktHandler_FlushAck( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
 				    MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_DecrAtCnt( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
                                     MPIDI_msg_sz_t *, MPID_Request ** );
-int MPIDI_CH3_PktHandler_LockPutUnlock( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
-					MPIDI_msg_sz_t *, MPID_Request ** );
-int MPIDI_CH3_PktHandler_LockAccumUnlock( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
-					  MPIDI_msg_sz_t *, MPID_Request ** );
-int MPIDI_CH3_PktHandler_LockGetUnlock( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
-					MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_FlowCntlUpdate( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
 					 MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_Close( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
@@ -1905,8 +1894,6 @@ int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete( MPIDI_VC_t *,
 int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete( MPIDI_VC_t *,
                                                       MPID_Request *,
                                                       int * );
-int MPIDI_CH3_ReqHandler_SinglePutAccumComplete( MPIDI_VC_t *, MPID_Request *,
-						 int * );
 int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *,
 						   MPID_Request *, int * );
 /* Send Handlers */
diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index 9f12b38..25b890f 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -79,7 +79,6 @@ typedef enum {
     MPIDI_CH3_PKT_GET,
     MPIDI_CH3_PKT_ACCUMULATE,
     MPIDI_CH3_PKT_GET_ACCUM,
-    MPIDI_CH3_PKT_ACCUM_IMMED,  /* optimization for short accumulate */
     MPIDI_CH3_PKT_FOP,
     MPIDI_CH3_PKT_CAS,
     MPIDI_CH3_PKT_GET_RESP,
@@ -92,9 +91,6 @@ typedef enum {
     MPIDI_CH3_PKT_FLUSH,
     MPIDI_CH3_PKT_FLUSH_ACK,
     MPIDI_CH3_PKT_DECR_AT_COUNTER,
-    MPIDI_CH3_PKT_LOCK_PUT_UNLOCK,      /* optimization for single puts */
-    MPIDI_CH3_PKT_LOCK_GET_UNLOCK,      /* optimization for single gets */
-    MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK,    /* optimization for single accumulates */
     /* RMA Packets end here */
     MPIDI_CH3_PKT_FLOW_CNTL_UPDATE,     /* FIXME: Unused */
     MPIDI_CH3_PKT_CLOSE,
@@ -213,18 +209,6 @@ MPIDI_CH3_PKT_DEFS
         case (MPIDI_CH3_PKT_FOP):                                       \
             datatype_ = (pkt_).fop.datatype;                            \
             break;                                                      \
-        case (MPIDI_CH3_PKT_LOCK_PUT_UNLOCK):                           \
-            datatype_ = pkt_.lock_put_unlock.datatype;                  \
-            break;                                                      \
-        case (MPIDI_CH3_PKT_LOCK_GET_UNLOCK):                           \
-            datatype_ = pkt_.lock_get_unlock.datatype;                  \
-            break;                                                      \
-        case (MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK):                         \
-            datatype_ = pkt_.lock_accum_unlock.datatype;                \
-            break;                                                      \
-        case (MPIDI_CH3_PKT_ACCUM_IMMED):                               \
-            datatype_ = pkt_.accum_immed.datatype;                      \
-            break;                                                      \
         default:                                                        \
             MPIU_ERR_SETANDJUMP1(err_, MPI_ERR_OTHER, "**invalidpkt", "**invalidpkt %d", (pkt_).type); \
         }                                                               \
@@ -493,25 +477,6 @@ typedef struct MPIDI_CH3_Pkt_get_accum_resp {
     MPIDI_CH3_Pkt_flags_t flags;
 } MPIDI_CH3_Pkt_get_accum_resp_t;
 
-typedef struct MPIDI_CH3_Pkt_accum_immed {
-    MPIDI_CH3_Pkt_type_t type;
-    MPIDI_CH3_Pkt_flags_t flags;
-    void *addr;
-    int count;
-    /* FIXME: Compress datatype/op into a single word (immedate mode) */
-    MPI_Datatype datatype;
-    MPI_Op op;
-    /* FIXME: do we need these (use a regular accum packet if we do?) */
-    MPI_Win target_win_handle;  /* Used in the last RMA operation in each
-                                 * epoch for decrementing rma op counter in
-                                 * active target rma and for unlocking window
-                                 * in passive target rma. Otherwise set to NULL*/
-    MPI_Win source_win_handle;  /* Used in the last RMA operation in an
-                                 * epoch in the case of passive target rma
-                                 * with shared locks. Otherwise set to NULL*/
-    int data[MPIDI_RMA_IMMED_INTS];
-} MPIDI_CH3_Pkt_accum_immed_t;
-
 typedef struct MPIDI_CH3_Pkt_cas {
     MPIDI_CH3_Pkt_type_t type;
     MPIDI_CH3_Pkt_flags_t flags;
@@ -618,45 +583,6 @@ typedef struct MPIDI_CH3_Pkt_decr_at_counter {
     MPI_Win target_win_handle;
 } MPIDI_CH3_Pkt_decr_at_counter_t;
 
-typedef struct MPIDI_CH3_Pkt_lock_put_unlock {
-    MPIDI_CH3_Pkt_type_t type;
-    MPIDI_CH3_Pkt_flags_t flags;
-    MPI_Win target_win_handle;
-    MPI_Win source_win_handle;
-    int origin_rank;
-    int lock_type;
-    void *addr;
-    int count;
-    MPI_Datatype datatype;
-} MPIDI_CH3_Pkt_lock_put_unlock_t;
-
-typedef struct MPIDI_CH3_Pkt_lock_get_unlock {
-    MPIDI_CH3_Pkt_type_t type;
-    MPIDI_CH3_Pkt_flags_t flags;
-    MPI_Win target_win_handle;
-    MPI_Win source_win_handle;
-    int origin_rank;
-    int lock_type;
-    void *addr;
-    int count;
-    MPI_Datatype datatype;
-    MPI_Request request_handle;
-} MPIDI_CH3_Pkt_lock_get_unlock_t;
-
-typedef struct MPIDI_CH3_Pkt_lock_accum_unlock {
-    MPIDI_CH3_Pkt_type_t type;
-    MPIDI_CH3_Pkt_flags_t flags;
-    MPI_Win target_win_handle;
-    MPI_Win source_win_handle;
-    int origin_rank;
-    int lock_type;
-    void *addr;
-    int count;
-    MPI_Datatype datatype;
-    MPI_Op op;
-} MPIDI_CH3_Pkt_lock_accum_unlock_t;
-
-
 typedef struct MPIDI_CH3_Pkt_close {
     MPIDI_CH3_Pkt_type_t type;
     int ack;
@@ -685,7 +611,6 @@ typedef union MPIDI_CH3_Pkt {
     MPIDI_CH3_Pkt_get_t get;
     MPIDI_CH3_Pkt_get_resp_t get_resp;
     MPIDI_CH3_Pkt_accum_t accum;
-    MPIDI_CH3_Pkt_accum_immed_t accum_immed;
     MPIDI_CH3_Pkt_get_accum_t get_accum;
     MPIDI_CH3_Pkt_lock_t lock;
     MPIDI_CH3_Pkt_lock_granted_t lock_granted;
@@ -693,9 +618,6 @@ typedef union MPIDI_CH3_Pkt {
     MPIDI_CH3_Pkt_flush_t flush;
     MPIDI_CH3_Pkt_flush_ack_t flush_ack;
     MPIDI_CH3_Pkt_decr_at_counter_t decr_at_cnt;
-    MPIDI_CH3_Pkt_lock_put_unlock_t lock_put_unlock;
-    MPIDI_CH3_Pkt_lock_get_unlock_t lock_get_unlock;
-    MPIDI_CH3_Pkt_lock_accum_unlock_t lock_accum_unlock;
     MPIDI_CH3_Pkt_close_t close;
     MPIDI_CH3_Pkt_cas_t cas;
     MPIDI_CH3_Pkt_cas_resp_t cas_resp;
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index c46f98c..b193256 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -311,22 +311,6 @@ typedef struct MPIDI_RMA_Pkt_orderings {
 
 extern MPIDI_RMA_Pkt_orderings_t *MPIDI_RMA_Pkt_orderings;
 
-struct MPIDI_Win_target_state {
-    struct MPIDI_RMA_Op *rma_ops_list;
-                                /* List of outstanding RMA operations */
-    struct MPIDI_RMA_Op *rma_ops_list_tail;
-    volatile enum MPIDI_CH3_Lock_states remote_lock_state;
-                                /* Indicates the state of the target
-                                   process' "lock" for passive target
-                                   RMA. */
-    int remote_lock_mode;       /* Indicates the access mode
-                                   (shared/exclusive) of the target
-                                   process for passive target RMA. Valid
-                                   whenever state != NONE. */
-    int remote_lock_assert;     /* Assertion value provided in the call
-                                   to Lock */
-};
-
 #define MPIDI_DEV_WIN_DECL                                               \
     volatile int at_completion_counter;  /* completion counter for operations \
                                  targeting this window */                \
@@ -346,14 +330,6 @@ struct MPIDI_Win_target_state {
                                                                          \
     MPI_Aint *sizes;      /* array of sizes of all windows */            \
     struct MPIDI_Win_info_args info_args;                                \
-    struct MPIDI_Win_target_state *targets; /* Target state and ops      \
-                                               lists for passive target  \
-                                               mode of operation */      \
-    struct MPIDI_RMA_Op *at_rma_ops_list; /* Ops list for active target  \
-                                             mode of operation. */       \
-    struct MPIDI_RMA_Op *at_rma_ops_list_tail;                           \
-    enum MPIDI_Win_epoch_states epoch_state;                             \
-    int epoch_count;                                                     \
     int shm_allocated; /* flag: TRUE iff this window has a shared memory \
                           region associated with it */                   \
     struct MPIDI_RMA_Op *op_pool_start; /* start pointer used for freeing */\
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index a5c6578..7848b79 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -12,8 +12,6 @@
 #include "mpid_rma_shm.h"
 #include "mpid_rma_issue.h"
 
-int MPIDI_CH3I_Issue_rma_op(MPIDI_RMA_Op_t * op_ptr, MPID_Win * win_ptr,
-                            MPIDI_CH3_Pkt_flags_t flags);
 
 #undef FUNCNAME
 #define FUNCNAME send_lock_msg
@@ -29,8 +27,6 @@ static inline int send_lock_msg(int dest, int lock_type, MPID_Win * win_ptr)
     MPIDI_STATE_DECL(MPID_STATE_SEND_LOCK_MSG);
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_SEND_LOCK_MSG);
 
-    MPIU_Assert(win_ptr->targets[dest].remote_lock_state == MPIDI_CH3_WIN_LOCK_CALLED);
-
     MPIDI_Comm_get_vc_set_active(win_ptr->comm_ptr, dest, &vc);
 
     MPIDI_Pkt_init(lock_pkt, MPIDI_CH3_PKT_LOCK);
@@ -39,9 +35,6 @@ static inline int send_lock_msg(int dest, int lock_type, MPID_Win * win_ptr)
     lock_pkt->lock_type = lock_type;
     lock_pkt->origin_rank = win_ptr->comm_ptr->rank;
 
-    win_ptr->targets[dest].remote_lock_state = MPIDI_CH3_WIN_LOCK_REQUESTED;
-    win_ptr->targets[dest].remote_lock_mode = lock_type;
-
     MPIU_THREAD_CS_ENTER(CH3COMM, vc);
     mpi_errno = MPIDI_CH3_iStartMsg(vc, lock_pkt, sizeof(*lock_pkt), &req);
     MPIU_THREAD_CS_EXIT(CH3COMM, vc);
@@ -75,8 +68,6 @@ static inline int send_unlock_msg(int dest, MPID_Win * win_ptr)
     MPIDI_STATE_DECL(MPID_STATE_SEND_UNLOCK_MSG);
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_SEND_UNLOCK_MSG);
 
-    MPIU_Assert(win_ptr->targets[dest].remote_lock_state == MPIDI_CH3_WIN_LOCK_GRANTED);
-
     MPIDI_Comm_get_vc_set_active(win_ptr->comm_ptr, dest, &vc);
 
     /* Send a lock packet over to the target. wait for the lock_granted
@@ -86,9 +77,6 @@ static inline int send_unlock_msg(int dest, MPID_Win * win_ptr)
     unlock_pkt->target_win_handle = win_ptr->all_win_handles[dest];
     unlock_pkt->source_win_handle = win_ptr->handle;
 
-    /* Reset the local state of the target to unlocked */
-    win_ptr->targets[dest].remote_lock_state = MPIDI_CH3_WIN_LOCK_NONE;
-
     MPIU_THREAD_CS_ENTER(CH3COMM, vc);
     mpi_errno = MPIDI_CH3_iStartMsg(vc, unlock_pkt, sizeof(*unlock_pkt), &req);
     MPIU_THREAD_CS_EXIT(CH3COMM, vc);
@@ -307,9 +295,6 @@ static inline int acquire_local_lock(MPID_Win * win_ptr, int lock_type)
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
     }
 
-    win_ptr->targets[win_ptr->comm_ptr->rank].remote_lock_state = MPIDI_CH3_WIN_LOCK_GRANTED;
-    win_ptr->targets[win_ptr->comm_ptr->rank].remote_lock_mode = lock_type;
-
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ACQUIRE_LOCAL_LOCK);
     return mpi_errno;
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_pkt.c b/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
index dd91f39..35c4721 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
@@ -590,14 +590,6 @@ int MPIDI_CH3_PktHandler_Init( MPIDI_CH3_PktHandler_Fcn *pktArray[],
 	MPIDI_CH3_PktHandler_FlushAck;
     pktArray[MPIDI_CH3_PKT_DECR_AT_COUNTER] =
         MPIDI_CH3_PktHandler_DecrAtCnt;
-    pktArray[MPIDI_CH3_PKT_LOCK_PUT_UNLOCK] = 
-	MPIDI_CH3_PktHandler_LockPutUnlock;
-    pktArray[MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK] =
-	MPIDI_CH3_PktHandler_LockAccumUnlock;
-    pktArray[MPIDI_CH3_PKT_LOCK_GET_UNLOCK] = 
-	MPIDI_CH3_PktHandler_LockGetUnlock;
-    pktArray[MPIDI_CH3_PKT_ACCUM_IMMED] = 
-	MPIDI_CH3_PktHandler_Accumulate_Immed;
     pktArray[MPIDI_CH3_PKT_CAS] =
         MPIDI_CH3_PktHandler_CAS;
     pktArray[MPIDI_CH3_PKT_CAS_RESP] =
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 1eb93e2..44c8a71 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -8,8 +8,6 @@
 #include "mpidrma.h"
 
 static int create_derived_datatype(MPID_Request * rreq, MPID_Datatype ** dtp);
-static int do_simple_accumulate(MPIDI_PT_single_op *single_op);
-static int do_simple_get(MPID_Win *win_ptr, MPIDI_Win_lock_queue *lock_queue);
 
 #undef FUNCNAME
 #define FUNCNAME MPIDI_CH3U_Handle_recv_req
@@ -117,9 +115,6 @@ int MPIDI_CH3_ReqHandler_PutRecvComplete( MPIDI_VC_t *vc,
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         MPIDI_CH3_Progress_signal_completion();
     }
-    mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
-                                               rreq->dev.source_win_handle);
-    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
 
     /* mark data transfer as complete and decrement CC */
     MPIDI_CH3U_Request_complete(rreq);
@@ -192,10 +187,6 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete( MPIDI_VC_t *vc,
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     }
 
-    mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
-                                               rreq->dev.source_win_handle);
-    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-
     /* mark data transfer as complete and decrement CC */
     MPIDI_CH3U_Request_complete(rreq);
     *complete = TRUE;
@@ -513,6 +504,7 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((
 }
 
 
+
 #undef FUNCNAME
 #define FUNCNAME MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete
 #undef FCNAME
@@ -598,88 +590,6 @@ int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *vc,
     return mpi_errno;
 }
 
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_ReqHandler_SinglePutAccumComplete
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_SinglePutAccumComplete( MPIDI_VC_t *vc, 
-						 MPID_Request *rreq, 
-						 int *complete )
-{
-    int mpi_errno = MPI_SUCCESS;
-    MPID_Win *win_ptr;
-    MPIDI_Win_lock_queue *lock_queue_entry, *curr_ptr, **curr_ptr_ptr;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_SINGLEPUTACCUMCOMPLETE);
-    
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_SINGLEPUTACCUMCOMPLETE);
-
-    /* received all the data for single lock-put(accum)-unlock 
-       optimization where the lock was not acquired in 
-       ch3u_handle_recv_pkt. Try to acquire the lock and do the 
-       operation. */
-    
-    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
-    
-    lock_queue_entry = rreq->dev.lock_queue_entry;
-    
-    if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, 
-					lock_queue_entry->lock_type) == 1)
-    {
-	
-	if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_PT_SINGLE_PUT) {
-	    /* copy the data over */
-	    mpi_errno = MPIR_Localcopy(rreq->dev.user_buf,
-				       rreq->dev.user_count,
-				       rreq->dev.datatype,
-				       lock_queue_entry->pt_single_op->addr,
-				       lock_queue_entry->pt_single_op->count,
-				       lock_queue_entry->pt_single_op->datatype);
-	}
-	else {
-	    if (win_ptr->shm_allocated == TRUE)
-		MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-	    mpi_errno = do_simple_accumulate(lock_queue_entry->pt_single_op);
-	    if (win_ptr->shm_allocated == TRUE)
-		MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-	}
-	
-	if (mpi_errno) {
-	    MPIU_ERR_POP(mpi_errno);
-	}
-	
-	/* free lock_queue_entry including data buffer and remove 
-	   it from the queue. */
-	curr_ptr = (MPIDI_Win_lock_queue *) win_ptr->lock_queue;
-	curr_ptr_ptr = (MPIDI_Win_lock_queue **) &(win_ptr->lock_queue);
-	while (curr_ptr != lock_queue_entry) {
-	    curr_ptr_ptr = &(curr_ptr->next);
-	    curr_ptr = curr_ptr->next;
-	}                    
-	*curr_ptr_ptr = curr_ptr->next;
-	
-        mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE,
-                                                   lock_queue_entry->pt_single_op->flags,
-                                                   lock_queue_entry->source_win_handle);
-
-	MPIU_Free(lock_queue_entry->pt_single_op->data);
-	MPIU_Free(lock_queue_entry->pt_single_op);
-	MPIU_Free(lock_queue_entry);
-	
-        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-    }
-    else {
-	/* could not acquire lock. mark data recd as 1 */
-	lock_queue_entry->pt_single_op->data_recd = 1;
-    }
-    
-    /* mark data transfer as complete and decrement CC */
-    MPIDI_CH3U_Request_complete(rreq);
-    *complete = TRUE;
- fn_fail:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_SINGLEPUTACCUMCOMPLETE);
-    return mpi_errno;
-}
-
 
 #undef FUNCNAME
 #define FUNCNAME MPIDI_CH3_ReqHandler_UnpackUEBufComplete
@@ -1036,117 +946,3 @@ int MPIDI_CH3I_Release_lock(MPID_Win *win_ptr)
  fn_fail:
     goto fn_exit;
 }
-
-
-
-#undef FUNCNAME
-#define FUNCNAME do_simple_accumulate
-#undef FCNAME 
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int do_simple_accumulate(MPIDI_PT_single_op *single_op)
-{
-    int mpi_errno = MPI_SUCCESS;
-    MPI_User_function *uop;
-    MPIDI_STATE_DECL(MPID_STATE_DO_SIMPLE_ACCUMULATE);
-    
-    MPIDI_FUNC_ENTER(MPID_STATE_DO_SIMPLE_ACCUMULATE);
-
-    if (single_op->op == MPI_REPLACE)
-    {
-        /* simply copy the data */
-        mpi_errno = MPIR_Localcopy(single_op->data, single_op->count,
-                                   single_op->datatype, single_op->addr,
-                                   single_op->count, single_op->datatype);
-        if (mpi_errno) {
-	    MPIU_ERR_POP(mpi_errno);
-	}
-        goto fn_exit;
-    }
-
-    if (HANDLE_GET_KIND(single_op->op) == HANDLE_KIND_BUILTIN)
-    {
-        /* get the function by indexing into the op table */
-        uop = MPIR_OP_HDL_TO_FN(single_op->op);
-    }
-    else
-    {
-	/* --BEGIN ERROR HANDLING-- */
-        mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OP, "**opnotpredefined", "**opnotpredefined %d", single_op->op );
-        goto fn_fail;
-	/* --END ERROR HANDLING-- */
-    }
-    
-    /* only basic datatypes supported for this optimization. */
-    (*uop)(single_op->data, single_op->addr,
-           &(single_op->count), &(single_op->datatype));
-
- fn_fail:
- fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_DO_SIMPLE_ACCUMULATE);
-    return mpi_errno;
-}
-
-
-
-#undef FUNCNAME
-#define FUNCNAME do_simple_get
-#undef FCNAME 
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int do_simple_get(MPID_Win *win_ptr, MPIDI_Win_lock_queue *lock_queue)
-{
-    MPIDI_CH3_Pkt_t upkt;
-    MPIDI_CH3_Pkt_get_resp_t * get_resp_pkt = &upkt.get_resp;
-    MPID_Request *req;
-    MPID_IOV iov[MPID_IOV_LIMIT];
-    int mpi_errno=MPI_SUCCESS;
-    MPI_Aint type_size;
-    MPIDI_VC_t *vc;
-    MPIDI_STATE_DECL(MPID_STATE_DO_SIMPLE_GET);
-    
-    MPIDI_FUNC_ENTER(MPID_STATE_DO_SIMPLE_GET);
-
-    req = MPID_Request_create();
-    if (req == NULL) {
-        MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomemreq");
-    }
-    req->dev.target_win_handle = win_ptr->handle;
-    req->dev.source_win_handle = lock_queue->source_win_handle;
-    req->dev.flags = lock_queue->pt_single_op->flags;
-    
-    MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_RESP); 
-    req->kind = MPID_REQUEST_SEND;
-    req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetSendRespComplete;
-    req->dev.OnFinal     = MPIDI_CH3_ReqHandler_GetSendRespComplete;
-
-    /* here we increment the Active Target counter to guarantee the GET-like
-       operation are completed when counter reaches zero. */
-    win_ptr->at_completion_counter++;
-    
-    MPIDI_Pkt_init(get_resp_pkt, MPIDI_CH3_PKT_GET_RESP);
-    get_resp_pkt->request_handle = lock_queue->pt_single_op->request_handle;
-    
-    iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
-    iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
-    
-    iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)lock_queue->pt_single_op->addr;
-    MPID_Datatype_get_size_macro(lock_queue->pt_single_op->datatype, type_size);
-    iov[1].MPID_IOV_LEN = lock_queue->pt_single_op->count * type_size;
-    
-    MPIDI_Comm_get_vc(win_ptr->comm_ptr, lock_queue->origin_rank, &vc);
-    /* Because this is in a packet handler, it is already within a critical section */	
-    /* MPIU_THREAD_CS_ENTER(CH3COMM,vc); */
-    mpi_errno = MPIDI_CH3_iSendv(vc, req, iov, 2);
-    /* MPIU_THREAD_CS_EXIT(CH3COMM,vc); */
-    /* --BEGIN ERROR HANDLING-- */
-    if (mpi_errno != MPI_SUCCESS)
-    {
-        MPID_Request_release(req);
-	MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|rmamsg");
-    }
-    /* --END ERROR HANDLING-- */
-
- fn_fail:
-    MPIDI_FUNC_EXIT(MPID_STATE_DO_SIMPLE_GET);
-
-    return mpi_errno;
-}
diff --git a/src/mpid/ch3/src/ch3u_handle_send_req.c b/src/mpid/ch3/src/ch3u_handle_send_req.c
index 281be1c..07ecad2 100644
--- a/src/mpid/ch3/src/ch3u_handle_send_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_send_req.c
@@ -65,9 +65,6 @@ int MPIDI_CH3_ReqHandler_GetSendComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
             MPIDI_CH3_Progress_signal_completion();
     }
 
-    mpi_errno = MPIDI_CH3_Finish_rma_op_target(NULL, win_ptr, FALSE, sreq->dev.flags, MPI_WIN_NULL);
-    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-
     /* here we decrement the Active Target counter to guarantee the GET-like
        operation are completed when counter reaches zero. */
     win_ptr->at_completion_counter--;
diff --git a/src/mpid/ch3/src/ch3u_rma_oplist.c b/src/mpid/ch3/src/ch3u_rma_oplist.c
index fab643a..42f37f1 100644
--- a/src/mpid/ch3/src/ch3u_rma_oplist.c
+++ b/src/mpid/ch3/src/ch3u_rma_oplist.c
@@ -11,16 +11,6 @@
 === BEGIN_MPI_T_CVAR_INFO_BLOCK ===
 
 cvars:
-    - name        : MPIR_CVAR_CH3_RMA_ACC_IMMED
-      category    : CH3
-      type        : boolean
-      default     : true
-      class       : none
-      verbosity   : MPI_T_VERBOSITY_USER_BASIC
-      scope       : MPI_T_SCOPE_ALL_EQ
-      description : >-
-        Use the immediate accumulate optimization
-
     - name        : MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD
       category    : CH3
       type        : int
@@ -323,8 +313,7 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *targe
                                           &(target->dt_op_list_tail), curr_op);
             }
             else if (curr_op->pkt.type == MPIDI_CH3_PKT_PUT ||
-                     curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
-                     curr_op->pkt.type == MPIDI_CH3_PKT_ACCUM_IMMED) {
+                     curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE) {
                 MPIDI_CH3I_RMA_Ops_append(&(target->write_op_list),
                                           &(target->write_op_list_tail), curr_op);
             }
@@ -788,44 +777,3 @@ static int send_flush_msg(int dest, MPID_Win * win_ptr)
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
-
-
-int MPIDI_CH3I_Issue_rma_op(MPIDI_RMA_Op_t * op_ptr, MPID_Win * win_ptr,
-                            MPIDI_CH3_Pkt_flags_t flags)
-{
-    int mpi_errno = MPI_SUCCESS;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_ISSUE_RMA_OP);
-
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_ISSUE_RMA_OP);
-
-    switch (op_ptr->pkt.type) {
-    case (MPIDI_CH3_PKT_PUT):
-    case (MPIDI_CH3_PKT_ACCUMULATE):
-    case (MPIDI_CH3_PKT_GET_ACCUM):
-        mpi_errno = send_rma_msg(op_ptr, win_ptr, flags);
-        break;
-    case (MPIDI_CH3_PKT_ACCUM_IMMED):
-        mpi_errno = send_contig_acc_msg(op_ptr, win_ptr, flags);
-        break;
-    case (MPIDI_CH3_PKT_GET):
-        mpi_errno = recv_rma_msg(op_ptr, win_ptr, flags);
-        break;
-    case (MPIDI_CH3_PKT_CAS):
-    case (MPIDI_CH3_PKT_FOP):
-        mpi_errno = send_immed_rmw_msg(op_ptr, win_ptr, flags);
-        break;
-    default:
-        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**winInvalidOp");
-    }
-
-    if (mpi_errno)
-        MPIU_ERR_POP(mpi_errno);
-
-  fn_exit:
-    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_ISSUE_RMA_OP);
-    return mpi_errno;
-    /* --BEGIN ERROR HANDLING-- */
-  fn_fail:
-    goto fn_exit;
-    /* --END ERROR HANDLING-- */
-}
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 5a0b415..f915180 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -6,8 +6,6 @@
 
 #include "mpidrma.h"
 
-static int enableShortACC = 1;
-
 #define MPIDI_PASSIVE_TARGET_DONE_TAG  348297
 #define MPIDI_PASSIVE_TARGET_RMA_TAG 563924
 
@@ -407,40 +405,6 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-        /* If predefined and contiguous, use a simplified element */
-        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
-            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && enableShortACC) {
-            MPI_Aint origin_type_size;
-            size_t len;
-
-            MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
-            MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);
-            if (MPIR_CVAR_CH3_RMA_ACC_IMMED && len <= MPIDI_RMA_IMMED_INTS * sizeof(int)) {
-                MPIDI_CH3_Pkt_accum_immed_t *accumi_pkt;
-
-                accumi_pkt = &(new_ptr->pkt.accum_immed);
-                MPIDI_Pkt_init(accumi_pkt, MPIDI_CH3_PKT_ACCUM_IMMED);
-                accumi_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
-                    win_ptr->disp_units[target_rank] * target_disp;
-                accumi_pkt->count = target_count;
-                accumi_pkt->datatype = target_datatype;
-                accumi_pkt->op = op;
-                accumi_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
-                accumi_pkt->source_win_handle = win_ptr->handle;
-
-                new_ptr->origin_addr = (void *) origin_addr;
-                new_ptr->origin_count = origin_count;
-                new_ptr->origin_datatype = origin_datatype;
-                new_ptr->target_rank = target_rank;
-
-                mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
-                if (mpi_errno)
-                    MPIU_ERR_POP(mpi_errno);
-
-                goto issue_ops;
-            }
-        }
-
         accum_pkt = &(new_ptr->pkt.accum);
 
         MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
@@ -504,7 +468,6 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
             }
         }
 
- issue_ops:
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 2829d93..f292980 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -39,7 +39,6 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_Assert(put_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(put_pkt->target_win_handle, win_ptr);
-    mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, put_pkt->flags);
 
     data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
     data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
@@ -206,7 +205,6 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_Assert(get_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(get_pkt->target_win_handle, win_ptr);
-    mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, get_pkt->flags);
 
     mpi_errno = check_piggyback_lock(win_ptr, pkt, &acquire_lock_fail);
     if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
@@ -360,7 +358,6 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_Assert(accum_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(accum_pkt->target_win_handle, win_ptr);
-    mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, accum_pkt->flags);
 
     mpi_errno = check_piggyback_lock(win_ptr, pkt, &acquire_lock_fail);
     if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
@@ -536,7 +533,6 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_Assert(get_accum_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(get_accum_pkt->target_win_handle, win_ptr);
-    mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, get_accum_pkt->flags);
 
     mpi_errno = check_piggyback_lock(win_ptr, pkt, &acquire_lock_fail);
     if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
@@ -688,81 +684,6 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
 }
 
-/* Special accumulate for short data items entirely within the packet */
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_PktHandler_Accumulate_Immed
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_PktHandler_Accumulate_Immed(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
-                                          MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
-{
-    MPIDI_CH3_Pkt_accum_immed_t *accum_pkt = &pkt->accum_immed;
-    MPID_Win *win_ptr;
-    MPI_Aint extent;
-    int mpi_errno = MPI_SUCCESS;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE_IMMED);
-
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE_IMMED);
-
-    MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received accumulate immedidate pkt");
-
-    MPIU_Assert(accum_pkt->target_win_handle != MPI_WIN_NULL);
-    MPID_Win_get_ptr(accum_pkt->target_win_handle, win_ptr);
-    mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, accum_pkt->flags);
-
-    /* return the number of bytes processed in this function */
-    /* data_len == 0 (all within packet) */
-    *buflen = sizeof(MPIDI_CH3_Pkt_t);
-    *rreqp = NULL;
-
-    MPID_Datatype_get_extent_macro(accum_pkt->datatype, extent);
-
-    if (win_ptr->shm_allocated == TRUE)
-        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-    /* Data is already present */
-    if (accum_pkt->op == MPI_REPLACE) {
-        /* no datatypes required */
-        int len;
-        MPIU_Assign_trunc(len, (accum_pkt->count * extent), int);
-        /* FIXME: use immediate copy because this is short */
-        MPIUI_Memcpy(accum_pkt->addr, accum_pkt->data, len);
-    }
-    else {
-        if (HANDLE_GET_KIND(accum_pkt->op) == HANDLE_KIND_BUILTIN) {
-            MPI_User_function *uop;
-            /* get the function by indexing into the op table */
-            uop = MPIR_OP_HDL_TO_FN(accum_pkt->op);
-            (*uop) (accum_pkt->data, accum_pkt->addr, &(accum_pkt->count), &(accum_pkt->datatype));
-        }
-        else {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OP, "**opnotpredefined",
-                                 "**opnotpredefined %d", accum_pkt->op);
-        }
-    }
-    if (win_ptr->shm_allocated == TRUE)
-        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-
-    /* There are additional steps to take if this is a passive
-     * target RMA or the last operation from the source */
-
-    /* Here is the code executed in PutAccumRespComplete after the
-     * accumulation operation */
-    MPID_Win_get_ptr(accum_pkt->target_win_handle, win_ptr);
-
-    mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE,
-                                               accum_pkt->flags, accum_pkt->source_win_handle);
-    if (mpi_errno) {
-        MPIU_ERR_POP(mpi_errno);
-    }
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE_IMMED);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-
-}
-
 
 #undef FUNCNAME
 #define FUNCNAME MPIDI_CH3_PktHandler_CAS
@@ -787,7 +708,6 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_Assert(cas_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(cas_pkt->target_win_handle, win_ptr);
-    mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, cas_pkt->flags);
 
     /* return the number of bytes processed in this function */
     /* data_len == 0 (all within packet) */
@@ -868,14 +788,6 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             MPIDI_CH3_Progress_signal_completion();
     }
 
-    /* There are additional steps to take if this is a passive
-     * target RMA or the last operation from the source */
-
-    mpi_errno = MPIDI_CH3_Finish_rma_op_target(NULL, win_ptr, TRUE, cas_pkt->flags, MPI_WIN_NULL);
-    if (mpi_errno) {
-        MPIU_ERR_POP(mpi_errno);
-    }
-
   fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_CAS);
     return mpi_errno;
@@ -1203,368 +1115,6 @@ int MPIDI_CH3_PktHandler_Lock(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     return mpi_errno;
 }
 
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_PktHandler_LockPutUnlock
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_PktHandler_LockPutUnlock(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
-                                       MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
-{
-    MPIDI_CH3_Pkt_lock_put_unlock_t *lock_put_unlock_pkt = &pkt->lock_put_unlock;
-    MPID_Win *win_ptr = NULL;
-    MPID_Request *req = NULL;
-    MPI_Aint type_size;
-    int complete;
-    char *data_buf = NULL;
-    MPIDI_msg_sz_t data_len;
-    int mpi_errno = MPI_SUCCESS;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKPUTUNLOCK);
-
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKPUTUNLOCK);
-
-    MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received lock_put_unlock pkt");
-
-    data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
-    data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
-
-    req = MPID_Request_create();
-    MPIU_Object_set_ref(req, 1);
-
-    req->dev.datatype = lock_put_unlock_pkt->datatype;
-    MPID_Datatype_get_size_macro(lock_put_unlock_pkt->datatype, type_size);
-    req->dev.recv_data_sz = type_size * lock_put_unlock_pkt->count;
-    req->dev.user_count = lock_put_unlock_pkt->count;
-    req->dev.target_win_handle = lock_put_unlock_pkt->target_win_handle;
-
-    MPID_Win_get_ptr(lock_put_unlock_pkt->target_win_handle, win_ptr);
-
-    if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, lock_put_unlock_pkt->lock_type) == 1) {
-        /* do the put. for this optimization, only basic datatypes supported. */
-        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PUT_RESP);
-        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutAccumRespComplete;
-        req->dev.user_buf = lock_put_unlock_pkt->addr;
-        req->dev.source_win_handle = lock_put_unlock_pkt->source_win_handle;
-        req->dev.flags = lock_put_unlock_pkt->flags;
-    }
-
-    else {
-        /* queue the information */
-        MPIDI_Win_lock_queue *curr_ptr, *prev_ptr, *new_ptr;
-
-        new_ptr = (MPIDI_Win_lock_queue *) MPIU_Malloc(sizeof(MPIDI_Win_lock_queue));
-        if (!new_ptr) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                 "MPIDI_Win_lock_queue");
-        }
-
-        new_ptr->pt_single_op = (MPIDI_PT_single_op *) MPIU_Malloc(sizeof(MPIDI_PT_single_op));
-        if (new_ptr->pt_single_op == NULL) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                 "MPIDI_PT_single_op");
-        }
-
-        /* FIXME: MT: The queuing may need to be done atomically. */
-
-        curr_ptr = (MPIDI_Win_lock_queue *) win_ptr->lock_queue;
-        prev_ptr = curr_ptr;
-        while (curr_ptr != NULL) {
-            prev_ptr = curr_ptr;
-            curr_ptr = curr_ptr->next;
-        }
-
-        if (prev_ptr != NULL)
-            prev_ptr->next = new_ptr;
-        else
-            win_ptr->lock_queue = new_ptr;
-
-        new_ptr->next = NULL;
-        new_ptr->lock_type = lock_put_unlock_pkt->lock_type;
-        new_ptr->source_win_handle = lock_put_unlock_pkt->source_win_handle;
-        new_ptr->origin_rank = lock_put_unlock_pkt->origin_rank;
-
-        new_ptr->pt_single_op->type = MPIDI_CH3_PKT_LOCK_PUT_UNLOCK;
-        new_ptr->pt_single_op->flags = lock_put_unlock_pkt->flags;
-        new_ptr->pt_single_op->addr = lock_put_unlock_pkt->addr;
-        new_ptr->pt_single_op->count = lock_put_unlock_pkt->count;
-        new_ptr->pt_single_op->datatype = lock_put_unlock_pkt->datatype;
-        /* allocate memory to receive the data */
-        new_ptr->pt_single_op->data = MPIU_Malloc(req->dev.recv_data_sz);
-        if (new_ptr->pt_single_op->data == NULL) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                 req->dev.recv_data_sz);
-        }
-
-        new_ptr->pt_single_op->data_recd = 0;
-
-        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PT_SINGLE_PUT);
-        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_SinglePutAccumComplete;
-        req->dev.user_buf = new_ptr->pt_single_op->data;
-        req->dev.lock_queue_entry = new_ptr;
-    }
-
-    int (*fcn) (MPIDI_VC_t *, struct MPID_Request *, int *);
-    fcn = req->dev.OnDataAvail;
-    mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
-    if (mpi_errno != MPI_SUCCESS) {
-        MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
-                                  "**ch3|postrecv", "**ch3|postrecv %s",
-                                  "MPIDI_CH3_PKT_LOCK_PUT_UNLOCK");
-    }
-    req->dev.OnDataAvail = fcn;
-    *rreqp = req;
-
-    if (complete) {
-        mpi_errno = fcn(vc, req, &complete);
-        if (complete) {
-            *rreqp = NULL;
-        }
-    }
-
-    /* return the number of bytes processed in this function */
-    *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
-
-    if (mpi_errno != MPI_SUCCESS) {
-        MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
-                                  "**ch3|postrecv", "**ch3|postrecv %s",
-                                  "MPIDI_CH3_PKT_LOCK_PUT_UNLOCK");
-    }
-
-  fn_fail:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKPUTUNLOCK);
-    return mpi_errno;
-}
-
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_PktHandler_LockGetUnlock
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_PktHandler_LockGetUnlock(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
-                                       MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
-{
-    MPIDI_CH3_Pkt_lock_get_unlock_t *lock_get_unlock_pkt = &pkt->lock_get_unlock;
-    MPID_Win *win_ptr = NULL;
-    MPI_Aint type_size;
-    int mpi_errno = MPI_SUCCESS;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKGETUNLOCK);
-
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKGETUNLOCK);
-
-    MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received lock_get_unlock pkt");
-
-    *buflen = sizeof(MPIDI_CH3_Pkt_t);
-
-    MPID_Win_get_ptr(lock_get_unlock_pkt->target_win_handle, win_ptr);
-
-    if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, lock_get_unlock_pkt->lock_type) == 1) {
-        /* do the get. for this optimization, only basic datatypes supported. */
-        MPIDI_CH3_Pkt_t upkt;
-        MPIDI_CH3_Pkt_get_resp_t *get_resp_pkt = &upkt.get_resp;
-        MPID_Request *req;
-        MPID_IOV iov[MPID_IOV_LIMIT];
-
-        req = MPID_Request_create();
-        req->dev.target_win_handle = lock_get_unlock_pkt->target_win_handle;
-        req->dev.source_win_handle = lock_get_unlock_pkt->source_win_handle;
-        req->dev.flags = lock_get_unlock_pkt->flags;
-
-        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_RESP);
-        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetSendRespComplete;
-        req->dev.OnFinal = MPIDI_CH3_ReqHandler_GetSendRespComplete;
-        req->kind = MPID_REQUEST_SEND;
-
-        /* here we increment the Active Target counter to guarantee the GET-like
-           operation are completed when counter reaches zero. */
-        win_ptr->at_completion_counter++;
-
-        MPIDI_Pkt_init(get_resp_pkt, MPIDI_CH3_PKT_GET_RESP);
-        get_resp_pkt->request_handle = lock_get_unlock_pkt->request_handle;
-
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
-
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) lock_get_unlock_pkt->addr;
-        MPID_Datatype_get_size_macro(lock_get_unlock_pkt->datatype, type_size);
-        iov[1].MPID_IOV_LEN = lock_get_unlock_pkt->count * type_size;
-
-        mpi_errno = MPIDI_CH3_iSendv(vc, req, iov, 2);
-        /* --BEGIN ERROR HANDLING-- */
-        if (mpi_errno != MPI_SUCCESS) {
-            MPIU_Object_set_ref(req, 0);
-            MPIDI_CH3_Request_destroy(req);
-            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-        }
-        /* --END ERROR HANDLING-- */
-    }
-
-    else {
-        /* queue the information */
-        MPIDI_Win_lock_queue *curr_ptr, *prev_ptr, *new_ptr;
-
-        /* FIXME: MT: This may need to be done atomically. */
-
-        curr_ptr = (MPIDI_Win_lock_queue *) win_ptr->lock_queue;
-        prev_ptr = curr_ptr;
-        while (curr_ptr != NULL) {
-            prev_ptr = curr_ptr;
-            curr_ptr = curr_ptr->next;
-        }
-
-        new_ptr = (MPIDI_Win_lock_queue *) MPIU_Malloc(sizeof(MPIDI_Win_lock_queue));
-        if (!new_ptr) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                 "MPIDI_Win_lock_queue");
-        }
-        new_ptr->pt_single_op = (MPIDI_PT_single_op *) MPIU_Malloc(sizeof(MPIDI_PT_single_op));
-        if (new_ptr->pt_single_op == NULL) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                 "MPIDI_PT_Single_op");
-        }
-
-        if (prev_ptr != NULL)
-            prev_ptr->next = new_ptr;
-        else
-            win_ptr->lock_queue = new_ptr;
-
-        new_ptr->next = NULL;
-        new_ptr->lock_type = lock_get_unlock_pkt->lock_type;
-        new_ptr->source_win_handle = lock_get_unlock_pkt->source_win_handle;
-        new_ptr->origin_rank = lock_get_unlock_pkt->origin_rank;
-
-        new_ptr->pt_single_op->type = MPIDI_CH3_PKT_LOCK_GET_UNLOCK;
-        new_ptr->pt_single_op->flags = lock_get_unlock_pkt->flags;
-        new_ptr->pt_single_op->addr = lock_get_unlock_pkt->addr;
-        new_ptr->pt_single_op->count = lock_get_unlock_pkt->count;
-        new_ptr->pt_single_op->datatype = lock_get_unlock_pkt->datatype;
-        new_ptr->pt_single_op->data = NULL;
-        new_ptr->pt_single_op->request_handle = lock_get_unlock_pkt->request_handle;
-        new_ptr->pt_single_op->data_recd = 1;
-    }
-
-    *rreqp = NULL;
-
-  fn_fail:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKGETUNLOCK);
-    return mpi_errno;
-}
-
-
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_PktHandler_LockAccumUnlock
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_PktHandler_LockAccumUnlock(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
-                                         MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
-{
-    MPIDI_CH3_Pkt_lock_accum_unlock_t *lock_accum_unlock_pkt = &pkt->lock_accum_unlock;
-    MPID_Request *req = NULL;
-    MPID_Win *win_ptr = NULL;
-    MPIDI_Win_lock_queue *curr_ptr = NULL, *prev_ptr = NULL, *new_ptr = NULL;
-    MPI_Aint type_size;
-    int complete;
-    char *data_buf = NULL;
-    MPIDI_msg_sz_t data_len;
-    int mpi_errno = MPI_SUCCESS;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKACCUMUNLOCK);
-
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKACCUMUNLOCK);
-
-    MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received lock_accum_unlock pkt");
-
-    /* no need to acquire the lock here because we need to receive the
-     * data into a temporary buffer first */
-
-    data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
-    data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
-
-    req = MPID_Request_create();
-    MPIU_Object_set_ref(req, 1);
-
-    req->dev.datatype = lock_accum_unlock_pkt->datatype;
-    MPID_Datatype_get_size_macro(lock_accum_unlock_pkt->datatype, type_size);
-    req->dev.recv_data_sz = type_size * lock_accum_unlock_pkt->count;
-    req->dev.user_count = lock_accum_unlock_pkt->count;
-    req->dev.target_win_handle = lock_accum_unlock_pkt->target_win_handle;
-    req->dev.flags = lock_accum_unlock_pkt->flags;
-
-    /* queue the information */
-
-    new_ptr = (MPIDI_Win_lock_queue *) MPIU_Malloc(sizeof(MPIDI_Win_lock_queue));
-    if (!new_ptr) {
-        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                             "MPIDI_Win_lock_queue");
-    }
-
-    new_ptr->pt_single_op = (MPIDI_PT_single_op *) MPIU_Malloc(sizeof(MPIDI_PT_single_op));
-    if (new_ptr->pt_single_op == NULL) {
-        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                             "MPIDI_PT_single_op");
-    }
-
-    MPID_Win_get_ptr(lock_accum_unlock_pkt->target_win_handle, win_ptr);
-
-    /* FIXME: MT: The queuing may need to be done atomically. */
-
-    curr_ptr = (MPIDI_Win_lock_queue *) win_ptr->lock_queue;
-    prev_ptr = curr_ptr;
-    while (curr_ptr != NULL) {
-        prev_ptr = curr_ptr;
-        curr_ptr = curr_ptr->next;
-    }
-
-    if (prev_ptr != NULL)
-        prev_ptr->next = new_ptr;
-    else
-        win_ptr->lock_queue = new_ptr;
-
-    new_ptr->next = NULL;
-    new_ptr->lock_type = lock_accum_unlock_pkt->lock_type;
-    new_ptr->source_win_handle = lock_accum_unlock_pkt->source_win_handle;
-    new_ptr->origin_rank = lock_accum_unlock_pkt->origin_rank;
-
-    new_ptr->pt_single_op->type = MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK;
-    new_ptr->pt_single_op->flags = lock_accum_unlock_pkt->flags;
-    new_ptr->pt_single_op->addr = lock_accum_unlock_pkt->addr;
-    new_ptr->pt_single_op->count = lock_accum_unlock_pkt->count;
-    new_ptr->pt_single_op->datatype = lock_accum_unlock_pkt->datatype;
-    new_ptr->pt_single_op->op = lock_accum_unlock_pkt->op;
-    /* allocate memory to receive the data */
-    new_ptr->pt_single_op->data = MPIU_Malloc(req->dev.recv_data_sz);
-    if (new_ptr->pt_single_op->data == NULL) {
-        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                             req->dev.recv_data_sz);
-    }
-
-    new_ptr->pt_single_op->data_recd = 0;
-
-    MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PT_SINGLE_ACCUM);
-    req->dev.user_buf = new_ptr->pt_single_op->data;
-    req->dev.lock_queue_entry = new_ptr;
-
-    *rreqp = req;
-    mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
-    /* FIXME:  Only change the handling of completion if
-     * post_data_receive reset the handler.  There should
-     * be a cleaner way to do this */
-    if (!req->dev.OnDataAvail) {
-        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_SinglePutAccumComplete;
-    }
-    if (mpi_errno != MPI_SUCCESS) {
-        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
-                      "**ch3|postrecv %s", "MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK");
-    }
-    /* return the number of bytes processed in this function */
-    *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
-
-    if (complete) {
-        mpi_errno = MPIDI_CH3_ReqHandler_SinglePutAccumComplete(vc, req, &complete);
-        if (complete) {
-            *rreqp = NULL;
-        }
-    }
-  fn_fail:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKACCUMUNLOCK);
-    return mpi_errno;
-}
 
 #undef FUNCNAME
 #define FUNCNAME MPIDI_CH3_PktHandler_GetResp
@@ -1651,8 +1201,6 @@ int MPIDI_CH3_PktHandler_LockGranted(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     *buflen = sizeof(MPIDI_CH3_Pkt_t);
 
     MPID_Win_get_ptr(lock_granted_pkt->source_win_handle, win_ptr);
-    /* set the remote_lock_state flag in the window */
-    win_ptr->targets[lock_granted_pkt->target_rank].remote_lock_state = MPIDI_CH3_WIN_LOCK_GRANTED;
 
     mpi_errno = set_lock_sync_counter(win_ptr, target_rank);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
@@ -1697,16 +1245,6 @@ int MPIDI_CH3_PktHandler_FlushAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
-    MPIU_Assert(win_ptr->targets[flush_ack_pkt->target_rank].remote_lock_state !=
-                MPIDI_CH3_WIN_LOCK_NONE);
-
-    if (win_ptr->targets[flush_ack_pkt->target_rank].remote_lock_state ==
-        MPIDI_CH3_WIN_LOCK_FLUSH)
-        win_ptr->targets[flush_ack_pkt->target_rank].remote_lock_state =
-            MPIDI_CH3_WIN_LOCK_GRANTED;
-    else
-        win_ptr->targets[flush_ack_pkt->target_rank].remote_lock_state = MPIDI_CH3_WIN_LOCK_NONE;
-
     *rreqp = NULL;
     MPIDI_CH3_Progress_signal_completion();
 
@@ -1813,33 +1351,6 @@ int MPIDI_CH3_PktHandler_Flush(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                                               flush_pkt->source_win_handle);
     if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-    /* This is a flush request packet */
-    if (flush_pkt->target_win_handle != MPI_WIN_NULL) {
-        MPID_Request *req = NULL;
-
-        MPID_Win_get_ptr(flush_pkt->target_win_handle, win_ptr);
-
-        flush_pkt->target_win_handle = MPI_WIN_NULL;
-        flush_pkt->target_rank = win_ptr->comm_ptr->rank;
-
-        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno = MPIDI_CH3_iStartMsg(vc, flush_pkt, sizeof(*flush_pkt), &req);
-        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-        MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
-
-        /* Release the request returned by iStartMsg */
-        if (req != NULL) {
-            MPID_Request_release(req);
-        }
-    }
-
-    /* This is a flush response packet */
-    else {
-        MPID_Win_get_ptr(flush_pkt->source_win_handle, win_ptr);
-        win_ptr->targets[flush_pkt->target_rank].remote_lock_state = MPIDI_CH3_WIN_LOCK_GRANTED;
-        MPIDI_CH3_Progress_signal_completion();
-    }
-
   fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_FLUSH);
     return mpi_errno;
@@ -1850,139 +1361,6 @@ int MPIDI_CH3_PktHandler_Flush(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 }
 
 
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_Start_rma_op_target
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_Start_rma_op_target(MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags)
-{
-    int mpi_errno = MPI_SUCCESS;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_START_RMA_OP_TARGET);
-
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_START_RMA_OP_TARGET);
-
-    /* Lock with NOCHECK is piggybacked on this message.  We should be able to
-     * immediately grab the lock.  Otherwise, there is a synchronization error. */
-    if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK && flags & MPIDI_CH3_PKT_FLAG_RMA_NOCHECK) {
-        int lock_acquired;
-        int lock_mode;
-
-        if (flags & MPIDI_CH3_PKT_FLAG_RMA_SHARED) {
-            lock_mode = MPI_LOCK_SHARED;
-        }
-        else if (flags & MPIDI_CH3_PKT_FLAG_RMA_EXCLUSIVE) {
-            lock_mode = MPI_LOCK_EXCLUSIVE;
-        }
-        else {
-            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_RMA_SYNC, "**ch3|rma_flags");
-        }
-
-        lock_acquired = MPIDI_CH3I_Try_acquire_win_lock(win_ptr, lock_mode);
-        MPIU_ERR_CHKANDJUMP(!lock_acquired, mpi_errno, MPI_ERR_RMA_SYNC, "**ch3|nocheck_invalid");
-    }
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_START_RMA_OP_TARGET);
-    return mpi_errno;
-    /* --BEGIN ERROR HANDLING-- */
-  fn_fail:
-    goto fn_exit;
-    /* --END ERROR HANDLING-- */
-}
-
-
-#undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_Finish_rma_op_target
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_Finish_rma_op_target(MPIDI_VC_t * vc, MPID_Win * win_ptr, int is_rma_update,
-                                   MPIDI_CH3_Pkt_flags_t flags, MPI_Win source_win_handle)
-{
-    int mpi_errno = MPI_SUCCESS;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_FINISH_RMA_OP_TARGET);
-
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_FINISH_RMA_OP_TARGET);
-
-    /* This function should be called by the target process after each RMA
-     * operation is completed, to update synchronization state. */
-
-    /* Last RMA operation from source. If active target RMA, decrement window
-     * counter. */
-    if (flags & MPIDI_CH3_PKT_FLAG_RMA_AT_COMPLETE) {
-        MPIU_Assert(win_ptr->current_lock_type == MPID_LOCK_NONE);
-
-        win_ptr->at_completion_counter -= 1;
-        MPIU_Assert(win_ptr->at_completion_counter >= 0);
-
-        /* Signal the local process when the op counter reaches 0. */
-        if (win_ptr->at_completion_counter == 0)
-            MPIDI_CH3_Progress_signal_completion();
-    }
-
-    /* If passive target RMA, release lock on window and grant next lock in the
-     * lock queue if there is any.  If requested by the origin, send an ack back
-     * to indicate completion at the target. */
-    else if (flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
-        MPIU_Assert(win_ptr->current_lock_type != MPID_LOCK_NONE);
-
-        if (flags & MPIDI_CH3_PKT_FLAG_RMA_REQ_ACK) {
-            MPIU_Assert(source_win_handle != MPI_WIN_NULL && vc != NULL);
-            mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, source_win_handle);
-            if (mpi_errno) {
-                MPIU_ERR_POP(mpi_errno);
-            }
-        }
-
-        mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
-        }
-
-        /* The local process may be waiting for the lock.  Signal completion to
-         * wake it up, so it can attempt to grab the lock. */
-        MPIDI_CH3_Progress_signal_completion();
-    }
-    else if (flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) {
-        /* Ensure store instructions have been performed before flush call is
-         * finished on origin process. */
-        OPA_read_write_barrier();
-
-        if (flags & MPIDI_CH3_PKT_FLAG_RMA_REQ_ACK) {
-            MPIDI_CH3_Pkt_t upkt;
-            MPIDI_CH3_Pkt_flush_t *flush_pkt = &upkt.flush;
-            MPID_Request *req = NULL;
-
-            MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received piggybacked flush request");
-            MPIU_Assert(source_win_handle != MPI_WIN_NULL && vc != NULL);
-
-            MPIDI_Pkt_init(flush_pkt, MPIDI_CH3_PKT_FLUSH);
-            flush_pkt->source_win_handle = source_win_handle;
-            flush_pkt->target_win_handle = MPI_WIN_NULL;
-            flush_pkt->target_rank = win_ptr->comm_ptr->rank;
-
-            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-            mpi_errno = MPIDI_CH3_iStartMsg(vc, flush_pkt, sizeof(*flush_pkt), &req);
-            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-            MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER,
-                                "**ch3|rma_msg");
-
-            /* Release the request returned by iStartMsg */
-            if (req != NULL) {
-                MPID_Request_release(req);
-            }
-        }
-    }
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_FINISH_RMA_OP_TARGET);
-    return mpi_errno;
-    /* --BEGIN ERROR HANDLING-- */
-  fn_fail:
-    goto fn_exit;
-    /* --END ERROR HANDLING-- */
-}
-
-
 /* ------------------------------------------------------------------------ */
 /*
  * For debugging, we provide the following functions for printing the
@@ -2042,20 +1420,6 @@ int MPIDI_CH3_PktPrint_Accumulate(FILE * fp, MPIDI_CH3_Pkt_t * pkt)
     return MPI_SUCCESS;
 }
 
-int MPIDI_CH3_PktPrint_Accum_Immed(FILE * fp, MPIDI_CH3_Pkt_t * pkt)
-{
-    MPIU_DBG_PRINTF((" type ......... MPIDI_CH3_PKT_ACCUM_IMMED\n"));
-    MPIU_DBG_PRINTF((" addr ......... %p\n", pkt->accum_immed.addr));
-    MPIU_DBG_PRINTF((" count ........ %d\n", pkt->accum_immed.count));
-    MPIU_DBG_PRINTF((" datatype ..... 0x%08X\n", pkt->accum_immed.datatype));
-    MPIU_DBG_PRINTF((" op ........... 0x%08X\n", pkt->accum_immed.op));
-    MPIU_DBG_PRINTF((" target ....... 0x%08X\n", pkt->accum_immed.target_win_handle));
-    MPIU_DBG_PRINTF((" source ....... 0x%08X\n", pkt->accum_immed.source_win_handle));
-    /*MPIU_DBG_PRINTF((" win_ptr ...... 0x%08X\n", pkt->accum.win_ptr)); */
-    fflush(stdout);
-    return MPI_SUCCESS;
-}
-
 int MPIDI_CH3_PktPrint_Lock(FILE * fp, MPIDI_CH3_Pkt_t * pkt)
 {
     MPIU_DBG_PRINTF((" type ......... MPIDI_CH3_PKT_LOCK\n"));
@@ -2065,43 +1429,6 @@ int MPIDI_CH3_PktPrint_Lock(FILE * fp, MPIDI_CH3_Pkt_t * pkt)
     return MPI_SUCCESS;
 }
 
-int MPIDI_CH3_PktPrint_LockPutUnlock(FILE * fp, MPIDI_CH3_Pkt_t * pkt)
-{
-    MPIU_DBG_PRINTF((" type ......... MPIDI_CH3_PKT_LOCK_PUT_UNLOCK\n"));
-    MPIU_DBG_PRINTF((" addr ......... %p\n", pkt->lock_put_unlock.addr));
-    MPIU_DBG_PRINTF((" count ........ %d\n", pkt->lock_put_unlock.count));
-    MPIU_DBG_PRINTF((" datatype ..... 0x%08X\n", pkt->lock_put_unlock.datatype));
-    MPIU_DBG_PRINTF((" lock_type .... %d\n", pkt->lock_put_unlock.lock_type));
-    MPIU_DBG_PRINTF((" target ....... 0x%08X\n", pkt->lock_put_unlock.target_win_handle));
-    MPIU_DBG_PRINTF((" source ....... 0x%08X\n", pkt->lock_put_unlock.source_win_handle));
-    return MPI_SUCCESS;
-}
-
-int MPIDI_CH3_PktPrint_LockAccumUnlock(FILE * fp, MPIDI_CH3_Pkt_t * pkt)
-{
-    MPIU_DBG_PRINTF((" type ......... MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK\n"));
-    MPIU_DBG_PRINTF((" addr ......... %p\n", pkt->lock_accum_unlock.addr));
-    MPIU_DBG_PRINTF((" count ........ %d\n", pkt->lock_accum_unlock.count));
-    MPIU_DBG_PRINTF((" datatype ..... 0x%08X\n", pkt->lock_accum_unlock.datatype));
-    MPIU_DBG_PRINTF((" lock_type .... %d\n", pkt->lock_accum_unlock.lock_type));
-    MPIU_DBG_PRINTF((" target ....... 0x%08X\n", pkt->lock_accum_unlock.target_win_handle));
-    MPIU_DBG_PRINTF((" source ....... 0x%08X\n", pkt->lock_accum_unlock.source_win_handle));
-    return MPI_SUCCESS;
-}
-
-int MPIDI_CH3_PktPrint_LockGetUnlock(FILE * fp, MPIDI_CH3_Pkt_t * pkt)
-{
-    MPIU_DBG_PRINTF((" type ......... MPIDI_CH3_PKT_LOCK_GET_UNLOCK\n"));
-    MPIU_DBG_PRINTF((" addr ......... %p\n", pkt->lock_get_unlock.addr));
-    MPIU_DBG_PRINTF((" count ........ %d\n", pkt->lock_get_unlock.count));
-    MPIU_DBG_PRINTF((" datatype ..... 0x%08X\n", pkt->lock_get_unlock.datatype));
-    MPIU_DBG_PRINTF((" lock_type .... %d\n", pkt->lock_get_unlock.lock_type));
-    MPIU_DBG_PRINTF((" target ....... 0x%08X\n", pkt->lock_get_unlock.target_win_handle));
-    MPIU_DBG_PRINTF((" source ....... 0x%08X\n", pkt->lock_get_unlock.source_win_handle));
-    MPIU_DBG_PRINTF((" request ...... 0x%08X\n", pkt->lock_get_unlock.request_handle));
-    return MPI_SUCCESS;
-}
-
 int MPIDI_CH3_PktPrint_FlushAck(FILE * fp, MPIDI_CH3_Pkt_t * pkt)
 {
     MPIU_DBG_PRINTF((" type ......... MPIDI_CH3_PKT_FLUSH_ACK\n"));
diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index be20bd2..3a5a9f0 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -7,121 +7,6 @@
 #include "mpidimpl.h"
 #include "mpidrma.h"
 
-/*
-=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
-
-categories:
-    - name        : CH3
-      description : cvars that control behavior of ch3
-
-cvars:
-    - name        : MPIR_CVAR_CH3_RMA_NREQUEST_THRESHOLD
-      category    : CH3
-      type        : int
-      default     : 4000
-      class       : none
-      verbosity   : MPI_T_VERBOSITY_USER_BASIC
-      scope       : MPI_T_SCOPE_ALL_EQ
-      description : >-
-        Threshold at which the RMA implementation attempts to complete requests
-        while completing RMA operations and while using the lazy synchonization
-        approach.  Change this value if programs fail because they run out of
-        requests or other internal resources
-
-    - name        : MPIR_CVAR_CH3_RMA_NREQUEST_NEW_THRESHOLD
-      category    : CH3
-      type        : int
-      default     : 0
-      class       : none
-      verbosity   : MPI_T_VERBOSITY_USER_BASIC
-      scope       : MPI_T_SCOPE_ALL_EQ
-      description : >-
-        Threshold for the number of new requests since the last attempt to
-        complete pending requests.  Higher values can increase performance,
-        but may run the risk of exceeding the available number of requests
-        or other internal resources.
-
-    - name        : MPIR_CVAR_CH3_RMA_GC_NUM_COMPLETED
-      category    : CH3
-      type        : int
-      default     : (-1)
-      class       : none
-      verbosity   : MPI_T_VERBOSITY_USER_BASIC
-      scope       : MPI_T_SCOPE_ALL_EQ
-      description : >-
-        Threshold for the number of completed requests the runtime finds
-        before it stops trying to find more completed requests in garbage
-        collection function.
-        Note that it works with MPIR_CVAR_CH3_RMA_GC_NUM_TESTED as an OR
-        relation, which means runtime will stop checking when either one
-        of its following conditions is satisfied or one of conditions of
-        MPIR_CVAR_CH3_RMA_GC_NUM_TESTED is satisfied.
-        When it is set to negative value, it means runtime will not stop
-        checking the operation list until it reaches the end of the list.
-        When it is set to positive value, it means runtime will not stop
-        checking the operation list until it finds certain number of
-        completed requests. When it is set to zero value, the outcome is
-        undefined.
-        Note that in garbage collection function, if runtime finds a chain
-        of completed RMA requests, it will temporarily ignore this CVAR
-        and try to find continuous completed requests as many as possible,
-        until it meets an incomplete request.
-
-    - name        : MPIR_CVAR_CH3_RMA_GC_NUM_TESTED
-      category    : CH3
-      type        : int
-      default     : 100
-      class       : none
-      verbosity   : MPI_T_VERBOSITY_USER_BASIC
-      scope       : MPI_T_SCOPE_ALL_EQ
-      description : >-
-        Threshold for the number of RMA requests the runtime tests before
-        it stops trying to check more requests in garbage collection
-        routine.
-        Note that it works with MPIR_CVAR_CH3_RMA_GC_NUM_COMPLETED as an
-        OR relation, which means runtime will stop checking when either
-        one of its following conditions is satisfied or one of conditions
-        of MPIR_CVAR_CH3_RMA_GC_NUM_COMPLETED is satisfied.
-        When it is set to negative value, runtime will not stop checking
-        operation list until runtime reaches the end of the list. It has
-        the risk of O(N) traversing overhead if there is no completed
-        request in the list. When it is set to positive value, it means
-        runtime will not stop checking the operation list until it visits
-        such number of requests. Higher values may make more completed
-        requests to be found, but it has the risk of visiting too many
-        requests, leading to significant performance overhead. When it is
-        set to zero value, runtime will stop checking the operation list
-        immediately, which may cause weird performance in practice.
-        Note that in garbage collection function, if runtime finds a chain
-        of completed RMA requests, it will temporarily ignore this CVAR and
-        try to find continuous completed requests as many as possible, until
-        it meets an incomplete request.
-
-    - name        : MPIR_CVAR_CH3_RMA_LOCK_IMMED
-      category    : CH3
-      type        : boolean
-      default     : false
-      class       : none
-      verbosity   : MPI_T_VERBOSITY_USER_BASIC
-      scope       : MPI_T_SCOPE_ALL_EQ
-      description : >-
-        Issue a request for the passive target RMA lock immediately.  Default
-        behavior is to defer the lock request until the call to MPI_Win_unlock.
-
-    - name        : MPIR_CVAR_CH3_RMA_MERGE_LOCK_OP_UNLOCK
-      category    : CH3
-      type        : boolean
-      default     : true
-      class       : none
-      verbosity   : MPI_T_VERBOSITY_USER_BASIC
-      scope       : MPI_T_SCOPE_ALL_EQ
-      description : >-
-        Enable/disable an optimization that merges lock, op, and unlock
-        messages, for single-operation passive target epochs.
-
-=== END_MPI_T_CVAR_INFO_BLOCK ===
-*/
-
 /* Notes for memory barriers in RMA synchronizations
 
    When SHM is allocated for RMA window, we need to add memory berriers at proper
@@ -344,18 +229,6 @@ void MPIDI_CH3_RMA_Init_Pvars(void)
 
 #define SYNC_POST_TAG 100
 
-static int wait_for_lock_granted(MPID_Win * win_ptr, int target_rank);
-static int do_passive_target_rma(MPID_Win * win_ptr, int target_rank,
-                                 int *wait_for_rma_done_pkt, MPIDI_CH3_Pkt_flags_t sync_flags);
-static int send_lock_put_or_acc(MPID_Win *, int);
-static int send_lock_get(MPID_Win *, int);
-static inline int rma_list_complete(MPID_Win * win_ptr, MPIDI_RMA_Ops_list_t * ops_list,
-                                    MPIDI_RMA_Ops_list_t * ops_list_tail);
-static inline int rma_list_gc(MPID_Win * win_ptr,
-                              MPIDI_RMA_Ops_list_t * ops_list,
-                              MPIDI_RMA_Ops_list_t * ops_list_tail,
-                              MPIDI_RMA_Op_t * last_elm, int *nDone);
-
 
 #undef FUNCNAME
 #define FUNCNAME MPIDI_Win_fence
@@ -1688,196 +1561,6 @@ int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
 
 
 #undef FUNCNAME
-#define FUNCNAME do_passive_target_rma
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int do_passive_target_rma(MPID_Win * win_ptr, int target_rank,
-                                 int *wait_for_rma_done_pkt, MPIDI_CH3_Pkt_flags_t sync_flags)
-{
-    int mpi_errno = MPI_SUCCESS, nops;
-    MPIDI_RMA_Op_t *curr_ptr;
-    int nRequest = 0, nRequestNew = 0;
-    MPIDI_STATE_DECL(MPID_STATE_DO_PASSIVE_TARGET_RMA);
-
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_DO_PASSIVE_TARGET_RMA);
-
-    MPIU_Assert(win_ptr->targets[target_rank].remote_lock_state == MPIDI_CH3_WIN_LOCK_GRANTED ||
-                win_ptr->targets[target_rank].remote_lock_state == MPIDI_CH3_WIN_LOCK_FLUSH ||
-                (win_ptr->targets[target_rank].remote_lock_state == MPIDI_CH3_WIN_LOCK_CALLED &&
-                 win_ptr->targets[target_rank].remote_lock_assert & MPI_MODE_NOCHECK));
-
-    if (MPIDI_CH3I_RMA_Ops_isempty(&win_ptr->targets[target_rank].rma_ops_list)) {
-        /* The ops list is empty -- NOTE: we assume this is because the epoch
-         * was flushed.  Any issued ops are already remote complete; done
-         * packet is not needed for safe third party communication. */
-        *wait_for_rma_done_pkt = 0;
-    }
-    else {
-        MPIDI_RMA_Op_t *tail = MPIDI_CH3I_RMA_Ops_tail(&win_ptr->targets[target_rank].rma_ops_list_tail);
-
-        /* Check if we can piggyback the RMA done acknowlegdement on the last
-         * operation in the epoch. */
-
-        if (tail->pkt.type == MPIDI_CH3_PKT_GET ||
-            tail->pkt.type == MPIDI_CH3_PKT_CAS ||
-            tail->pkt.type == MPIDI_CH3_PKT_FOP || tail->pkt.type == MPIDI_CH3_PKT_GET_ACCUM) {
-            /* last operation sends a response message. no need to wait
-             * for an additional rma done pkt */
-            *wait_for_rma_done_pkt = 0;
-        }
-        else {
-            /* Check if there is a get operation, which can be be performed
-             * moved to the end to piggyback the RMA done acknowledgement.  Go
-             * through the list and move the first get operation (if there is
-             * one) to the end. */
-
-            *wait_for_rma_done_pkt = 1;
-            curr_ptr = MPIDI_CH3I_RMA_Ops_head(&win_ptr->targets[target_rank].rma_ops_list);
-
-            while (curr_ptr != NULL) {
-                if (curr_ptr->pkt.type == MPIDI_CH3_PKT_GET) {
-                    /* Found a GET, move it to the end */
-                    *wait_for_rma_done_pkt = 0;
-
-                    MPIDI_CH3I_RMA_Ops_unlink(&win_ptr->targets[target_rank].rma_ops_list,
-                                              &win_ptr->targets[target_rank].rma_ops_list_tail,
-                                              curr_ptr);
-                    MPIDI_CH3I_RMA_Ops_append(&win_ptr->targets[target_rank].rma_ops_list,
-                                              &win_ptr->targets[target_rank].rma_ops_list_tail,
-                                              curr_ptr);
-                    break;
-                }
-                else {
-                    curr_ptr = curr_ptr->next;
-                }
-            }
-        }
-    }
-
-    curr_ptr = MPIDI_CH3I_RMA_Ops_head(&win_ptr->targets[target_rank].rma_ops_list);
-
-    nops = 0;
-    while (curr_ptr != NULL) {
-        nops++;
-        curr_ptr = curr_ptr->next;
-    }
-
-    curr_ptr = MPIDI_CH3I_RMA_Ops_head(&win_ptr->targets[target_rank].rma_ops_list);
-
-    while (curr_ptr != NULL) {
-        MPIDI_CH3_Pkt_flags_t flags = MPIDI_CH3_PKT_FLAG_NONE;
-
-        /* Assertion: (curr_ptr != NULL) => (nops > 0) */
-        MPIU_Assert(nops > 0);
-        MPIU_Assert(curr_ptr->target_rank == target_rank);
-
-        /* Piggyback the lock operation on the first op */
-        if (win_ptr->targets[target_rank].remote_lock_state == MPIDI_CH3_WIN_LOCK_CALLED) {
-            MPIU_Assert(win_ptr->targets[target_rank].remote_lock_assert & MPI_MODE_NOCHECK);
-            flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK | MPIDI_CH3_PKT_FLAG_RMA_NOCHECK;
-
-            switch (win_ptr->targets[target_rank].remote_lock_mode) {
-            case MPI_LOCK_SHARED:
-                flags |= MPIDI_CH3_PKT_FLAG_RMA_SHARED;
-                break;
-            case MPI_LOCK_EXCLUSIVE:
-                flags |= MPIDI_CH3_PKT_FLAG_RMA_EXCLUSIVE;
-                break;
-            default:
-                MPIU_Assert(0);
-                break;
-            }
-
-            win_ptr->targets[target_rank].remote_lock_state = MPIDI_CH3_WIN_LOCK_GRANTED;
-        }
-
-        /* Piggyback the unlock/flush operation on the last op */
-        if (curr_ptr->next == NULL) {
-            if (sync_flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
-                flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK;
-            }
-            else if (sync_flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) {
-                flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
-            }
-            else {
-                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_RMA_SYNC, "**ch3|sync_arg",
-                                     "**ch3|sync_arg %d", sync_flags);
-            }
-
-            /* Inform the target that we want an acknowledgement when the
-             * unlock has completed. */
-            if (*wait_for_rma_done_pkt) {
-                flags |= MPIDI_CH3_PKT_FLAG_RMA_REQ_ACK;
-            }
-        }
-
-        mpi_errno = MPIDI_CH3I_Issue_rma_op(curr_ptr, win_ptr, flags);
-        if (mpi_errno)
-            MPIU_ERR_POP(mpi_errno);
-
-        /* If the request is null, we can remove it immediately */
-        if (!curr_ptr->request) {
-            MPIDI_CH3I_RMA_Ops_free_and_next(win_ptr, &win_ptr->targets[target_rank].rma_ops_list,
-                                             &win_ptr->targets[target_rank].rma_ops_list_tail,
-                                             &curr_ptr);
-        }
-        else {
-            nRequest++;
-            curr_ptr = curr_ptr->next;
-            if (nRequest > MPIR_CVAR_CH3_RMA_NREQUEST_THRESHOLD &&
-                nRequest - nRequestNew > MPIR_CVAR_CH3_RMA_NREQUEST_NEW_THRESHOLD) {
-                int nDone = 0;
-                mpi_errno = poke_progress_engine();
-                if (mpi_errno != MPI_SUCCESS)
-                    MPIU_ERR_POP(mpi_errno);
-                mpi_errno =
-                    rma_list_gc(win_ptr, &win_ptr->targets[target_rank].rma_ops_list,
-                                &win_ptr->targets[target_rank].rma_ops_list_tail, curr_ptr,
-                                &nDone);
-                if (mpi_errno != MPI_SUCCESS)
-                    MPIU_ERR_POP(mpi_errno);
-                /* if (nDone > 0) printf("nDone = %d\n", nDone); */
-                nRequest -= nDone;
-                nRequestNew = nRequest;
-            }
-        }
-    }
-
-    if (nops) {
-        mpi_errno = rma_list_complete(win_ptr, &win_ptr->targets[target_rank].rma_ops_list,
-                                      &win_ptr->targets[target_rank].rma_ops_list_tail);
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
-    }
-    else if (sync_flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
-        /* No communication operations were left to process, but the RMA epoch
-         * is open.  Send an unlock message to release the lock at the target.  */
-        mpi_errno = send_unlock_msg(target_rank, win_ptr);
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
-        }
-        *wait_for_rma_done_pkt = 1;
-    }
-    /* NOTE: Flush -- If RMA ops are issued eagerly, Send_flush_msg should be
-     * called here and wait_for_rma_done_pkt should be set. */
-
-    /* MT: avoid processing unissued operations enqueued by other threads
-       in rma_list_complete() */
-    curr_ptr = MPIDI_CH3I_RMA_Ops_head(&win_ptr->targets[target_rank].rma_ops_list);
-    if (curr_ptr && !curr_ptr->request)
-        goto fn_exit;
-    MPIU_Assert(MPIDI_CH3I_RMA_Ops_isempty(&win_ptr->targets[target_rank].rma_ops_list));
-
-  fn_exit:
-    MPIDI_RMA_FUNC_EXIT(MPID_STATE_DO_PASSIVE_TARGET_RMA);
-    return mpi_errno;
-    /* --BEGIN ERROR HANDLING-- */
-  fn_fail:
-    goto fn_exit;
-    /* --END ERROR HANDLING-- */
-}
-
-#undef FUNCNAME
 #define FUNCNAME MPIDI_Win_sync
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
@@ -1904,484 +1587,3 @@ int MPIDI_Win_sync(MPID_Win * win_ptr)
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
-
-
-#undef FUNCNAME
-#define FUNCNAME wait_for_lock_granted
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int wait_for_lock_granted(MPID_Win * win_ptr, int target_rank)
-{
-    int mpi_errno = MPI_SUCCESS;
-    MPIDI_STATE_DECL(MPID_STATE_WAIT_FOR_LOCK_GRANTED);
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_WAIT_FOR_LOCK_GRANTED);
-
-    /* After the target grants the lock, it sends a lock_granted packet. This
-     * packet is received in ch3u_handle_recv_pkt.c.  The handler for the
-     * packet sets the remote_lock_state flag to GRANTED.
-     */
-
-    MPIU_Assert(win_ptr->targets[target_rank].remote_lock_state == MPIDI_CH3_WIN_LOCK_REQUESTED ||
-                win_ptr->targets[target_rank].remote_lock_state == MPIDI_CH3_WIN_LOCK_GRANTED);
-
-    /* poke the progress engine until remote_lock_state flag is set to GRANTED */
-    if (win_ptr->targets[target_rank].remote_lock_state != MPIDI_CH3_WIN_LOCK_GRANTED) {
-        MPID_Progress_state progress_state;
-
-        MPID_Progress_start(&progress_state);
-        while (win_ptr->targets[target_rank].remote_lock_state != MPIDI_CH3_WIN_LOCK_GRANTED) {
-            mpi_errno = MPID_Progress_wait(&progress_state);
-            /* --BEGIN ERROR HANDLING-- */
-            if (mpi_errno != MPI_SUCCESS) {
-                MPID_Progress_end(&progress_state);
-                MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**winnoprogress");
-            }
-            /* --END ERROR HANDLING-- */
-        }
-        MPID_Progress_end(&progress_state);
-    }
-
-  fn_exit:
-    MPIDI_RMA_FUNC_EXIT(MPID_STATE_WAIT_FOR_LOCK_GRANTED);
-    return mpi_errno;
-    /* --BEGIN ERROR HANDLING-- */
-  fn_fail:
-    goto fn_exit;
-    /* --END ERROR HANDLING-- */
-}
-
-
-#undef FUNCNAME
-#define FUNCNAME send_lock_put_or_acc
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int send_lock_put_or_acc(MPID_Win * win_ptr, int target_rank)
-{
-    int mpi_errno = MPI_SUCCESS, lock_type, origin_dt_derived, iovcnt;
-    MPIDI_RMA_Op_t *rma_op;
-    MPID_Request *request = NULL;
-    MPIDI_VC_t *vc;
-    MPID_IOV iov[MPID_IOV_LIMIT];
-    MPID_Comm *comm_ptr;
-    MPID_Datatype *origin_dtp = NULL;
-    MPI_Aint origin_type_size;
-    MPIDI_CH3_Pkt_t upkt;
-    MPIDI_CH3_Pkt_lock_put_unlock_t *lock_put_unlock_pkt = &upkt.lock_put_unlock;
-    MPIDI_CH3_Pkt_lock_accum_unlock_t *lock_accum_unlock_pkt = &upkt.lock_accum_unlock;
-    MPIDI_CH3_Pkt_put_t *put_pkt;
-    MPIDI_CH3_Pkt_accum_t *accum_pkt;
-    MPIDI_CH3_Pkt_accum_immed_t *accumi_pkt;
-
-    MPIDI_STATE_DECL(MPID_STATE_SEND_LOCK_PUT_OR_ACC);
-
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_SEND_LOCK_PUT_OR_ACC);
-
-    lock_type = win_ptr->targets[target_rank].remote_lock_mode;
-
-    rma_op = MPIDI_CH3I_RMA_Ops_head(&win_ptr->targets[target_rank].rma_ops_list);
-
-    if (rma_op->pkt.type == MPIDI_CH3_PKT_PUT) {
-        put_pkt = &rma_op->pkt.put;
-
-        MPIDI_Pkt_init(lock_put_unlock_pkt, MPIDI_CH3_PKT_LOCK_PUT_UNLOCK);
-        lock_put_unlock_pkt->flags = MPIDI_CH3_PKT_FLAG_RMA_LOCK |
-            MPIDI_CH3_PKT_FLAG_RMA_UNLOCK | MPIDI_CH3_PKT_FLAG_RMA_REQ_ACK;
-        lock_put_unlock_pkt->target_win_handle = win_ptr->all_win_handles[rma_op->target_rank];
-        lock_put_unlock_pkt->source_win_handle = win_ptr->handle;
-        lock_put_unlock_pkt->lock_type = lock_type;
-        lock_put_unlock_pkt->origin_rank = win_ptr->comm_ptr->rank;
-
-        lock_put_unlock_pkt->addr = put_pkt->addr;
-        lock_put_unlock_pkt->count = put_pkt->count;
-        lock_put_unlock_pkt->datatype = put_pkt->datatype;
-
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) lock_put_unlock_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*lock_put_unlock_pkt);
-    }
-
-    else if (rma_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE) {
-        accum_pkt = &rma_op->pkt.accum;
-
-        MPIDI_Pkt_init(lock_accum_unlock_pkt, MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK);
-        lock_accum_unlock_pkt->flags = MPIDI_CH3_PKT_FLAG_RMA_LOCK |
-            MPIDI_CH3_PKT_FLAG_RMA_UNLOCK | MPIDI_CH3_PKT_FLAG_RMA_REQ_ACK;
-        lock_accum_unlock_pkt->target_win_handle = win_ptr->all_win_handles[rma_op->target_rank];
-        lock_accum_unlock_pkt->source_win_handle = win_ptr->handle;
-        lock_accum_unlock_pkt->lock_type = lock_type;
-        lock_accum_unlock_pkt->origin_rank = win_ptr->comm_ptr->rank;
-
-        lock_accum_unlock_pkt->addr = accum_pkt->addr;
-        lock_accum_unlock_pkt->count = accum_pkt->count;
-        lock_accum_unlock_pkt->datatype = accum_pkt->datatype;
-        lock_accum_unlock_pkt->op = accum_pkt->op;
-
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) lock_accum_unlock_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*lock_accum_unlock_pkt);
-    }
-    else if (rma_op->pkt.type == MPIDI_CH3_PKT_ACCUM_IMMED) {
-        accumi_pkt = &rma_op->pkt.accum_immed;
-
-        MPIDI_Pkt_init(lock_accum_unlock_pkt, MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK);
-        lock_accum_unlock_pkt->flags = MPIDI_CH3_PKT_FLAG_RMA_LOCK |
-            MPIDI_CH3_PKT_FLAG_RMA_UNLOCK | MPIDI_CH3_PKT_FLAG_RMA_REQ_ACK;
-        lock_accum_unlock_pkt->target_win_handle = win_ptr->all_win_handles[rma_op->target_rank];
-        lock_accum_unlock_pkt->source_win_handle = win_ptr->handle;
-        lock_accum_unlock_pkt->lock_type = lock_type;
-        lock_accum_unlock_pkt->origin_rank = win_ptr->comm_ptr->rank;
-
-        lock_accum_unlock_pkt->addr = accumi_pkt->addr;
-        lock_accum_unlock_pkt->count = accumi_pkt->count;
-        lock_accum_unlock_pkt->datatype = accumi_pkt->datatype;
-        lock_accum_unlock_pkt->op = accumi_pkt->op;
-
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) lock_accum_unlock_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*lock_accum_unlock_pkt);
-    }
-    else {
-        /* FIXME: Error return */
-        printf("expected short accumulate...\n");
-        /* */
-    }
-
-    comm_ptr = win_ptr->comm_ptr;
-    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
-
-    if (!MPIR_DATATYPE_IS_PREDEFINED(rma_op->origin_datatype)) {
-        origin_dt_derived = 1;
-        MPID_Datatype_get_ptr(rma_op->origin_datatype, origin_dtp);
-    }
-    else {
-        origin_dt_derived = 0;
-    }
-
-    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
-
-    if (!origin_dt_derived) {
-        /* basic datatype on origin */
-
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) rma_op->origin_addr;
-        iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
-        iovcnt = 2;
-
-        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, iovcnt, &request);
-        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-        if (mpi_errno != MPI_SUCCESS) {
-            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-        }
-    }
-    else {
-        /* derived datatype on origin */
-
-        iovcnt = 1;
-
-        request = MPID_Request_create();
-        if (request == NULL) {
-            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
-        }
-
-        MPIU_Object_set_ref(request, 2);
-        request->kind = MPID_REQUEST_SEND;
-
-        request->dev.datatype_ptr = origin_dtp;
-        /* this will cause the datatype to be freed when the request
-         * is freed. */
-
-        request->dev.segment_ptr = MPID_Segment_alloc();
-        MPIU_ERR_CHKANDJUMP1(request->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem",
-                             "**nomem %s", "MPID_Segment_alloc");
-
-        MPID_Segment_init(rma_op->origin_addr, rma_op->origin_count,
-                          rma_op->origin_datatype, request->dev.segment_ptr, 0);
-        request->dev.segment_first = 0;
-        request->dev.segment_size = rma_op->origin_count * origin_type_size;
-
-        request->dev.OnFinal = 0;
-        request->dev.OnDataAvail = 0;
-
-        mpi_errno = vc->sendNoncontig_fn(vc, request, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
-        /* --BEGIN ERROR HANDLING-- */
-        if (mpi_errno) {
-            MPID_Datatype_release(request->dev.datatype_ptr);
-            MPID_Request_release(request);
-            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|loadsendiov");
-        }
-        /* --END ERROR HANDLING-- */
-    }
-
-    if (request != NULL) {
-        if (!MPID_Request_is_complete(request)) {
-            MPID_Progress_state progress_state;
-
-            MPID_Progress_start(&progress_state);
-            while (!MPID_Request_is_complete(request)) {
-                mpi_errno = MPID_Progress_wait(&progress_state);
-                /* --BEGIN ERROR HANDLING-- */
-                if (mpi_errno != MPI_SUCCESS) {
-                    MPID_Progress_end(&progress_state);
-                    MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
-                }
-                /* --END ERROR HANDLING-- */
-            }
-            MPID_Progress_end(&progress_state);
-        }
-
-        mpi_errno = request->status.MPI_ERROR;
-        if (mpi_errno != MPI_SUCCESS) {
-            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
-        }
-
-        MPID_Request_release(request);
-    }
-
-    /* Free MPIDI_RMA_Ops_list - the lock packet should still be in place, so
-     * we have to free two elements. */
-    MPIDI_CH3I_RMA_Ops_free(win_ptr, &win_ptr->targets[target_rank].rma_ops_list,
-                            &win_ptr->targets[target_rank].rma_ops_list_tail);
-
-  fn_fail:
-    MPIDI_RMA_FUNC_EXIT(MPID_STATE_SEND_LOCK_PUT_OR_ACC);
-    return mpi_errno;
-}
-
-
-#undef FUNCNAME
-#define FUNCNAME send_lock_get
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int send_lock_get(MPID_Win * win_ptr, int target_rank)
-{
-    int mpi_errno = MPI_SUCCESS, lock_type;
-    MPIDI_RMA_Op_t *rma_op;
-    MPID_Request *rreq = NULL, *sreq = NULL;
-    MPIDI_VC_t *vc;
-    MPID_Comm *comm_ptr;
-    MPID_Datatype *dtp;
-    MPIDI_CH3_Pkt_t upkt;
-    MPIDI_CH3_Pkt_lock_get_unlock_t *lock_get_unlock_pkt = &upkt.lock_get_unlock;
-    MPIDI_CH3_Pkt_get_t *get_pkt;
-
-    MPIDI_STATE_DECL(MPID_STATE_SEND_LOCK_GET);
-
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_SEND_LOCK_GET);
-
-    lock_type = win_ptr->targets[target_rank].remote_lock_mode;
-
-    rma_op = MPIDI_CH3I_RMA_Ops_head(&win_ptr->targets[target_rank].rma_ops_list);
-
-    /* create a request, store the origin buf, cnt, datatype in it,
-     * and pass a handle to it in the get packet. When the get
-     * response comes from the target, it will contain the request
-     * handle. */
-    rreq = MPID_Request_create();
-    if (rreq == NULL) {
-        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
-    }
-
-    MPIU_Object_set_ref(rreq, 2);
-
-    rreq->dev.user_buf = rma_op->origin_addr;
-    rreq->dev.user_count = rma_op->origin_count;
-    rreq->dev.datatype = rma_op->origin_datatype;
-    rreq->dev.target_win_handle = MPI_WIN_NULL;
-    rreq->dev.source_win_handle = win_ptr->handle;
-
-    if (!MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype)) {
-        MPID_Datatype_get_ptr(rreq->dev.datatype, dtp);
-        rreq->dev.datatype_ptr = dtp;
-        /* this will cause the datatype to be freed when the
-         * request is freed. */
-    }
-
-    get_pkt = &rma_op->pkt.get;
-
-    MPIDI_Pkt_init(lock_get_unlock_pkt, MPIDI_CH3_PKT_LOCK_GET_UNLOCK);
-    lock_get_unlock_pkt->flags = MPIDI_CH3_PKT_FLAG_RMA_LOCK | MPIDI_CH3_PKT_FLAG_RMA_UNLOCK;   /* FIXME | MPIDI_CH3_PKT_FLAG_RMA_REQ_ACK; */
-    lock_get_unlock_pkt->target_win_handle = win_ptr->all_win_handles[rma_op->target_rank];
-    lock_get_unlock_pkt->source_win_handle = win_ptr->handle;
-    lock_get_unlock_pkt->lock_type = lock_type;
-    lock_get_unlock_pkt->origin_rank = win_ptr->comm_ptr->rank;
-
-    lock_get_unlock_pkt->addr = get_pkt->addr;
-    lock_get_unlock_pkt->count = get_pkt->count;
-    lock_get_unlock_pkt->datatype = get_pkt->datatype;
-    lock_get_unlock_pkt->request_handle = rreq->handle;
-
-    comm_ptr = win_ptr->comm_ptr;
-    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
-
-    MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-    mpi_errno = MPIDI_CH3_iStartMsg(vc, lock_get_unlock_pkt, sizeof(*lock_get_unlock_pkt), &sreq);
-    MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-    if (mpi_errno != MPI_SUCCESS) {
-        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-    }
-
-    /* release the request returned by iStartMsg */
-    if (sreq != NULL) {
-        MPID_Request_release(sreq);
-    }
-
-    /* now wait for the data to arrive */
-    if (!MPID_Request_is_complete(rreq)) {
-        MPID_Progress_state progress_state;
-
-        MPID_Progress_start(&progress_state);
-        while (!MPID_Request_is_complete(rreq)) {
-            mpi_errno = MPID_Progress_wait(&progress_state);
-            /* --BEGIN ERROR HANDLING-- */
-            if (mpi_errno != MPI_SUCCESS) {
-                MPID_Progress_end(&progress_state);
-                MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
-            }
-            /* --END ERROR HANDLING-- */
-        }
-        MPID_Progress_end(&progress_state);
-    }
-
-    mpi_errno = rreq->status.MPI_ERROR;
-    if (mpi_errno != MPI_SUCCESS) {
-        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
-    }
-
-    /* if origin datatype was a derived datatype, it will get freed when the
-     * rreq gets freed. */
-    MPID_Request_release(rreq);
-
-    /* Free MPIDI_RMA_Ops_list - the lock packet should still be in place, so
-     * we have to free two elements. */
-    MPIDI_CH3I_RMA_Ops_free(win_ptr, &win_ptr->targets[target_rank].rma_ops_list,
-                            &win_ptr->targets[target_rank].rma_ops_list_tail);
-
-  fn_fail:
-    MPIDI_RMA_FUNC_EXIT(MPID_STATE_SEND_LOCK_GET);
-    return mpi_errno;
-}
-
-/* ------------------------------------------------------------------------ */
-/* list_complete_timer/counter and list_block_timer defined above */
-
-static inline int rma_list_complete(MPID_Win * win_ptr, MPIDI_RMA_Ops_list_t * ops_list,
-                                    MPIDI_RMA_Ops_list_t *ops_list_tail)
-{
-    int ntimes = 0, mpi_errno = 0;
-    MPIDI_RMA_Op_t *curr_ptr;
-    MPID_Progress_state progress_state;
-
-    MPID_Progress_start(&progress_state);
-    /* Process all operations until they are complete */
-    while (!MPIDI_CH3I_RMA_Ops_isempty(ops_list)) {
-        int nDone = 0;
-        mpi_errno = rma_list_gc(win_ptr, ops_list, ops_list_tail, NULL, &nDone);
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
-        ntimes++;
-
-        /* Wait for something to arrive */
-        /* In some tests, this hung unless the test ensured that
-         * there was an incomplete request. */
-        curr_ptr = MPIDI_CH3I_RMA_Ops_head(ops_list);
-
-        /* MT: avoid processing unissued operations enqueued by other
-           threads in MPID_Progress_wait() */
-        if (curr_ptr && !curr_ptr->request) {
-            /* This RMA operation has not been issued yet. */
-            break;
-        }
-        if (curr_ptr && !MPID_Request_is_complete(curr_ptr->request)) {
-            mpi_errno = MPID_Progress_wait(&progress_state);
-            /* --BEGIN ERROR HANDLING-- */
-            if (mpi_errno != MPI_SUCCESS) {
-                MPID_Progress_end(&progress_state);
-                MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**winnoprogress");
-            }
-            /* --END ERROR HANDLING-- */
-        }
-    }   /* While list of rma operation is non-empty */
-    MPID_Progress_end(&progress_state);
-
-  fn_fail:
-    return mpi_errno;
-}
-
-/* This routine is used to do garbage collection work on completed RMA
-   requests so far. It is used to clean up the RMA requests that are
-   not completed immediately when issuing out but are completed later
-   when poking progress engine, so that they will not waste internal
-   resources.
-*/
-static inline int rma_list_gc(MPID_Win * win_ptr,
-                              MPIDI_RMA_Ops_list_t * ops_list,
-                              MPIDI_RMA_Ops_list_t * ops_list_tail,
-                              MPIDI_RMA_Op_t * last_elm, int *nDone)
-{
-    int mpi_errno = 0;
-    MPIDI_RMA_Op_t *curr_ptr;
-    int nComplete = 0;
-    int nVisit = 0;
-
-    curr_ptr = MPIDI_CH3I_RMA_Ops_head(ops_list);
-    do {
-        /* MT: avoid processing unissued operations enqueued by other threads
-           in rma_list_complete() */
-        if (curr_ptr && !curr_ptr->request) {
-            /* This RMA operation has not been issued yet. */
-            break;
-        }
-        if (MPID_Request_is_complete(curr_ptr->request)) {
-            /* Once we find a complete request, we complete
-             * as many as possible until we find an incomplete
-             * or null request */
-            do {
-                nComplete++;
-                mpi_errno = curr_ptr->request->status.MPI_ERROR;
-                /* --BEGIN ERROR HANDLING-- */
-                if (mpi_errno != MPI_SUCCESS) {
-                    MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
-                }
-                /* --END ERROR HANDLING-- */
-                MPID_Request_release(curr_ptr->request);
-                MPIDI_CH3I_RMA_Ops_free_and_next(win_ptr, ops_list, ops_list_tail, &curr_ptr);
-                nVisit++;
-
-                /* MT: avoid processing unissued operations enqueued by other
-                   threads in rma_list_complete() */
-                if (curr_ptr && !curr_ptr->request) {
-                    /* This RMA operation has not been issued yet. */
-                    break;
-                }
-            }
-            while (curr_ptr && curr_ptr != last_elm && MPID_Request_is_complete(curr_ptr->request));
-            if ((MPIR_CVAR_CH3_RMA_GC_NUM_TESTED >= 0 &&
-                 nVisit >= MPIR_CVAR_CH3_RMA_GC_NUM_TESTED) ||
-                (MPIR_CVAR_CH3_RMA_GC_NUM_COMPLETED >= 0 &&
-                 nComplete >= MPIR_CVAR_CH3_RMA_GC_NUM_COMPLETED)) {
-                /* MPIR_CVAR_CH3_RMA_GC_NUM_TESTED: Once we tested certain
-                 * number of requests, we stop checking the rest of the
-                 * operation list and break out the loop. */
-                /* MPIR_CVAR_CH3_RMA_GC_NUM_COMPLETED: Once we found
-                 * certain number of completed requests, we stop checking
-                 * the rest of the operation list and break out the loop. */
-                break;
-            }
-        }
-        else {
-            /* proceed to the next entry.  */
-            curr_ptr = curr_ptr->next;
-            nVisit++;
-            if (MPIR_CVAR_CH3_RMA_GC_NUM_TESTED >= 0 && nVisit >= MPIR_CVAR_CH3_RMA_GC_NUM_TESTED) {
-                /* MPIR_CVAR_CH3_RMA_GC_NUM_TESTED: Once we tested certain
-                 * number of requests, we stop checking the rest of the
-                 * operation list and break out the loop. */
-                break;
-            }
-        }
-    } while (curr_ptr && curr_ptr != last_elm);
-
-    /* if (nComplete) printf("Completed %d requests\n", nComplete); */
-
-    *nDone = nComplete;
-
-  fn_fail:
-    return mpi_errno;
-}
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index 56cd909..8846284 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -282,7 +282,7 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     MPID_Comm *win_comm_ptr;
     int win_target_pool_size;
     MPIDI_RMA_Win_list_t *win_elem;
-    MPIU_CHKPMEM_DECL(5);
+    MPIU_CHKPMEM_DECL(4);
     MPIDI_STATE_DECL(MPID_STATE_WIN_INIT);
 
     MPIDI_FUNC_ENTER(MPID_STATE_WIN_INIT);
@@ -324,10 +324,6 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->shared_lock_ref_cnt = 0;
     (*win_ptr)->lock_queue = NULL;
     (*win_ptr)->lock_queue_tail = NULL;
-    (*win_ptr)->epoch_state = MPIDI_EPOCH_NONE;
-    (*win_ptr)->epoch_count = 0;
-    (*win_ptr)->at_rma_ops_list = NULL;
-    (*win_ptr)->at_rma_ops_list_tail = NULL;
     (*win_ptr)->shm_allocated = FALSE;
     (*win_ptr)->states.access_state = MPIDI_RMA_NONE;
     (*win_ptr)->states.exposure_state = MPIDI_RMA_NONE;
@@ -343,17 +339,6 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->outstanding_locks = 0;
     (*win_ptr)->outstanding_unlocks = 0;
 
-    /* Initialize the passive target lock state */
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->targets, struct MPIDI_Win_target_state *,
-                        sizeof(struct MPIDI_Win_target_state) * MPIR_Comm_size(win_comm_ptr),
-                        mpi_errno, "RMA target states array");
-
-    for (i = 0; i < MPIR_Comm_size(win_comm_ptr); i++) {
-        (*win_ptr)->targets[i].rma_ops_list = NULL;
-        (*win_ptr)->targets[i].rma_ops_list_tail = NULL;
-        (*win_ptr)->targets[i].remote_lock_state = MPIDI_CH3_WIN_LOCK_NONE;
-    }
-
     /* Initialize the info flags */
     (*win_ptr)->info_args.no_locks = 0;
     (*win_ptr)->info_args.accumulate_ordering = MPIDI_ACC_ORDER_RAR | MPIDI_ACC_ORDER_RAW |
diff --git a/src/mpid/ch3/src/mpidi_printf.c b/src/mpid/ch3/src/mpidi_printf.c
index 8e7d69d..8f15f63 100644
--- a/src/mpid/ch3/src/mpidi_printf.c
+++ b/src/mpid/ch3/src/mpidi_printf.c
@@ -143,15 +143,6 @@ void MPIDI_DBG_Print_packet(MPIDI_CH3_Pkt_t *pkt)
 	    case MPIDI_CH3_PKT_LOCK:
 		MPIDI_CH3_PktPrint_Lock( stdout, pkt );
 		break;
-	    case MPIDI_CH3_PKT_LOCK_PUT_UNLOCK:
-		MPIDI_CH3_PktPrint_LockPutUnlock( stdout, pkt );
-		break;
-	    case MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK:
-		MPIDI_CH3_PktPrint_LockAccumUnlock( stdout, pkt );
-		break;
-	    case MPIDI_CH3_PKT_LOCK_GET_UNLOCK:
-		MPIDI_CH3_PktPrint_LockGetUnlock( stdout, pkt );
-		break;
 	    case MPIDI_CH3_PKT_FLUSH_ACK:
 		MPIDI_CH3_PktPrint_FlushAck( stdout, pkt );
 		break;
@@ -342,28 +333,6 @@ const char *MPIDI_Pkt_GetDescString( MPIDI_CH3_Pkt_t *pkt )
 		       "LOCK - %d", 
 		       pkt->lock.target_win_handle );
 	break;
-    case MPIDI_CH3_PKT_LOCK_PUT_UNLOCK:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "PUT_UNLOCK - (%p,%d,0x%08X)", 
-		       pkt->lock_put_unlock.addr,
-		       pkt->lock_put_unlock.count,
-		       pkt->lock_put_unlock.target_win_handle );
-	break;
-    case MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "LOCK_ACCUM_UNLOCK - (%p,%d,0x%08X)", 
-		       pkt->lock_accum_unlock.addr,
-		       pkt->lock_accum_unlock.count,
-		       pkt->lock_accum_unlock.target_win_handle );
-	break;
-    case MPIDI_CH3_PKT_LOCK_GET_UNLOCK:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "LOCK_GET_UNLOCK - (%p,%d,0x%08X) req=%d", 
-		       pkt->lock_get_unlock.addr,
-		       pkt->lock_get_unlock.count,
-		       pkt->lock_get_unlock.target_win_handle, 
-		       pkt->lock_get_unlock.request_handle );
-	break;
     case MPIDI_CH3_PKT_FLUSH_ACK:
 	/* There is no rma_done packet type */
 	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index 6e63c0f..6f056e2 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -185,7 +185,6 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
     if (mpi_errno)
         MPIU_ERR_POP(mpi_errno);
 
-    MPIU_Free((*win_ptr)->targets);
     MPIU_Free((*win_ptr)->base_addrs);
     MPIU_Free((*win_ptr)->sizes);
     MPIU_Free((*win_ptr)->disp_units);

http://git.mpich.org/mpich.git/commitdiff/0542e30407f86bda999cc307d894886bd1c563e2

commit 0542e30407f86bda999cc307d894886bd1c563e2
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Nov 2 22:49:18 2014 -0600

    Rewrite code of passive lock control messages.
    
    1. Piggyback LOCK request with first IMMED operation.
    
    When we see an IMMED operation, we can always piggyback
    LOCK request with that operation to reduce one sync
    message of single LOCK request. When packet header of
    that operation is received on target, we will try to
    acquire the lock and perform that operation. The target
    either piggybacks LOCK_GRANTED message with the response
    packet (if available), or sends a single LOCK_GRANTED
    message back to origin.
    
    2. Rewrite the code that manages the lock queue.
    
    When the lock request cannot be satisfied on target,
    we need to buffer that lock request on target. All we
    need to do is enqueue the packet header, which contains
    all information we need after lock is granted. When
    the current lock is released, the runtime will go
    over the lock queue and grant the lock to the next
    available request. After lock is granted, the runtime
    just triggers the packet handler for the second time.
    
    3. Release lock on target side if piggybacking with UNLOCK.
    
    If there are active-message operations to be issued,
    we piggyback an UNLOCK flag with the last operation.
    When the target receives it, it will release the current
    lock and grant the lock to the next process.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 5892eab..e8869da 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -48,6 +48,7 @@ static inline MPIDI_RMA_Op_t *MPIDI_CH3I_Win_op_alloc(MPID_Win * win_ptr)
     e->dataloop = NULL;
     e->request = NULL;
     e->is_dt = 0;
+    e->piggyback_lock_candidate = 0;
 
     return e;
 }
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index 3047231..61500d7 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -81,6 +81,7 @@ typedef struct MPIDI_RMA_Op {
     MPIDI_CH3_Pkt_t pkt;
     MPIDI_RMA_Pool_type_t pool_type;
     int is_dt;
+    int piggyback_lock_candidate;
 } MPIDI_RMA_Op_t;
 
 typedef struct MPIDI_RMA_Target {
@@ -135,25 +136,9 @@ typedef struct MPIDI_RMA_Win_list {
 
 extern MPIDI_RMA_Win_list_t *MPIDI_RMA_Win_list, *MPIDI_RMA_Win_list_tail;
 
-typedef struct MPIDI_PT_single_op {
-    MPIDI_CH3_Pkt_type_t type;  /* put, get, or accum. */
-    void *addr;
-    int count;
-    MPI_Datatype datatype;
-    MPI_Op op;
-    void *data;                 /* for queued puts and accumulates, data is copied here */
-    MPI_Request request_handle; /* for gets */
-    int data_recd;              /* to indicate if the data has been received */
-    MPIDI_CH3_Pkt_flags_t flags;
-} MPIDI_PT_single_op;
-
 typedef struct MPIDI_Win_lock_queue {
     struct MPIDI_Win_lock_queue *next;
-    int lock_type;
-    MPI_Win source_win_handle;
-    int origin_rank;
-    struct MPIDI_PT_single_op *pt_single_op;    /* to store info for
-                                                 * lock-put-unlock optimization */
+    MPIDI_CH3_Pkt_t pkt;    /* all information for this request packet */
 } MPIDI_Win_lock_queue;
 
 typedef MPIDI_RMA_Op_t *MPIDI_RMA_Ops_list_t;
diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index dad5b23..9f12b38 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -120,7 +120,9 @@ typedef enum {
     MPIDI_CH3_PKT_FLAG_RMA_NOCHECK = 32,
     MPIDI_CH3_PKT_FLAG_RMA_SHARED = 64,
     MPIDI_CH3_PKT_FLAG_RMA_EXCLUSIVE = 128,
-    MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK = 256
+    MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK = 256,
+    MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK = 512,
+    MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED = 1024
 } MPIDI_CH3_Pkt_flags_t;
 
 typedef struct MPIDI_CH3_Pkt_send {
@@ -407,6 +409,8 @@ typedef struct MPIDI_CH3_Pkt_put {
                                  * with shared locks. Otherwise set to NULL*/
     char data[MPIDI_RMA_IMMED_BYTES];
     size_t immed_len;
+    int lock_type;      /* used when piggybacking LOCK message. */
+    int origin_rank;    /* used when piggybacking LOCK message. */
 } MPIDI_CH3_Pkt_put_t;
 
 typedef struct MPIDI_CH3_Pkt_get {
@@ -424,6 +428,8 @@ typedef struct MPIDI_CH3_Pkt_get {
     MPI_Win source_win_handle;  /* Used in the last RMA operation in an
                                  * epoch in the case of passive target rma
                                  * with shared locks. Otherwise set to NULL*/
+    int lock_type;   /* used when piggybacking LOCK message. */
+    int origin_rank; /* used when piggybacking LOCK message. */
 } MPIDI_CH3_Pkt_get_t;
 
 typedef struct MPIDI_CH3_Pkt_get_resp {
@@ -452,6 +458,8 @@ typedef struct MPIDI_CH3_Pkt_accum {
                                  * with shared locks. Otherwise set to NULL*/
     char data[MPIDI_RMA_IMMED_BYTES];
     size_t immed_len;
+    int lock_type;    /* used when piggybacking LOCK message. */
+    int origin_rank;  /* used when piggybacking LOCK message. */
 } MPIDI_CH3_Pkt_accum_t;
 
 typedef struct MPIDI_CH3_Pkt_get_accum {
@@ -472,6 +480,8 @@ typedef struct MPIDI_CH3_Pkt_get_accum {
                                  * with shared locks. Otherwise set to NULL*/
     char data[MPIDI_RMA_IMMED_BYTES];
     size_t immed_len;
+    int lock_type;     /* used when piggybacking LOCK message. */
+    int origin_rank;   /* used when piggybacking LOCK message. */
 } MPIDI_CH3_Pkt_get_accum_t;
 
 typedef struct MPIDI_CH3_Pkt_get_accum_resp {
@@ -515,6 +525,8 @@ typedef struct MPIDI_CH3_Pkt_cas {
                                  * in passive target rma. Otherwise set to NULL*/
     MPIDI_CH3_CAS_Immed_u origin_data;
     MPIDI_CH3_CAS_Immed_u compare_data;
+    int lock_type;     /* used when piggybacking LOCK message. */
+    int origin_rank;   /* used when piggybacking LOCK message. */
 } MPIDI_CH3_Pkt_cas_t;
 
 typedef struct MPIDI_CH3_Pkt_cas_resp {
@@ -541,6 +553,8 @@ typedef struct MPIDI_CH3_Pkt_fop {
                                  * in passive target rma. Otherwise set to NULL*/
     char data[MPIDI_RMA_IMMED_BYTES];
     int immed_len;
+    int lock_type;     /* used when piggybacking LOCK message. */
+    int origin_rank;   /* used when piggybacking LOCK message. */
 } MPIDI_CH3_Pkt_fop_t;
 
 typedef struct MPIDI_CH3_Pkt_fop_resp {
@@ -596,6 +610,7 @@ typedef struct MPIDI_CH3_Pkt_flush_ack {
     MPI_Win source_win_handle;
     int target_rank;            /* Used in flush_ack response to look up the
                                  * target state at the origin. */
+    MPIDI_CH3_Pkt_flags_t flags;
 } MPIDI_CH3_Pkt_flush_ack_t;
 
 typedef struct MPIDI_CH3_Pkt_decr_at_counter {
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index 0f522af..c46f98c 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -342,6 +342,7 @@ struct MPIDI_Win_target_state {
                               * (none, shared, exclusive) */             \
     volatile int shared_lock_ref_cnt;                                    \
     struct MPIDI_Win_lock_queue volatile *lock_queue;  /* list of unsatisfied locks */  \
+    struct MPIDI_Win_lock_queue volatile *lock_queue_tail; /* tail of unstaisfied locks. */ \
                                                                          \
     MPI_Aint *sizes;      /* array of sizes of all windows */            \
     struct MPIDI_Win_info_args info_args;                                \
@@ -474,7 +475,6 @@ typedef struct MPIDI_Request {
     MPI_Win     target_win_handle;
     MPI_Win     source_win_handle;
     MPIDI_CH3_Pkt_flags_t flags; /* flags that were included in the original RMA packet header */
-    struct MPIDI_Win_lock_queue *lock_queue_entry; /* for single lock-put-unlock optimization */
     MPI_Request resp_request_handle; /* Handle for get_accumulate response */
 
     MPIDI_REQUEST_SEQNUM
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index 9a016e4..a5c6578 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -84,6 +84,7 @@ static inline int send_unlock_msg(int dest, MPID_Win * win_ptr)
 
     MPIDI_Pkt_init(unlock_pkt, MPIDI_CH3_PKT_UNLOCK);
     unlock_pkt->target_win_handle = win_ptr->all_win_handles[dest];
+    unlock_pkt->source_win_handle = win_ptr->handle;
 
     /* Reset the local state of the target to unlocked */
     win_ptr->targets[dest].remote_lock_state = MPIDI_CH3_WIN_LOCK_NONE;
@@ -154,6 +155,7 @@ static inline int MPIDI_CH3I_Send_lock_granted_pkt(MPIDI_VC_t * vc, MPID_Win * w
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 static inline int MPIDI_CH3I_Send_flush_ack_pkt(MPIDI_VC_t *vc, MPID_Win *win_ptr,
+                                                MPIDI_CH3_Pkt_flags_t flags,
                                     MPI_Win source_win_handle)
 {
     MPIDI_CH3_Pkt_t upkt;
@@ -167,6 +169,7 @@ static inline int MPIDI_CH3I_Send_flush_ack_pkt(MPIDI_VC_t *vc, MPID_Win *win_pt
     MPIDI_Pkt_init(flush_ack_pkt, MPIDI_CH3_PKT_FLUSH_ACK);
     flush_ack_pkt->source_win_handle = source_win_handle;
     flush_ack_pkt->target_rank = win_ptr->comm_ptr->rank;
+    flush_ack_pkt->flags = flags;
 
     /* Because this is in a packet handler, it is already within a critical section */	
     /* MPIU_THREAD_CS_ENTER(CH3COMM,vc); */
@@ -227,6 +230,56 @@ static inline int send_decr_at_cnt_msg(int dst, MPID_Win * win_ptr)
     /* --END ERROR HANDLING-- */
 }
 
+
+
+/* enqueue an unsatisfied origin in passive target at target side. */
+static inline int enqueue_lock_origin(MPID_Win *win_ptr, MPIDI_CH3_Pkt_t *pkt)
+{
+    MPIDI_Win_lock_queue *new_ptr = NULL;
+    int mpi_errno = MPI_SUCCESS;
+
+    new_ptr = (MPIDI_Win_lock_queue *) MPIU_Malloc(sizeof(MPIDI_Win_lock_queue));
+    if (!new_ptr) {
+        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
+                             "MPIDI_Win_lock_queue");
+    }
+
+    new_ptr->next = NULL;
+    new_ptr->pkt = (*pkt);
+    MPL_LL_APPEND(win_ptr->lock_queue, win_ptr->lock_queue_tail, new_ptr);
+
+ fn_exit:
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+
+static inline int set_lock_sync_counter(MPID_Win *win_ptr, int target_rank)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    if (win_ptr->outstanding_locks > 0) {
+        win_ptr->outstanding_locks--;
+        MPIU_Assert(win_ptr->outstanding_locks >= 0);
+    }
+    else {
+        MPIDI_RMA_Target_t *t = NULL;
+        mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, target_rank, &t);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        MPIU_Assert(t != NULL);
+
+        t->outstanding_lock--;
+        MPIU_Assert(t->outstanding_lock == 0);
+    }
+
+ fn_exit:
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+
 #undef FUNCNAME
 #define FUNCNAME acquire_local_lock
 #undef FCNAME
@@ -237,21 +290,21 @@ static inline int acquire_local_lock(MPID_Win * win_ptr, int lock_type)
     MPIDI_STATE_DECL(MPID_STATE_ACQUIRE_LOCAL_LOCK);
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_ACQUIRE_LOCAL_LOCK);
 
-    /* poke the progress engine until the local lock is granted */
-    if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, lock_type) == 0) {
-        MPID_Progress_state progress_state;
-
-        MPID_Progress_start(&progress_state);
-        while (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, lock_type) == 0) {
-            mpi_errno = MPID_Progress_wait(&progress_state);
-            /* --BEGIN ERROR HANDLING-- */
-            if (mpi_errno != MPI_SUCCESS) {
-                MPID_Progress_end(&progress_state);
-                MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**winnoprogress");
-            }
-            /* --END ERROR HANDLING-- */
-        }
-        MPID_Progress_end(&progress_state);
+    if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, lock_type) == 1) {
+        mpi_errno = set_lock_sync_counter(win_ptr, win_ptr->comm_ptr->rank);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+    else {
+        /* Queue the lock information. */
+        MPIDI_CH3_Pkt_t pkt;
+        MPIDI_CH3_Pkt_lock_t *lock_pkt = &pkt.lock;
+
+        MPIDI_Pkt_init(lock_pkt, MPIDI_CH3_PKT_LOCK);
+        lock_pkt->lock_type = lock_type;
+        lock_pkt->origin_rank = win_ptr->comm_ptr->rank;
+
+        mpi_errno = enqueue_lock_origin(win_ptr, &pkt);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
     }
 
     win_ptr->targets[win_ptr->comm_ptr->rank].remote_lock_state = MPIDI_CH3_WIN_LOCK_GRANTED;
@@ -409,6 +462,41 @@ static inline int do_accumulate_op(MPID_Request *rreq)
     goto fn_exit;
 }
 
+
+static inline int check_piggyback_lock(MPID_Win *win_ptr, MPIDI_CH3_Pkt_t *pkt, int *acquire_lock_fail) {
+    int lock_type;
+    MPIDI_CH3_Pkt_flags_t flags;
+    int mpi_errno = MPI_SUCCESS;
+
+    (*acquire_lock_fail) = 0;
+
+    MPIDI_CH3_PKT_RMA_GET_FLAGS((*pkt), flags, mpi_errno);
+    MPIDI_CH3_PKT_RMA_GET_LOCK_TYPE((*pkt), lock_type, mpi_errno);
+
+    if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK) {
+        if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, lock_type) == 0) {
+
+            /* cannot acquire the lock, queue up this operation. */
+            mpi_errno = enqueue_lock_origin(win_ptr, pkt);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+            (*acquire_lock_fail) = 1;
+        }
+        else {
+            /* unset LOCK flag */
+            MPIDI_CH3_PKT_RMA_UNSET_FLAG((*pkt), MPIDI_CH3_PKT_FLAG_RMA_LOCK, mpi_errno);
+            /* set LOCK_GRANTED flag */
+            MPIDI_CH3_PKT_RMA_SET_FLAG((*pkt), MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED, mpi_errno);
+        }
+    }
+
+ fn_exit:
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+
 static inline int wait_progress_engine(void)
 {
     int mpi_errno = MPI_SUCCESS;
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index bae961b..1eb93e2 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -88,8 +88,17 @@ int MPIDI_CH3_ReqHandler_PutRecvComplete( MPIDI_VC_t *vc,
 
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED) {
+        if (!(rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) &&
+            !(rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)) {
+            mpi_errno = MPIDI_CH3I_Send_lock_granted_pkt(vc, win_ptr, rreq->dev.source_win_handle);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            MPIDI_CH3_Progress_signal_completion();
+        }
+    }
     if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) {
-        mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, rreq->dev.source_win_handle);
+        mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, rreq->dev.flags,
+                                                  rreq->dev.source_win_handle);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         MPIDI_CH3_Progress_signal_completion();
     }
@@ -100,7 +109,14 @@ int MPIDI_CH3_ReqHandler_PutRecvComplete( MPIDI_VC_t *vc,
         if (win_ptr->at_completion_counter == 0)
             MPIDI_CH3_Progress_signal_completion();
     }
-
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
+        mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, rreq->dev.flags,
+                                                  rreq->dev.source_win_handle);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        MPIDI_CH3_Progress_signal_completion();
+    }
     mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
                                                rreq->dev.source_win_handle);
     if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
@@ -147,8 +163,17 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete( MPIDI_VC_t *vc,
         MPIU_ERR_POP(mpi_errno);
     }
 
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED) {
+        if (!(rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) &&
+            !(rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)) {
+            mpi_errno = MPIDI_CH3I_Send_lock_granted_pkt(vc, win_ptr, rreq->dev.source_win_handle);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            MPIDI_CH3_Progress_signal_completion();
+        }
+    }
     if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) {
-        mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, rreq->dev.source_win_handle);
+        mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, rreq->dev.flags,
+                                                  rreq->dev.source_win_handle);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         MPIDI_CH3_Progress_signal_completion();
     }
@@ -159,6 +184,13 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete( MPIDI_VC_t *vc,
         if (win_ptr->at_completion_counter == 0)
             MPIDI_CH3_Progress_signal_completion();
     }
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
+        mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, rreq->dev.flags,
+                                                  rreq->dev.source_win_handle);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
 
     mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
                                                rreq->dev.source_win_handle);
@@ -205,8 +237,12 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *vc,
     get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
     get_accum_resp_pkt->source_win_handle = rreq->dev.source_win_handle;
     get_accum_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED)
+        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
     if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
         get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)
+        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK;
 
     MPID_Datatype_get_size_macro(rreq->dev.datatype, type_size);
 
@@ -522,8 +558,12 @@ int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *vc,
     get_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
     get_resp_pkt->source_win_handle = rreq->dev.source_win_handle;
     get_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED)
+        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
     if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
         get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)
+        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK;
     
     sreq->dev.segment_ptr = MPID_Segment_alloc( );
     MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
@@ -832,6 +872,82 @@ static int create_derived_datatype(MPID_Request *req, MPID_Datatype **dtp)
     return mpi_errno;
 }
 
+
+static inline int perform_op_in_lock_queue(MPID_Win *win_ptr, MPIDI_Win_lock_queue *lock_entry)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPID_Request *req = NULL;
+    MPIDI_msg_sz_t len = sizeof(MPIDI_CH3_Pkt_t);
+    MPIDI_VC_t *vc = NULL;
+    int origin_rank;
+    static MPIDI_CH3_PktHandler_Fcn *pktArray[MPIDI_CH3_PKT_END_ALL+1];
+    static int needsInit = 1;
+
+    if (lock_entry->pkt.type == MPIDI_CH3_PKT_LOCK) {
+
+        /* single LOCK request */
+
+        MPIDI_CH3_Pkt_lock_t *lock_pkt = &(lock_entry->pkt.lock);
+        if (lock_pkt->origin_rank == win_ptr->comm_ptr->rank) {
+            if (win_ptr->outstanding_locks > 0) {
+                win_ptr->outstanding_locks--;
+                MPIU_Assert(win_ptr->outstanding_locks >= 0);
+            }
+            else {
+                MPIDI_RMA_Target_t *t = NULL;
+                mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr,
+                                                       win_ptr->comm_ptr->rank, &t);
+                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                MPIU_Assert(t != NULL);
+                t->outstanding_lock--;
+                MPIU_Assert(t->outstanding_lock == 0);
+            }
+        }
+        else {
+            MPIDI_Comm_get_vc_set_active(win_ptr->comm_ptr,
+                                         lock_pkt->origin_rank, &vc);
+            mpi_errno = MPIDI_CH3I_Send_lock_granted_pkt(vc, win_ptr,
+                                              lock_pkt->source_win_handle);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        }
+    }
+    else {
+        /* LOCK+OP packet */
+
+        /* get VC */
+        MPIDI_CH3_PKT_RMA_GET_ORIGIN_RANK(lock_entry->pkt, origin_rank, mpi_errno);
+        MPIDI_Comm_get_vc(win_ptr->comm_ptr, origin_rank, &vc);
+
+        /* unset LOCK flag */
+        MPIDI_CH3_PKT_RMA_UNSET_FLAG(lock_entry->pkt, MPIDI_CH3_PKT_FLAG_RMA_LOCK, mpi_errno);
+
+        /* set LOCK_GRANTED flag */
+        MPIDI_CH3_PKT_RMA_SET_FLAG(lock_entry->pkt, MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED, mpi_errno);
+
+        if (needsInit) {
+            mpi_errno = MPIDI_CH3_PktHandler_Init(pktArray, MPIDI_CH3_PKT_END_CH3);
+            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            needsInit = 0;
+        }
+
+        /* invalid pkt data will result in unpredictable behavior */
+        MPIU_Assert((lock_entry->pkt).type >= MPIDI_CH3_PKT_PUT && (lock_entry->pkt).type <= MPIDI_CH3_PKT_CAS);
+
+        /* trigger packet handler to deal with this op. */
+        mpi_errno = pktArray[lock_entry->pkt.type](vc, &(lock_entry->pkt), &len, &req);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        MPIU_Assert(len == sizeof(MPIDI_CH3_Pkt_t));
+        MPIU_Assert(req == NULL);
+    }
+
+ fn_exit:
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+
 static int entered_flag = 0;
 static int entered_count = 0;
 
@@ -843,10 +959,10 @@ static int entered_count = 0;
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_CH3I_Release_lock(MPID_Win *win_ptr)
 {
-    MPIDI_Win_lock_queue *lock_queue, **lock_queue_ptr;
+    MPIDI_Win_lock_queue *lock_entry, *lock_entry_next;
     int requested_lock, mpi_errno = MPI_SUCCESS, temp_entered_count;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_RELEASE_LOCK);
-    
+
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_RELEASE_LOCK);
 
     if (win_ptr->current_lock_type == MPI_LOCK_SHARED) {
@@ -860,159 +976,57 @@ int MPIDI_CH3I_Release_lock(MPID_Win *win_ptr)
     if (win_ptr->shared_lock_ref_cnt == 0) {
 
 	/* This function needs to be reentrant even in the single-threaded case
-           because when going through the lock queue, the do_simple_get 
-	   called in the 
-	   lock-get-unlock case may itself cause a request to complete, and 
-	   this function
-           may again get called in the completion action in 
-	   ch3u_handle_send_req.c. To
-           handle this possibility, we use an entered_flag. If the flag is 
-	   not 0, we simply
-	   increment the entered_count and return. The loop through the lock 
-	   queue is repeated 
-	   if the entered_count has changed while we are in the loop.
+           because when going through the lock queue, pkt_handler() in
+           perform_op_in_lock_queue() may again call release_lock(). To handle
+           this possibility, we use an entered_flag.
+           If the flag is not 0, we simply increment the entered_count and return.
+           The loop through the lock queue is repeated if the entered_count has
+           changed while we are in the loop.
 	 */
 	if (entered_flag != 0) {
-	    entered_count++;
+	    entered_count++; /* Count how many times we re-enter */
 	    goto fn_exit;
 	}
-	else {
-	    entered_flag = 1;
-	    temp_entered_count = entered_count;
-	}
 
-	do { 
+        entered_flag = 1;  /* Mark that we are now entering release_lock() */
+        temp_entered_count = entered_count;
+
+	do {
 	    if (temp_entered_count != entered_count) temp_entered_count++;
 
 	    /* FIXME: MT: The setting of the lock type must be done atomically */
 	    win_ptr->current_lock_type = MPID_LOCK_NONE;
-	    
+
 	    /* If there is a lock queue, try to satisfy as many lock requests as 
 	       possible. If the first one is a shared lock, grant it and grant all 
 	       other shared locks. If the first one is an exclusive lock, grant 
 	       only that one. */
-	    
+
 	    /* FIXME: MT: All queue accesses need to be made atomic */
-	    lock_queue = (MPIDI_Win_lock_queue *) win_ptr->lock_queue;
-	    lock_queue_ptr = (MPIDI_Win_lock_queue **) &(win_ptr->lock_queue);
-	    while (lock_queue) {
-		/* if it is not a lock-op-unlock type case or if it is a 
-		   lock-op-unlock type case but all the data has been received, 
-		   try to acquire the lock */
-		if ((lock_queue->pt_single_op == NULL) || 
-		    (lock_queue->pt_single_op->data_recd == 1)) {
-		    
-		    requested_lock = lock_queue->lock_type;
-		    if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, requested_lock) 
-			== 1) {
-			
-			if (lock_queue->pt_single_op != NULL) {
-			    /* single op. do it here */
-			    MPIDI_PT_single_op * single_op;
-			    
-			    single_op = lock_queue->pt_single_op;
-			    if (single_op->type == MPIDI_CH3_PKT_LOCK_PUT_UNLOCK) {
-				mpi_errno = MPIR_Localcopy(single_op->data,
-							   single_op->count,
-							   single_op->datatype,
-							   single_op->addr,
-							   single_op->count,
-							   single_op->datatype);
-			    }   
-			    else if (single_op->type == MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK) {
-				if (win_ptr->shm_allocated == TRUE)
-				    MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-				mpi_errno = do_simple_accumulate(single_op);
-				if (win_ptr->shm_allocated == TRUE)
-				    MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-			    }
-			    else if (single_op->type == MPIDI_CH3_PKT_LOCK_GET_UNLOCK) {
-				mpi_errno = do_simple_get(win_ptr, lock_queue);
-			    }
-			    
-                            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-			    
-			    /* if put or accumulate, send rma done packet and release lock. */
-			    if (single_op->type != MPIDI_CH3_PKT_LOCK_GET_UNLOCK) {
-                                /* NOTE: Only *queued* single_op operations are completed here.
-                                   Lock-op-unlock/single_op RMA ops can also be completed as
-                                   they arrive within various packet/request handlers via
-                                   MPIDI_CH3_Finish_rma_op_target().  That call cannot be used
-                                   here, because it would enter this function recursively. */
-                                MPIDI_VC_t *vc;
-                                MPIDI_Comm_get_vc(win_ptr->comm_ptr, lock_queue->origin_rank, &vc);
-				mpi_errno = 
-                                    MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr,
-								    lock_queue->source_win_handle);
-                                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-				
-				/* release the lock */
-				if (win_ptr->current_lock_type == MPI_LOCK_SHARED) {
-				    /* decr ref cnt */
-				    /* FIXME: MT: Must be done atomically */
-				    win_ptr->shared_lock_ref_cnt--;
-				}
-				
-				/* If shared lock ref count is 0 
-				   (which is also true if the lock is an
-				   exclusive lock), release the lock. */
-				if (win_ptr->shared_lock_ref_cnt == 0) {
-				    /* FIXME: MT: The setting of the lock type 
-				       must be done atomically */
-				    win_ptr->current_lock_type = MPID_LOCK_NONE;
-				}
-				
-				/* dequeue entry from lock queue */
-				MPIU_Free(single_op->data);
-				MPIU_Free(single_op);
-				*lock_queue_ptr = lock_queue->next;
-				MPIU_Free(lock_queue);
-				lock_queue = *lock_queue_ptr;
-			    }
-			    
-			    else {
-				/* it's a get. The operation is not complete. It 
-				   will be completed in ch3u_handle_send_req.c. 
-				   Free the single_op structure. If it's an 
-				   exclusive lock, break. Otherwise continue to the
-				   next operation. */
-				
-				MPIU_Free(single_op);
-				*lock_queue_ptr = lock_queue->next;
-				MPIU_Free(lock_queue);
-				lock_queue = *lock_queue_ptr;
-				
-				if (requested_lock == MPI_LOCK_EXCLUSIVE)
-				    break;
-			    }
-			}
-			
-			else {
-			    /* send lock granted packet. */
-                            MPIDI_VC_t *vc;
-                            MPIDI_Comm_get_vc(win_ptr->comm_ptr, lock_queue->origin_rank, &vc);
-			    mpi_errno = 
-                                MPIDI_CH3I_Send_lock_granted_pkt(vc, win_ptr,
-								 lock_queue->source_win_handle);
-			    
-			    /* dequeue entry from lock queue */
-			    *lock_queue_ptr = lock_queue->next;
-			    MPIU_Free(lock_queue);
-			    lock_queue = *lock_queue_ptr;
+            lock_entry = (MPIDI_Win_lock_queue *) win_ptr->lock_queue;
+            while (lock_entry) {
+                lock_entry_next = lock_entry->next;
+
+                MPIDI_CH3_PKT_RMA_GET_LOCK_TYPE(lock_entry->pkt, requested_lock, mpi_errno);
+                if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, requested_lock) == 1) {
+                    /* perform this OP */
+
+                    mpi_errno = perform_op_in_lock_queue(win_ptr, lock_entry);
+                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 			    
-			    /* if the granted lock is exclusive, 
-			       no need to continue */
-			    if (requested_lock == MPI_LOCK_EXCLUSIVE)
-				break;
-			}
-		    }
-		}
-		else {
-		    lock_queue_ptr = &(lock_queue->next);
-		    lock_queue = lock_queue->next;
-		}
+                    /* dequeue entry from lock queue */
+                    MPL_LL_DELETE(win_ptr->lock_queue, win_ptr->lock_queue_tail, lock_entry);
+                    MPIU_Free(lock_entry);
+
+                    /* if the granted lock is exclusive,
+                       no need to continue */
+                    if (requested_lock == MPI_LOCK_EXCLUSIVE)
+                        break;
+                }
+                lock_entry = lock_entry_next;
 	    }
 	} while (temp_entered_count != entered_count);
+
 	entered_count = entered_flag = 0;
     }
 
diff --git a/src/mpid/ch3/src/ch3u_handle_send_req.c b/src/mpid/ch3/src/ch3u_handle_send_req.c
index c3ede3f..281be1c 100644
--- a/src/mpid/ch3/src/ch3u_handle_send_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_send_req.c
@@ -52,6 +52,11 @@ int MPIDI_CH3_ReqHandler_GetSendComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
 
     MPID_Win_get_ptr(sreq->dev.target_win_handle, win_ptr);
 
+    if (sreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
+        mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        MPIDI_CH3_Progress_signal_completion();
+    }
     if (sreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER) {
         win_ptr->at_completion_counter--;
         MPIU_Assert(win_ptr->at_completion_counter >= 0);
@@ -104,6 +109,12 @@ int MPIDI_CH3_ReqHandler_GaccumLikeSendComplete( MPIDI_VC_t *vc,
     win_ptr->at_completion_counter--;
     MPIU_Assert(win_ptr->at_completion_counter >= 0);
 
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
+        mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        MPIDI_CH3_Progress_signal_completion();
+    }
+
     if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER) {
         win_ptr->at_completion_counter--;
         MPIU_Assert(win_ptr->at_completion_counter >= 0);
diff --git a/src/mpid/ch3/src/ch3u_request.c b/src/mpid/ch3/src/ch3u_request.c
index e229f2f..08a71da 100644
--- a/src/mpid/ch3/src/ch3u_request.c
+++ b/src/mpid/ch3/src/ch3u_request.c
@@ -82,7 +82,6 @@ MPID_Request * MPID_Request_create(void)
 	   request for RMA operations */
 	req->dev.target_win_handle = MPI_WIN_NULL;
 	req->dev.source_win_handle = MPI_WIN_NULL;
-	req->dev.lock_queue_entry  = NULL;
 	req->dev.dtype_info	   = NULL;
 	req->dev.dataloop	   = NULL;
 	req->dev.iov_offset        = 0;
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 377eb67..5a0b415 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -110,6 +110,7 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
         put_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
         put_pkt->source_win_handle = win_ptr->handle;
         put_pkt->immed_len = 0;
+        put_pkt->origin_rank = rank;
 
         /* FIXME: For contig and very short operations, use a streamlined op */
         new_ptr->origin_addr = (void *) origin_addr;
@@ -153,6 +154,11 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
                 /* copy data from origin buffer to immed area in packet header */
                 mpi_errno = immed_copy(src, dest, put_pkt->immed_len);
                 if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+                /* If all data is in pkt header, mark this op as a candidate
+                   for piggybacking LOCK. */
+                if (put_pkt->immed_len == len)
+                    new_ptr->piggyback_lock_candidate = 1;
             }
         }
 
@@ -268,6 +274,7 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
         get_pkt->dataloop_size = 0;
         get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
         get_pkt->source_win_handle = win_ptr->handle;
+        get_pkt->origin_rank = rank;
 
         /* FIXME: For contig and very short operations, use a streamlined op */
         new_ptr->origin_addr = origin_addr;
@@ -292,6 +299,10 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
             new_ptr->is_dt = 1;
         }
 
+        if (!new_ptr->is_dt) {
+            new_ptr->piggyback_lock_candidate = 1;
+        }
+
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
@@ -442,6 +453,7 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
         accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
         accum_pkt->source_win_handle = win_ptr->handle;
         accum_pkt->immed_len = 0;
+        accum_pkt->origin_rank = rank;
 
         new_ptr->origin_addr = (void *) origin_addr;
         new_ptr->origin_count = origin_count;
@@ -484,6 +496,11 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                 /* copy data from origin buffer to immed area in packet header */
                 mpi_errno = immed_copy(src, dest, accum_pkt->immed_len);
                 if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+                /* If all data is in pkt header, mark this op as
+                   a candidate for piggybacking LOCK. */
+                if (accum_pkt->immed_len == len)
+                    new_ptr->piggyback_lock_candidate = 1;
             }
         }
 
@@ -607,6 +624,7 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
             get_pkt->dataloop_size = 0;
             get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
             get_pkt->source_win_handle = win_ptr->handle;
+            get_pkt->origin_rank = rank;
 
             new_ptr->origin_addr = result_addr;
             new_ptr->origin_count = result_count;
@@ -623,6 +641,10 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
                 MPID_Datatype_add_ref(dtp);
                 new_ptr->is_dt = 1;
             }
+
+            if (!new_ptr->is_dt) {
+                new_ptr->piggyback_lock_candidate = 1;
+            }
         }
 
         else {
@@ -637,6 +659,7 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
             get_accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
             get_accum_pkt->source_win_handle = win_ptr->handle;
             get_accum_pkt->immed_len = 0;
+            get_accum_pkt->origin_rank = rank;
 
             new_ptr->origin_addr = (void *) origin_addr;
             new_ptr->origin_count = origin_count;
@@ -683,6 +706,11 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
                     /* copy data from origin buffer to immed area in packet header */
                     mpi_errno = immed_copy(src, dest, get_accum_pkt->immed_len);
                     if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+                    /* If all data is in pkt header, mark this op as a candidate
+                       for piggybacking LOCK. */
+                    if (get_accum_pkt->immed_len == len)
+                        new_ptr->piggyback_lock_candidate = 1;
                 }
             }
         }
@@ -797,6 +825,7 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
         cas_pkt->datatype = datatype;
         cas_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
         cas_pkt->source_win_handle = win_ptr->handle;
+        cas_pkt->origin_rank = rank;
 
         new_ptr->origin_addr = (void *) origin_addr;
         new_ptr->origin_count = 1;
@@ -806,6 +835,7 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
         new_ptr->compare_addr = (void *) compare_addr;
         new_ptr->compare_datatype = datatype;
         new_ptr->target_rank = target_rank;
+        new_ptr->piggyback_lock_candidate = 1; /* CAS is always able to piggyback LOCK */
 
         mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
         if (mpi_errno)
@@ -918,11 +948,13 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             get_pkt->dataloop_size = 0;
             get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
             get_pkt->source_win_handle = win_ptr->handle;
+            get_pkt->origin_rank = rank;
 
             new_ptr->origin_addr = result_addr;
             new_ptr->origin_count = 1;
             new_ptr->origin_datatype = datatype;
             new_ptr->target_rank = target_rank;
+            new_ptr->piggyback_lock_candidate = 1;
         }
         else {
             MPIDI_CH3_Pkt_fop_t *fop_pkt = &(new_ptr->pkt.fop);
@@ -937,6 +969,7 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             fop_pkt->source_win_handle = win_ptr->handle;
             fop_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
             fop_pkt->immed_len = 0;
+            fop_pkt->origin_rank = rank;
 
             new_ptr->origin_addr = (void *) origin_addr;
             new_ptr->origin_count = 1;
@@ -944,6 +977,7 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             new_ptr->result_addr = result_addr;
             new_ptr->result_datatype = datatype;
             new_ptr->target_rank = target_rank;
+            new_ptr->piggyback_lock_candidate = 1;
 
             MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
             /* length of origin data */
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 4712f4d..2829d93 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -29,6 +29,7 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     char *data_buf = NULL;
     MPIDI_msg_sz_t data_len;
     MPID_Win *win_ptr;
+    int acquire_lock_fail = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_PUT);
 
@@ -43,6 +44,15 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
     data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
+    mpi_errno = check_piggyback_lock(win_ptr, pkt, &acquire_lock_fail);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    if (acquire_lock_fail) {
+        (*buflen) = sizeof(MPIDI_CH3_Pkt_t);
+        (*rreqp) = NULL;
+        goto fn_exit;
+    }
+
     req = MPID_Request_create();
     MPIU_Object_set_ref(req, 1);
 
@@ -187,6 +197,7 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPID_Win *win_ptr;
     int mpi_errno = MPI_SUCCESS;
     MPI_Aint type_size;
+    int acquire_lock_fail = 0;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_GET);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_GET);
@@ -197,6 +208,15 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPID_Win_get_ptr(get_pkt->target_win_handle, win_ptr);
     mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, get_pkt->flags);
 
+    mpi_errno = check_piggyback_lock(win_ptr, pkt, &acquire_lock_fail);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    if (acquire_lock_fail) {
+        (*buflen) = sizeof(MPIDI_CH3_Pkt_t);
+        (*rreqp) = NULL;
+        goto fn_exit;
+    }
+
     data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
     data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
@@ -222,8 +242,12 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         MPIDI_Pkt_init(get_resp_pkt, MPIDI_CH3_PKT_GET_RESP);
         get_resp_pkt->request_handle = get_pkt->request_handle;
         get_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
+        if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED)
+            get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
         if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
             get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+        if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)
+            get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK;
         get_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
         get_resp_pkt->source_win_handle = get_pkt->source_win_handle;
 
@@ -303,10 +327,11 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         }
 
     }
-  fn_fail:
+  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_GET);
     return mpi_errno;
-
+  fn_fail:
+    goto fn_exit;
 }
 
 #undef FUNCNAME
@@ -324,6 +349,7 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     char *data_buf = NULL;
     MPIDI_msg_sz_t data_len;
     MPID_Win *win_ptr;
+    int acquire_lock_fail = 0;
     int mpi_errno = MPI_SUCCESS;
     MPI_Aint type_size;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE);
@@ -336,6 +362,15 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPID_Win_get_ptr(accum_pkt->target_win_handle, win_ptr);
     mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, accum_pkt->flags);
 
+    mpi_errno = check_piggyback_lock(win_ptr, pkt, &acquire_lock_fail);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    if (acquire_lock_fail) {
+        (*buflen) = sizeof(MPIDI_CH3_Pkt_t);
+        (*rreqp) = NULL;
+        goto fn_exit;
+    }
+
     data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
     data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
@@ -490,6 +525,7 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     char *data_buf = NULL;
     MPIDI_msg_sz_t data_len;
     MPID_Win *win_ptr;
+    int acquire_lock_fail = 0;
     int mpi_errno = MPI_SUCCESS;
     MPI_Aint type_size;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_GETACCUMULATE);
@@ -502,6 +538,15 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPID_Win_get_ptr(get_accum_pkt->target_win_handle, win_ptr);
     mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, get_accum_pkt->flags);
 
+    mpi_errno = check_piggyback_lock(win_ptr, pkt, &acquire_lock_fail);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    if (acquire_lock_fail) {
+        (*buflen) = sizeof(MPIDI_CH3_Pkt_t);
+        (*rreqp) = NULL;
+        goto fn_exit;
+    }
+
     data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
     data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
@@ -733,6 +778,7 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPID_Win *win_ptr;
     MPID_Request *req;
     MPI_Aint len;
+    int acquire_lock_fail = 0;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_CAS);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_CAS);
@@ -748,13 +794,23 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     *buflen = sizeof(MPIDI_CH3_Pkt_t);
     *rreqp = NULL;
 
+    mpi_errno = check_piggyback_lock(win_ptr, pkt, &acquire_lock_fail);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    if (acquire_lock_fail)
+        goto fn_exit;
+
     MPIDI_Pkt_init(cas_resp_pkt, MPIDI_CH3_PKT_CAS_RESP);
     cas_resp_pkt->request_handle = cas_pkt->request_handle;
     cas_resp_pkt->source_win_handle = cas_pkt->source_win_handle;
     cas_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
     cas_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
+    if (cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED)
+        cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
     if (cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
         cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+    if (cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)
+        cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK;
 
     /* Copy old value into the response packet */
     MPID_Datatype_get_size_macro(cas_pkt->datatype, len);
@@ -799,6 +855,11 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             MPID_Request_release(req);
     }
 
+    if (cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
+        mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        MPIDI_CH3_Progress_signal_completion();
+    }
     if (cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER) {
         win_ptr->at_completion_counter--;
         MPIU_Assert(win_ptr->at_completion_counter >= 0);
@@ -837,6 +898,7 @@ int MPIDI_CH3_PktHandler_CASResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     MPID_Request *req;
     MPI_Aint len;
     MPID_Win *win_ptr;
+    int target_rank = cas_resp_pkt->target_rank;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_CASRESP);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_CASRESP);
@@ -846,8 +908,15 @@ int MPIDI_CH3_PktHandler_CASResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     MPID_Win_get_ptr(cas_resp_pkt->source_win_handle, win_ptr);
 
     /* decrement ack_counter on this target */
+    if (cas_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED) {
+        mpi_errno = set_lock_sync_counter(win_ptr, target_rank);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
     if (cas_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
-        int target_rank = cas_resp_pkt->target_rank;
+        mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
+    if (cas_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK) {
         mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
     }
@@ -881,6 +950,7 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_fop_resp_t *fop_resp_pkt = &upkt.fop_resp;
     MPID_Request *resp_req = NULL;
+    int acquire_lock_fail = 0;
     MPID_Win *win_ptr = NULL;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_FOP);
 
@@ -893,13 +963,24 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     (*buflen) = sizeof(MPIDI_CH3_Pkt_t);
     (*rreqp) = NULL;
 
+    mpi_errno = check_piggyback_lock(win_ptr, pkt, &acquire_lock_fail);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    if (acquire_lock_fail) {
+        goto fn_exit;
+    }
+
     MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP);
     fop_resp_pkt->request_handle = fop_pkt->request_handle;
     fop_resp_pkt->source_win_handle = fop_pkt->source_win_handle;
     fop_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
     fop_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
+    if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED)
+        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
     if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
         fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+    if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK)
+        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK;
     fop_resp_pkt->immed_len = fop_pkt->immed_len;
 
     /* copy data to resp pkt header */
@@ -979,6 +1060,7 @@ int MPIDI_CH3_PktHandler_FOPResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     MPIDI_CH3_Pkt_fop_resp_t *fop_resp_pkt = &pkt->fop_resp;
     MPID_Request *req = NULL;
     MPID_Win *win_ptr = NULL;
+    int target_rank = fop_resp_pkt->target_rank;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_FOPRESP);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_FOPRESP);
@@ -992,8 +1074,15 @@ int MPIDI_CH3_PktHandler_FOPResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     MPIU_Memcpy(req->dev.user_buf, fop_resp_pkt->data, fop_resp_pkt->immed_len);
 
     /* decrement ack_counter */
+    if (fop_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED) {
+        mpi_errno = set_lock_sync_counter(win_ptr, target_rank);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
     if (fop_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
-        int target_rank = fop_resp_pkt->target_rank;
+        mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+    if (fop_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK) {
         mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     }
@@ -1027,6 +1116,7 @@ int MPIDI_CH3_PktHandler_Get_AccumResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     int mpi_errno = MPI_SUCCESS;
     MPI_Aint type_size;
     MPID_Win *win_ptr;
+    int target_rank = get_accum_resp_pkt->target_rank;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_GET_ACCUM_RESP);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_GET_ACCUM_RESP);
@@ -1036,8 +1126,15 @@ int MPIDI_CH3_PktHandler_Get_AccumResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPID_Win_get_ptr(get_accum_resp_pkt->source_win_handle, win_ptr);
 
     /* decrement ack_counter on target */
+    if (get_accum_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED) {
+        mpi_errno = set_lock_sync_counter(win_ptr, target_rank);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
     if (get_accum_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
-        int target_rank = get_accum_resp_pkt->target_rank;
+        mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+    if (get_accum_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK) {
         mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     }
@@ -1096,37 +1193,8 @@ int MPIDI_CH3_PktHandler_Lock(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     }
 
     else {
-        /* queue the lock information */
-        MPIDI_Win_lock_queue *curr_ptr, *prev_ptr, *new_ptr;
-
-        /* Note: This code is reached by the fechandadd rma tests */
-        /* FIXME: MT: This may need to be done atomically. */
-
-        /* FIXME: Since we need to add to the tail of the list,
-         * we should maintain a tail pointer rather than traversing the
-         * list each time to find the tail. */
-        curr_ptr = (MPIDI_Win_lock_queue *) win_ptr->lock_queue;
-        prev_ptr = curr_ptr;
-        while (curr_ptr != NULL) {
-            prev_ptr = curr_ptr;
-            curr_ptr = curr_ptr->next;
-        }
-
-        new_ptr = (MPIDI_Win_lock_queue *) MPIU_Malloc(sizeof(MPIDI_Win_lock_queue));
-        if (!new_ptr) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                 "MPIDI_Win_lock_queue");
-        }
-        if (prev_ptr != NULL)
-            prev_ptr->next = new_ptr;
-        else
-            win_ptr->lock_queue = new_ptr;
-
-        new_ptr->next = NULL;
-        new_ptr->lock_type = lock_pkt->lock_type;
-        new_ptr->source_win_handle = lock_pkt->source_win_handle;
-        new_ptr->origin_rank = lock_pkt->origin_rank;
-        new_ptr->pt_single_op = NULL;
+        mpi_errno = enqueue_lock_origin(win_ptr, pkt);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
     }
 
     *rreqp = NULL;
@@ -1514,6 +1582,7 @@ int MPIDI_CH3_PktHandler_GetResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     int mpi_errno = MPI_SUCCESS;
     MPI_Aint type_size;
     MPID_Win *win_ptr;
+    int target_rank = get_resp_pkt->target_rank;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_GETRESP);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_GETRESP);
@@ -1523,8 +1592,15 @@ int MPIDI_CH3_PktHandler_GetResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     MPID_Win_get_ptr(get_resp_pkt->source_win_handle, win_ptr);
 
     /* decrement ack_counter on target */
+    if (get_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED) {
+        mpi_errno = set_lock_sync_counter(win_ptr, target_rank);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
     if (get_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
-        int target_rank = get_resp_pkt->target_rank;
+        mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+    if (get_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_ACK) {
         mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     }
@@ -1564,6 +1640,8 @@ int MPIDI_CH3_PktHandler_LockGranted(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 {
     MPIDI_CH3_Pkt_lock_granted_t *lock_granted_pkt = &pkt->lock_granted;
     MPID_Win *win_ptr = NULL;
+    int target_rank = lock_granted_pkt->target_rank;
+    int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKGRANTED);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKGRANTED);
@@ -1576,11 +1654,17 @@ int MPIDI_CH3_PktHandler_LockGranted(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     /* set the remote_lock_state flag in the window */
     win_ptr->targets[lock_granted_pkt->target_rank].remote_lock_state = MPIDI_CH3_WIN_LOCK_GRANTED;
 
+    mpi_errno = set_lock_sync_counter(win_ptr, target_rank);
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
     *rreqp = NULL;
     MPIDI_CH3_Progress_signal_completion();
 
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKGRANTED);
+ fn_exit:
     return MPI_SUCCESS;
+ fn_fail:
+    goto fn_exit;
 }
 
 #undef FUNCNAME
@@ -1604,6 +1688,11 @@ int MPIDI_CH3_PktHandler_FlushAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPID_Win_get_ptr(flush_ack_pkt->source_win_handle, win_ptr);
 
+    if (flush_ack_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED) {
+        mpi_errno = set_lock_sync_counter(win_ptr, target_rank);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+
     /* decrement ack_counter on target */
     mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
@@ -1684,6 +1773,10 @@ int MPIDI_CH3_PktHandler_Unlock(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
     MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
 
+    mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, MPIDI_CH3_PKT_FLAG_NONE,
+                                              unlock_pkt->source_win_handle);
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
     MPIDI_CH3_Progress_signal_completion();
 
   fn_exit:
@@ -1716,7 +1809,8 @@ int MPIDI_CH3_PktHandler_Flush(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPID_Win_get_ptr(flush_pkt->target_win_handle, win_ptr);
 
-    mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, flush_pkt->source_win_handle);
+    mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, MPIDI_CH3_PKT_FLAG_NONE,
+                                              flush_pkt->source_win_handle);
     if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
     /* This is a flush request packet */
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index e9c1ee4..56cd909 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -323,6 +323,7 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->current_lock_type = MPID_LOCK_NONE;
     (*win_ptr)->shared_lock_ref_cnt = 0;
     (*win_ptr)->lock_queue = NULL;
+    (*win_ptr)->lock_queue_tail = NULL;
     (*win_ptr)->epoch_state = MPIDI_EPOCH_NONE;
     (*win_ptr)->epoch_count = 0;
     (*win_ptr)->at_rma_ops_list = NULL;

http://git.mpich.org/mpich.git/commitdiff/7fbe72dd24c859380c77b5c58b63581f78188ec2

commit 7fbe72dd24c859380c77b5c58b63581f78188ec2
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Oct 28 19:43:15 2014 -0500

    Reset the start of the enum to 0.
    
    We must set the initial value of the enum to zero because some places
    determine the number of packet types by checking the ending type value.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index 4685e23..dad5b23 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -62,7 +62,7 @@ typedef union {
 /* FIXME: Having predefined names makes it harder to add new message types,
    such as different RMA types. */
 typedef enum {
-    MPIDI_CH3_PKT_EAGER_SEND = 42,
+    MPIDI_CH3_PKT_EAGER_SEND = 0,
 #if defined(USE_EAGER_SHORT)
     MPIDI_CH3_PKT_EAGERSHORT_SEND,
 #endif /* defined(USE_EAGER_SHORT) */

http://git.mpich.org/mpich.git/commitdiff/be3e5bdd9bb8ff1f2aac42074f5b361c3025f50a

commit be3e5bdd9bb8ff1f2aac42074f5b361c3025f50a
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Oct 28 19:43:15 2014 -0500

    Rearrange enum of pkt types.
    
    Rearrange the ordering of packet types so that all RMA issuing types
    can be placed together. This is convenient when we check if currently
    involved packets are all RMA packets.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index 8f1fcf8..4685e23 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -61,9 +61,6 @@ typedef union {
  */
 /* FIXME: Having predefined names makes it harder to add new message types,
    such as different RMA types. */
-/* We start with an arbitrarily chosen number (42), to help with
- * debugging when a packet type is not initialized or wrongly
- * initialized. */
 typedef enum {
     MPIDI_CH3_PKT_EAGER_SEND = 42,
 #if defined(USE_EAGER_SHORT)
@@ -77,10 +74,18 @@ typedef enum {
     MPIDI_CH3_PKT_RNDV_SEND,    /* FIXME: should be stream put */
     MPIDI_CH3_PKT_CANCEL_SEND_REQ,
     MPIDI_CH3_PKT_CANCEL_SEND_RESP,
-    MPIDI_CH3_PKT_PUT,  /* RMA Packets begin here */
+    /* RMA Packets begin here */
+    MPIDI_CH3_PKT_PUT,
     MPIDI_CH3_PKT_GET,
-    MPIDI_CH3_PKT_GET_RESP,
     MPIDI_CH3_PKT_ACCUMULATE,
+    MPIDI_CH3_PKT_GET_ACCUM,
+    MPIDI_CH3_PKT_ACCUM_IMMED,  /* optimization for short accumulate */
+    MPIDI_CH3_PKT_FOP,
+    MPIDI_CH3_PKT_CAS,
+    MPIDI_CH3_PKT_GET_RESP,
+    MPIDI_CH3_PKT_GET_ACCUM_RESP,
+    MPIDI_CH3_PKT_FOP_RESP,
+    MPIDI_CH3_PKT_CAS_RESP,
     MPIDI_CH3_PKT_LOCK,
     MPIDI_CH3_PKT_LOCK_GRANTED,
     MPIDI_CH3_PKT_UNLOCK,
@@ -91,14 +96,6 @@ typedef enum {
     MPIDI_CH3_PKT_LOCK_GET_UNLOCK,      /* optimization for single gets */
     MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK,    /* optimization for single accumulates */
     /* RMA Packets end here */
-    MPIDI_CH3_PKT_ACCUM_IMMED,  /* optimization for short accumulate */
-    /* FIXME: Add PUT, GET_IMMED packet types */
-    MPIDI_CH3_PKT_CAS,
-    MPIDI_CH3_PKT_CAS_RESP,
-    MPIDI_CH3_PKT_FOP,
-    MPIDI_CH3_PKT_FOP_RESP,
-    MPIDI_CH3_PKT_GET_ACCUM,
-    MPIDI_CH3_PKT_GET_ACCUM_RESP,
     MPIDI_CH3_PKT_FLOW_CNTL_UPDATE,     /* FIXME: Unused */
     MPIDI_CH3_PKT_CLOSE,
     MPIDI_CH3_PKT_REVOKE,

http://git.mpich.org/mpich.git/commitdiff/a42b916df8b9ee3b06cfdf05d252334e474abaa4

commit a42b916df8b9ee3b06cfdf05d252334e474abaa4
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sat Nov 1 19:07:57 2014 -0500

    Simplify PktHandler_FOP and PktHandler_FOPResp.
    
    For the FOP operation, all data fits into the packet
    header, so on the origin side we do not need to send separate
    data packets, and on the target side we do not need a request
    handler; only the packet handler is needed. Similarly, for the FOP
    response packet, we can receive all data in the FOP resp packet
    handler. This patch deletes the request handler on the target
    side and simplifies the packet handler on the target / origin side.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index 2e7647e..2755266 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -1909,8 +1909,6 @@ int MPIDI_CH3_ReqHandler_SinglePutAccumComplete( MPIDI_VC_t *, MPID_Request *,
 						 int * );
 int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *,
 						   MPID_Request *, int * );
-int MPIDI_CH3_ReqHandler_FOPComplete( MPIDI_VC_t *, MPID_Request *, int * );
-
 /* Send Handlers */
 int MPIDI_CH3_ReqHandler_SendReloadIOV( MPIDI_VC_t *vc, MPID_Request *sreq, 
 					int *complete );
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 67e97a1..bae961b 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -642,161 +642,6 @@ int MPIDI_CH3_ReqHandler_SinglePutAccumComplete( MPIDI_VC_t *vc,
 
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_ReqHandler_FOPComplete
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_FOPComplete( MPIDI_VC_t *vc, 
-                                      MPID_Request *rreq, int *complete )
-{
-    int mpi_errno = MPI_SUCCESS;
-    MPIDI_CH3_Pkt_t upkt;
-    MPIDI_CH3_Pkt_fop_resp_t *fop_resp_pkt = &upkt.fop_resp;
-    MPID_Request *resp_req;
-    MPID_Win *win_ptr;
-    MPI_User_function *uop;
-    MPI_Aint len;
-    int one;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_FOPCOMPLETE);
-
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_FOPCOMPLETE);
-    MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"performing FOP operation");
-
-    MPID_Datatype_get_size_macro(rreq->dev.datatype, len);
-
-    MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP);
-    fop_resp_pkt->request_handle = rreq->dev.request_handle;
-
-    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
-    fop_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
-    fop_resp_pkt->source_win_handle = rreq->dev.source_win_handle;
-    fop_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
-    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
-        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
-
-    /* Copy original data into the send buffer.  If data will fit in the
-       header, use that.  Otherwise allocate a temporary buffer.  */
-    if (len <= sizeof(fop_resp_pkt->data)) {
-        MPIU_Memcpy( fop_resp_pkt->data, rreq->dev.real_user_buf, len );
-    }
-    else {
-        resp_req = MPID_Request_create();
-        MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
-        MPIU_Object_set_ref(resp_req, 1);
-
-        resp_req->dev.target_win_handle = rreq->dev.target_win_handle;
-        resp_req->dev.flags = rreq->dev.flags;
-        resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumLikeSendComplete;
-
-        /* here we increment the Active Target counter to guarantee the GET-like
-           operation are completed when counter reaches zero. */
-        win_ptr->at_completion_counter++;
-
-        MPIDI_CH3U_SRBuf_alloc(resp_req, len);
-        MPIU_ERR_CHKANDJUMP(resp_req->dev.tmpbuf_sz < len, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
-        MPIU_Memcpy( resp_req->dev.tmpbuf, rreq->dev.real_user_buf, len );
-    }
-
-    /* Apply the op */
-    if (rreq->dev.op != MPI_NO_OP) {
-        uop = MPIR_OP_HDL_TO_FN(rreq->dev.op);
-        one = 1;
-
-        if (win_ptr->shm_allocated == TRUE)
-            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-        (*uop)(rreq->dev.user_buf, rreq->dev.real_user_buf, &one, &rreq->dev.datatype);
-        if (win_ptr->shm_allocated == TRUE)
-            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-    }
-
-    /* Send back the original data.  We do this here to ensure that the
-       operation is remote complete before responding to the origin. */
-    if (len <= sizeof(fop_resp_pkt->data)) {
-        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
-        mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_resp_pkt, sizeof(*fop_resp_pkt), &resp_req);
-        MPIU_THREAD_CS_EXIT(CH3COMM,vc);
-        MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-
-        if (resp_req != NULL) {
-            if (!MPID_Request_is_complete(resp_req)) {
-                /* sending process is not completed, set proper OnDataAvail
-                   (it is initialized to NULL by lower layer) */
-                resp_req->dev.target_win_handle = rreq->dev.target_win_handle;
-                resp_req->dev.flags = rreq->dev.flags;
-                resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumLikeSendComplete;
-
-                /* here we increment the Active Target counter to guarantee the GET-like
-                   operation are completed when counter reaches zero. */
-                win_ptr->at_completion_counter++;
-
-                MPID_Request_release(resp_req);
-                goto finish_up;
-            }
-            else {
-                MPID_Request_release(resp_req);
-            }
-        }
-
-        if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER) {
-            win_ptr->at_completion_counter--;
-            MPIU_Assert(win_ptr->at_completion_counter >= 0);
-            /* Signal the local process when the op counter reaches 0. */
-            if (win_ptr->at_completion_counter == 0)
-                MPIDI_CH3_Progress_signal_completion();
-        }
-
-        /* There are additional steps to take if this is a passive
-           target RMA or the last operation from the source */
-        mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
-                                                   rreq->dev.source_win_handle);
-        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-    }
-    else {
-        MPID_IOV iov[MPID_IOV_LIMIT];
-
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) fop_resp_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*fop_resp_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)resp_req->dev.tmpbuf;
-        iov[1].MPID_IOV_LEN = len;
-
-        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
-        mpi_errno = MPIDI_CH3_iSendv(vc, resp_req, iov, 2);
-        MPIU_THREAD_CS_EXIT(CH3COMM,vc);
-        MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-    }
-
- finish_up:
-    /* Free temporary buffer allocated in PktHandler_FOP */
-    if (len > sizeof(int) * MPIDI_RMA_FOP_IMMED_INTS && rreq->dev.op != MPI_NO_OP) {
-        MPIU_Free(rreq->dev.user_buf);
-        /* Assign user_buf to NULL so that reqHandler_GetAccumRespComplete()
-           will not try to free an empty buffer. */
-        rreq->dev.user_buf = NULL;
-    }
-    else {
-        /* FOP data fit in pkt header and user_buf just points to data area in pkt header
-           in pktHandler_FOP(), and it should be freed when pkt header is freed.
-           Here we assign user_buf to NULL so that reqHandler_GetAccumRespComplete()
-           will not try to free it. */
-        rreq->dev.user_buf = NULL;
-    }
-
-    *complete = 1;
-
- fn_exit:
-    MPID_Request_release(rreq);
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_FOPCOMPLETE);
-    return mpi_errno;
-    /* --BEGIN ERROR HANDLING-- */
- fn_fail:
-    if (resp_req != NULL) {
-        MPID_Request_release(resp_req);
-    }
-    goto fn_exit;
-    /* --END ERROR HANDLING-- */
-}
-
-
-#undef FUNCNAME
 #define FUNCNAME MPIDI_CH3_ReqHandler_UnpackUEBufComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 8daa90a..4712f4d 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -878,89 +878,90 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_CH3_Pkt_fop_t *fop_pkt = &pkt->fop;
-    MPID_Request *req;
-    MPID_Win *win_ptr;
-    int data_complete = 0;
-    MPI_Aint len;
-    MPIU_CHKPMEM_DECL(1);
+    MPIDI_CH3_Pkt_t upkt;
+    MPIDI_CH3_Pkt_fop_resp_t *fop_resp_pkt = &upkt.fop_resp;
+    MPID_Request *resp_req = NULL;
+    MPID_Win *win_ptr = NULL;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_FOP);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_FOP);
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received FOP pkt");
 
-    MPIU_Assert(fop_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(fop_pkt->target_win_handle, win_ptr);
-    mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, fop_pkt->flags);
-
-    req = MPID_Request_create();
-    MPIU_ERR_CHKANDJUMP(req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
-    MPIU_Object_set_ref(req, 1);        /* Ref is held by progress engine */
-    *rreqp = NULL;
-
-    req->dev.user_buf = NULL;   /* will be set later */
-    req->dev.user_count = 1;
-    req->dev.datatype = fop_pkt->datatype;
-    req->dev.op = fop_pkt->op;
-    req->dev.real_user_buf = fop_pkt->addr;
-    req->dev.target_win_handle = fop_pkt->target_win_handle;
-    req->dev.request_handle = fop_pkt->request_handle;
-    req->dev.flags = fop_pkt->flags;
-    /* fop_pkt->source_win_handle is set in MPIDI_Fetch_and_op,
-       here we pass it to receiving request, so that after receiving
-       is finished, we can pass it to sending back pkt. */
-    req->dev.source_win_handle = fop_pkt->source_win_handle;
 
-    MPID_Datatype_get_size_macro(req->dev.datatype, len);
-    MPIU_Assert(len <= sizeof(MPIDI_CH3_FOP_Immed_u));
+    (*buflen) = sizeof(MPIDI_CH3_Pkt_t);
+    (*rreqp) = NULL;
+
+    MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP);
+    fop_resp_pkt->request_handle = fop_pkt->request_handle;
+    fop_resp_pkt->source_win_handle = fop_pkt->source_win_handle;
+    fop_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
+    fop_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
+    if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
+        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+    fop_resp_pkt->immed_len = fop_pkt->immed_len;
+
+    /* copy data to resp pkt header */
+    void *src = fop_pkt->addr, *dest = fop_resp_pkt->data;
+    mpi_errno = immed_copy(src, dest, fop_resp_pkt->immed_len);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-    /* Set up the user buffer and receive data if needed */
-    if (len <= sizeof(fop_pkt->origin_data) || fop_pkt->op == MPI_NO_OP) {
-        req->dev.user_buf = fop_pkt->origin_data;
-        *buflen = sizeof(MPIDI_CH3_Pkt_t);
-        data_complete = 1;
+    /* Apply the op */
+    if (fop_pkt->op != MPI_NO_OP) {
+        MPI_User_function *uop = MPIR_OP_HDL_TO_FN(fop_pkt->op);
+        int one = 1;
+        if (win_ptr->shm_allocated == TRUE)
+            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
+        (*uop)(fop_pkt->data, fop_pkt->addr, &one, &(fop_pkt->datatype));
+        if (win_ptr->shm_allocated == TRUE)
+            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
     }
-    else {
-        /* Data won't fit in the header, allocate temp space and receive it */
-        MPIDI_msg_sz_t data_len;
-        void *data_buf;
 
-        data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
-        data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
-        req->dev.recv_data_sz = len;    /* count == 1 for FOP */
-
-        MPIU_CHKPMEM_MALLOC(req->dev.user_buf, void *, len, mpi_errno, "**nomemreq");
+    /* send back the original data */
+    MPIU_THREAD_CS_ENTER(CH3COMM,vc);
+    mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_resp_pkt, sizeof(*fop_resp_pkt), &resp_req);
+    MPIU_THREAD_CS_EXIT(CH3COMM,vc);
+    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
 
-        mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &data_complete);
-        MPIU_ERR_CHKANDJUMP1(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
-                             "**ch3|postrecv %s", "MPIDI_CH3_PKT_ACCUMULATE");
+    if (resp_req != NULL) {
+        if (!MPID_Request_is_complete(resp_req)) {
+            /* sending process is not completed, set proper OnDataAvail
+               (it is initialized to NULL by lower layer) */
+            resp_req->dev.target_win_handle = fop_pkt->target_win_handle;
+            resp_req->dev.flags = fop_pkt->flags;
+            resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumLikeSendComplete;
 
-        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_FOPComplete;
+            /* here we increment the Active Target counter to guarantee the GET-like
+               operation are completed when counter reaches zero. */
+            win_ptr->at_completion_counter++;
 
-        if (!data_complete) {
-            *rreqp = req;
+            MPID_Request_release(resp_req);
+            goto fn_exit;
+        }
+        else {
+            MPID_Request_release(resp_req);
         }
+    }
 
-        /* return the number of bytes processed in this function */
-        *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
+    if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
+        mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        MPIDI_CH3_Progress_signal_completion();
     }
 
-    if (data_complete) {
-        int fop_complete = 0;
-        mpi_errno = MPIDI_CH3_ReqHandler_FOPComplete(vc, req, &fop_complete);
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
-        }
-        *rreqp = NULL;
+    if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER) {
+        win_ptr->at_completion_counter--;
+        MPIU_Assert(win_ptr->at_completion_counter >= 0);
+        if (win_ptr->at_completion_counter == 0)
+            MPIDI_CH3_Progress_signal_completion();
     }
 
   fn_exit:
-    MPIU_CHKPMEM_COMMIT();
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_FOP);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
   fn_fail:
-    MPIU_CHKPMEM_REAP();
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -976,10 +977,8 @@ int MPIDI_CH3_PktHandler_FOPResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_CH3_Pkt_fop_resp_t *fop_resp_pkt = &pkt->fop_resp;
-    MPID_Request *req;
-    int complete = 0;
-    MPI_Aint len;
-    MPID_Win *win_ptr;
+    MPID_Request *req = NULL;
+    MPID_Win *win_ptr = NULL;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_FOPRESP);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_FOPRESP);
@@ -988,6 +987,10 @@ int MPIDI_CH3_PktHandler_FOPResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
 
     MPID_Win_get_ptr(fop_resp_pkt->source_win_handle, win_ptr);
 
+    /* Copy data to result buffer on origin */
+    MPID_Request_get_ptr(fop_resp_pkt->request_handle, req);
+    MPIU_Memcpy(req->dev.user_buf, fop_resp_pkt->data, fop_resp_pkt->immed_len);
+
     /* decrement ack_counter */
     if (fop_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
         int target_rank = fop_resp_pkt->target_rank;
@@ -995,36 +998,9 @@ int MPIDI_CH3_PktHandler_FOPResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     }
 
-    MPID_Request_get_ptr(fop_resp_pkt->request_handle, req);
-    MPID_Datatype_get_size_macro(req->dev.datatype, len);
-
-    if (len <= sizeof(fop_resp_pkt->data)) {
-        MPIU_Memcpy(req->dev.user_buf, (void *) fop_resp_pkt->data, len);
-        *buflen = sizeof(MPIDI_CH3_Pkt_t);
-        complete = 1;
-    }
-    else {
-        /* Data was too big to embed in the header */
-        MPIDI_msg_sz_t data_len;
-        void *data_buf;
-
-        data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
-        data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
-        req->dev.recv_data_sz = len;    /* count == 1 for FOP */
-        *rreqp = req;
-
-        mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
-        MPIU_ERR_CHKANDJUMP1(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER,
-                             "**ch3|postrecv", "**ch3|postrecv %s", "MPIDI_CH3_PKT_GET_RESP");
-
-        /* return the number of bytes processed in this function */
-        *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
-    }
-
-    if (complete) {
-        MPIDI_CH3U_Request_complete(req);
-        *rreqp = NULL;
-    }
+    MPIDI_CH3U_Request_complete(req);
+    *buflen = sizeof(MPIDI_CH3_Pkt_t);
+    *rreqp = NULL;
 
   fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_FOPRESP);

http://git.mpich.org/mpich.git/commitdiff/52c2fc112fe62c0280785c0c40f0885310875e41

commit 52c2fc112fe62c0280785c0c40f0885310875e41
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sat Nov 1 15:16:24 2014 -0500

    Simplify issuing functions at origin side.
    
    Here we extract the common code of different
    issuing functions at origin side and simplify
    those issuing functions.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
new file mode 100644
index 0000000..c310329
--- /dev/null
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -0,0 +1,874 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2014 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#if !defined(MPID_RMA_ISSUE_H_INCLUDED)
+#define MPID_RMA_ISSUE_H_INCLUDED
+
+#include "mpl_utlist.h"
+#include "mpid_rma_types.h"
+
+/* =========================================================== */
+/*                    auxiliary functions                      */
+/* =========================================================== */
+
+/* immed_copy() copies data from origin buffer to
+   IMMED packet header. */
+#undef FUNCNAME
+#define FUNCNAME immed_copy
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int immed_copy(void *src, void *dest, size_t len)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_IMMED_COPY);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_IMMED_COPY);
+
+    switch (len) {
+    case 1:
+        *(uint8_t *) dest = *(uint8_t *) src;
+        break;
+    case 2:
+        *(uint16_t *) dest = *(uint16_t *) src;
+        break;
+    case 4:
+        *(uint32_t *) dest = *(uint32_t *) src;
+        break;
+    case 8:
+        *(uint64_t *) dest = *(uint64_t *) src;
+        break;
+    default:
+        MPIU_Memcpy(dest, (void *) src, len);
+    }
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_IMMED_COPY);
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+
+/* fill_in_derived_dtp_info() fills derived datatype information
+   into RMA operation structure. */
+#undef FUNCNAME
+#define FUNCNAME fill_in_derived_dtp_info
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int fill_in_derived_dtp_info(MPIDI_RMA_Op_t *rma_op, MPID_Datatype *dtp)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIU_CHKPMEM_DECL(1);
+    MPIDI_STATE_DECL(MPID_STATE_FILL_IN_DERIVED_DTP_INFO);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_FILL_IN_DERIVED_DTP_INFO);
+
+    /* Derived datatype on target, fill derived datatype info. */
+    rma_op->dtype_info.is_contig = dtp->is_contig;
+    rma_op->dtype_info.max_contig_blocks = dtp->max_contig_blocks;
+    rma_op->dtype_info.size = dtp->size;
+    rma_op->dtype_info.extent = dtp->extent;
+    rma_op->dtype_info.dataloop_size = dtp->dataloop_size;
+    rma_op->dtype_info.dataloop_depth = dtp->dataloop_depth;
+    rma_op->dtype_info.eltype = dtp->eltype;
+    rma_op->dtype_info.dataloop = dtp->dataloop;
+    rma_op->dtype_info.ub = dtp->ub;
+    rma_op->dtype_info.lb = dtp->lb;
+    rma_op->dtype_info.true_ub = dtp->true_ub;
+    rma_op->dtype_info.true_lb = dtp->true_lb;
+    rma_op->dtype_info.has_sticky_ub = dtp->has_sticky_ub;
+    rma_op->dtype_info.has_sticky_lb = dtp->has_sticky_lb;
+
+    MPIU_CHKPMEM_MALLOC(rma_op->dataloop, void *, dtp->dataloop_size,
+                        mpi_errno, "dataloop");
+
+    MPIU_Memcpy(rma_op->dataloop, dtp->dataloop, dtp->dataloop_size);
+    /* The dataloop can have undefined padding sections, so we need to let
+     * valgrind know that it is OK to pass this data to writev later on. */
+    MPL_VG_MAKE_MEM_DEFINED(rma_op->dataloop, dtp->dataloop_size);
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_FILL_IN_DERIVED_DTP_INFO);
+    MPIU_CHKPMEM_COMMIT();
+    return mpi_errno;
+ fn_fail:
+    MPIU_CHKPMEM_REAP();
+    goto fn_exit;
+}
+
+
+/* create_datatype() creates a new struct datatype for the dtype_info
+   and the dataloop of the target datatype together with the user data */
+#undef FUNCNAME
+#define FUNCNAME create_datatype
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int create_datatype(const MPIDI_RMA_dtype_info * dtype_info,
+                           const void *dataloop, MPI_Aint dataloop_sz,
+                           const void *o_addr, int o_count, MPI_Datatype o_datatype,
+                           MPID_Datatype ** combined_dtp)
+{
+    int mpi_errno = MPI_SUCCESS;
+    /* datatype_set_contents wants an array 'ints' which is the
+     * blocklens array with count prepended to it.  So blocklens
+     * points to the 2nd element of ints to avoid having to copy
+     * blocklens into ints later. */
+    int ints[4];
+    int *blocklens = &ints[1];
+    MPI_Aint displaces[3];
+    MPI_Datatype datatypes[3];
+    const int count = 3;
+    MPI_Datatype combined_datatype;
+    MPIDI_STATE_DECL(MPID_STATE_CREATE_DATATYPE);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_CREATE_DATATYPE);
+
+    /* create datatype */
+    displaces[0] = MPIU_PtrToAint(dtype_info);
+    blocklens[0] = sizeof(*dtype_info);
+    datatypes[0] = MPI_BYTE;
+
+    displaces[1] = MPIU_PtrToAint(dataloop);
+    MPIU_Assign_trunc(blocklens[1], dataloop_sz, int);
+    datatypes[1] = MPI_BYTE;
+
+    displaces[2] = MPIU_PtrToAint(o_addr);
+    blocklens[2] = o_count;
+    datatypes[2] = o_datatype;
+
+    mpi_errno = MPID_Type_struct(count, blocklens, displaces, datatypes, &combined_datatype);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
+
+    ints[0] = count;
+
+    MPID_Datatype_get_ptr(combined_datatype, *combined_dtp);
+    mpi_errno = MPID_Datatype_set_contents(*combined_dtp, MPI_COMBINER_STRUCT,
+                                           count + 1,       /* ints (cnt,blklen) */
+                                           count,       /* aints (disps) */
+                                           count,       /* types */
+                                           ints, displaces, datatypes);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
+
+    /* Commit datatype */
+
+    MPID_Dataloop_create(combined_datatype,
+                         &(*combined_dtp)->dataloop,
+                         &(*combined_dtp)->dataloop_size,
+                         &(*combined_dtp)->dataloop_depth, MPID_DATALOOP_HOMOGENEOUS);
+
+    /* create heterogeneous dataloop */
+    MPID_Dataloop_create(combined_datatype,
+                         &(*combined_dtp)->hetero_dloop,
+                         &(*combined_dtp)->hetero_dloop_size,
+                         &(*combined_dtp)->hetero_dloop_depth, MPID_DATALOOP_HETEROGENEOUS);
+
+  fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_CREATE_DATATYPE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+/* =========================================================== */
+/*                      issuing functions                      */
+/* =========================================================== */
+
+/* issue_from_origin_buffer() issues data from origin
+   buffer (i.e. non-IMMED operation). */
+#undef FUNCNAME
+#define FUNCNAME issue_from_origin_buffer
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int issue_from_origin_buffer(MPIDI_RMA_Op_t *rma_op, MPID_IOV *iov, MPIDI_VC_t *vc)
+{
+    MPI_Aint origin_type_size;
+    MPI_Datatype target_datatype;
+    MPID_Datatype *target_dtp = NULL, *origin_dtp = NULL;
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_ISSUE_FROM_ORIGIN_BUFFER);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_ISSUE_FROM_ORIGIN_BUFFER);
+
+    /* Judge if target datatype is derived datatype. */
+    MPIDI_CH3_PKT_RMA_GET_TARGET_DATATYPE(rma_op->pkt, target_datatype, mpi_errno);
+    if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
+        MPID_Datatype_get_ptr(target_datatype, target_dtp);
+
+        /* Fill derived datatype info. */
+        mpi_errno = fill_in_derived_dtp_info(rma_op, target_dtp);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        /* Set dataloop size in pkt header */
+        MPIDI_CH3_PKT_RMA_SET_DATALOOP_SIZE(rma_op->pkt, target_dtp->dataloop_size, mpi_errno);
+    }
+
+    /* Judge if origin datatype is derived datatype. */
+    if (!MPIR_DATATYPE_IS_PREDEFINED(rma_op->origin_datatype)) {
+        MPID_Datatype_get_ptr(rma_op->origin_datatype, origin_dtp);
+    }
+
+    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
+
+    if (target_dtp == NULL) {
+        /* basic datatype on target */
+        if (origin_dtp == NULL) {
+            /* basic datatype on origin */
+            int iovcnt = 2;
+            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+            mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, iovcnt, &rma_op->request);
+            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+        }
+        else {
+            /* derived datatype on origin */
+            rma_op->request = MPID_Request_create();
+            MPIU_ERR_CHKANDJUMP(rma_op->request == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+
+            MPIU_Object_set_ref(rma_op->request, 2);
+            rma_op->request->kind = MPID_REQUEST_SEND;
+
+            rma_op->request->dev.segment_ptr = MPID_Segment_alloc();
+            MPIU_ERR_CHKANDJUMP1(rma_op->request->dev.segment_ptr == NULL, mpi_errno,
+                                 MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
+
+            rma_op->request->dev.datatype_ptr = origin_dtp;
+            /* this will cause the datatype to be freed when the request
+             * is freed. */
+            MPID_Segment_init(rma_op->origin_addr, rma_op->origin_count,
+                              rma_op->origin_datatype, rma_op->request->dev.segment_ptr, 0);
+            rma_op->request->dev.segment_first = 0;
+            rma_op->request->dev.segment_size = rma_op->origin_count * origin_type_size;
+
+            rma_op->request->dev.OnFinal = 0;
+            rma_op->request->dev.OnDataAvail = 0;
+
+            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+            mpi_errno = vc->sendNoncontig_fn(vc, rma_op->request,
+                                             iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
+            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+        }
+    }
+    else {
+        /* derived datatype on target */
+        MPID_Datatype *combined_dtp = NULL;
+
+        rma_op->request = MPID_Request_create();
+        if (rma_op->request == NULL) {
+            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+        }
+
+        MPIU_Object_set_ref(rma_op->request, 2);
+        rma_op->request->kind = MPID_REQUEST_SEND;
+
+        rma_op->request->dev.segment_ptr = MPID_Segment_alloc();
+        MPIU_ERR_CHKANDJUMP1(rma_op->request->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER,
+                             "**nomem", "**nomem %s", "MPID_Segment_alloc");
+
+        /* create a new datatype containing the dtype_info, dataloop, and origin data */
+
+        mpi_errno = create_datatype(&rma_op->dtype_info, rma_op->dataloop,
+                                    target_dtp->dataloop_size,
+                                    rma_op->origin_addr, rma_op->origin_count,
+                                    rma_op->origin_datatype,
+                                    &combined_dtp);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
+        rma_op->request->dev.datatype_ptr = combined_dtp;
+        /* combined_datatype will be freed when request is freed */
+
+        MPID_Segment_init(MPI_BOTTOM, 1, combined_dtp->handle, rma_op->request->dev.segment_ptr, 0);
+        rma_op->request->dev.segment_first = 0;
+        rma_op->request->dev.segment_size = combined_dtp->size;
+
+        rma_op->request->dev.OnFinal = 0;
+        rma_op->request->dev.OnDataAvail = 0;
+
+        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+        mpi_errno = vc->sendNoncontig_fn(vc, rma_op->request,
+                                         iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
+        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
+        /* we're done with the datatypes */
+        if (origin_dtp != NULL)
+            MPID_Datatype_release(origin_dtp);
+        MPID_Datatype_release(target_dtp);
+    }
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_ISSUE_FROM_ORIGIN_BUFFER);
+    return mpi_errno;
+ fn_fail:
+    if (rma_op->request) {
+        if (rma_op->request->dev.datatype_ptr)
+            MPID_Datatype_release(rma_op->request->dev.datatype_ptr);
+        MPID_Request_release(rma_op->request);
+    }
+    rma_op->request = NULL;
+    goto fn_exit;
+}
+
+
+/* issue_put_op() issues PUT packet header and data. */
+#undef FUNCNAME
+#define FUNCNAME issue_put_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win *win_ptr,
+                        MPIDI_RMA_Target_t *target_ptr,
+                        MPIDI_CH3_Pkt_flags_t flags)
+{
+    MPIDI_VC_t *vc = NULL;
+    size_t len;
+    MPI_Aint origin_type_size;
+    MPID_Comm *comm_ptr = win_ptr->comm_ptr;
+    MPIDI_CH3_Pkt_put_t *put_pkt = &rma_op->pkt.put;
+    MPID_IOV iov[MPID_IOV_LIMIT];
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_ISSUE_PUT_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_PUT_OP);
+
+    rma_op->request = NULL;
+
+    put_pkt->flags = flags;
+    if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
+        put_pkt->lock_type = target_ptr->lock_type;
+
+    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
+
+    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
+    MPIU_Assign_trunc(len, rma_op->origin_count * origin_type_size, size_t);
+
+    if (len == put_pkt->immed_len) {
+        /* All origin data is in packet header, issue the header. */
+        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+        mpi_errno = MPIDI_CH3_iStartMsg(vc, put_pkt, sizeof(*put_pkt), &(rma_op->request));
+        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+    }
+    else {
+        /* We still need to issue from origin buffer. */
+        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) put_pkt;
+        iov[0].MPID_IOV_LEN = sizeof(*put_pkt);
+        if (!rma_op->is_dt) {
+            iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)rma_op->origin_addr + put_pkt->immed_len);
+            iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size - put_pkt->immed_len;
+        }
+
+        mpi_errno = issue_from_origin_buffer(rma_op, iov, vc);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_PUT_OP);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+  fn_fail:
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+
+/* issue_acc_op() sends ACC packet header and data. */
+#undef FUNCNAME
+#define FUNCNAME issue_acc_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int issue_acc_op(MPIDI_RMA_Op_t *rma_op, MPID_Win *win_ptr,
+                        MPIDI_RMA_Target_t *target_ptr,
+                        MPIDI_CH3_Pkt_flags_t flags)
+{
+    MPIDI_VC_t *vc = NULL;
+    size_t len;
+    MPI_Aint origin_type_size;
+    MPID_Comm *comm_ptr = win_ptr->comm_ptr;
+    MPIDI_CH3_Pkt_accum_t *accum_pkt = &rma_op->pkt.accum;
+    MPID_IOV iov[MPID_IOV_LIMIT];
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_ISSUE_ACC_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_ACC_OP);
+
+    rma_op->request = NULL;
+
+    accum_pkt->flags = flags;
+    if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
+        accum_pkt->lock_type = target_ptr->lock_type;
+
+    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
+
+    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
+    MPIU_Assign_trunc(len, rma_op->origin_count * origin_type_size, size_t);
+
+    if (len == accum_pkt->immed_len) {
+        /* All origin data is in packet header, issue the header. */
+        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+        mpi_errno = MPIDI_CH3_iStartMsg(vc, accum_pkt, sizeof(*accum_pkt), &(rma_op->request));
+        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+    }
+    else {
+        /* We still need to issue from origin buffer. */
+        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) accum_pkt;
+        iov[0].MPID_IOV_LEN = sizeof(*accum_pkt);
+        if (!rma_op->is_dt) {
+            iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)rma_op->origin_addr + accum_pkt->immed_len);
+            iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size - accum_pkt->immed_len;
+        }
+
+        mpi_errno = issue_from_origin_buffer(rma_op, iov, vc);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
+
+ fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_ACC_OP);
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+
+/* issue_get_acc_op() sends a GACC packet header and data.
+ *
+ * A response request is created first: it records the result buffer, count
+ * and datatype of the operation, and its handle is placed in the packet.
+ * The packet is then issued either on its own (when immed_len equals the
+ * total origin data size) or through issue_from_origin_buffer().  On success
+ * rma_op->request refers to the response request; whatever request the send
+ * left in rma_op->request is released before returning.
+ *
+ * flags is recorded in the packet; when it contains
+ * MPIDI_CH3_PKT_FLAG_RMA_LOCK the lock type of target_ptr is recorded too.
+ *
+ * Returns MPI_SUCCESS or an MPI error code. */
+#undef FUNCNAME
+#define FUNCNAME issue_get_acc_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int issue_get_acc_op(MPIDI_RMA_Op_t *rma_op, MPID_Win *win_ptr,
+                            MPIDI_RMA_Target_t *target_ptr,
+                            MPIDI_CH3_Pkt_flags_t flags)
+{
+    MPIDI_VC_t *vc = NULL;
+    size_t len;                 /* total origin data size in bytes */
+    MPI_Aint origin_type_size;
+    MPID_Comm *comm_ptr = win_ptr->comm_ptr;
+    MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &rma_op->pkt.get_accum;
+    MPID_IOV iov[MPID_IOV_LIMIT];
+    MPID_Request *resp_req = NULL;      /* owned locally until handed to rma_op */
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_ISSUE_GET_ACC_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_GET_ACC_OP);
+
+    rma_op->request = NULL;
+
+    /* Create a request for the GACC response.  Store the response buf, count, and
+     * datatype in it, and pass the request's handle in the GACC packet. When the
+     * response comes from the target, it will contain the request handle. */
+    resp_req = MPID_Request_create();
+    MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+
+    MPIU_Object_set_ref(resp_req, 2);   /* the two references are explained near the end of this function */
+
+    resp_req->dev.user_buf = rma_op->result_addr;
+    resp_req->dev.user_count = rma_op->result_count;
+    resp_req->dev.datatype = rma_op->result_datatype;
+    resp_req->dev.target_win_handle = get_accum_pkt->target_win_handle;
+    resp_req->dev.source_win_handle = get_accum_pkt->source_win_handle;
+
+    if (!MPIR_DATATYPE_IS_PREDEFINED(resp_req->dev.datatype)) {
+      MPID_Datatype *result_dtp = NULL;
+      MPID_Datatype_get_ptr(resp_req->dev.datatype, result_dtp);
+      resp_req->dev.datatype_ptr = result_dtp;
+      /* this will cause the datatype to be freed when the
+       * request is freed. */
+    }
+
+    /* Note: Get_accumulate uses the same packet type as accumulate */
+    get_accum_pkt->request_handle = resp_req->handle;
+
+    get_accum_pkt->flags = flags;
+    if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
+        get_accum_pkt->lock_type = target_ptr->lock_type;
+
+    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
+
+    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
+    MPIU_Assign_trunc(len, rma_op->origin_count * origin_type_size, size_t);
+
+    if (len == get_accum_pkt->immed_len) {
+        /* All origin data is in packet header, issue the header. */
+        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+        mpi_errno = MPIDI_CH3_iStartMsg(vc, get_accum_pkt, sizeof(*get_accum_pkt), &(rma_op->request));
+        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+    }
+    else {
+        /* We still need to issue from origin buffer. */
+        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_pkt;
+        iov[0].MPID_IOV_LEN = sizeof(*get_accum_pkt);
+        if (!rma_op->is_dt) {
+            /* iov[1] starts immed_len bytes into the origin buffer and
+             * covers the remaining bytes. */
+            iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)rma_op->origin_addr + get_accum_pkt->immed_len);
+            iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size - get_accum_pkt->immed_len;
+        }
+
+        mpi_errno = issue_from_origin_buffer(rma_op, iov, vc);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
+
+    /* This operation can generate two requests; one for inbound and one for
+     * outbound data. */
+    if (rma_op->request != NULL) {
+        /* If we have both inbound and outbound requests (i.e. GACC
+         * operation), we need to ensure that the source buffer is
+         * available and that the response data has been received before
+         * informing the origin that this operation is complete.  Because
+         * the update needs to be done atomically at the target, they will
+         * not send back data until it has been received.  Therefore,
+         * completion of the response request implies that the send request
+         * has completed.
+         *
+         * Therefore: refs on the response request are set to two: one is
+         * held by the progress engine and the other by the RMA op
+         * completion code.  Refs on the outbound request are set to one;
+         * it will be completed by the progress engine.
+         */
+
+        MPID_Request_release(rma_op->request);
+        rma_op->request = resp_req;
+
+    }
+    else {
+        rma_op->request = resp_req;
+    }
+
+    /* For error checking: the response request now belongs to
+     * rma_op->request, so the local pointer is cleared and the cleanup
+     * under fn_fail would not release it. */
+    resp_req = NULL;
+
+ fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_GET_ACC_OP);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+ fn_fail:
+    if (resp_req != NULL) {     /* created but never handed over to rma_op */
+        /* NOTE(review): the reference count was set to 2 above and only one
+         * reference is dropped here - confirm that this is intended. */
+        MPID_Request_release(resp_req);
+    }
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+
+/* issue_get_op() sends a GET packet header and, when the target datatype is
+ * derived, the datatype description (dtype_info and dataloop) after it.
+ *
+ * A request recording the origin buffer, count and datatype is created and
+ * kept in rma_op->request; its handle is placed in the packet.  The request
+ * returned by the send itself is released before returning.
+ *
+ * flags is recorded in the packet; when it contains
+ * MPIDI_CH3_PKT_FLAG_RMA_LOCK the lock type of target_ptr is recorded too.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.  On failure rma_op->request is
+ * released once and reset to NULL, so the operation is not left holding a
+ * request for which no packet was sent. */
+#undef FUNCNAME
+#define FUNCNAME issue_get_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int issue_get_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
+                        MPIDI_RMA_Target_t *target_ptr,
+                        MPIDI_CH3_Pkt_flags_t flags)
+{
+    MPIDI_CH3_Pkt_get_t *get_pkt = &rma_op->pkt.get;
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_VC_t *vc = NULL;
+    MPID_Comm *comm_ptr;
+    MPID_Datatype *dtp;
+    MPI_Datatype target_datatype;
+    MPID_Request *req = NULL;   /* request returned by the send call */
+    MPID_IOV iov[MPID_IOV_LIMIT];
+    MPIDI_STATE_DECL(MPID_STATE_ISSUE_GET_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_GET_OP);
+
+    /* create a request, store the origin buf, cnt, datatype in it,
+     * and pass a handle to it in the get packet. When the get
+     * response comes from the target, it will contain the request
+     * handle. */
+    rma_op->request = MPID_Request_create();
+    if (rma_op->request == NULL) {
+        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+    }
+
+    MPIU_Object_set_ref(rma_op->request, 2);
+
+    rma_op->request->dev.user_buf = rma_op->origin_addr;
+    rma_op->request->dev.user_count = rma_op->origin_count;
+    rma_op->request->dev.datatype = rma_op->origin_datatype;
+    rma_op->request->dev.target_win_handle = MPI_WIN_NULL;
+    rma_op->request->dev.source_win_handle = get_pkt->source_win_handle;
+    if (!MPIR_DATATYPE_IS_PREDEFINED(rma_op->request->dev.datatype)) {
+        MPID_Datatype_get_ptr(rma_op->request->dev.datatype, dtp);
+        rma_op->request->dev.datatype_ptr = dtp;
+        /* this will cause the datatype to be freed when the
+         * request is freed. */
+    }
+
+    get_pkt->request_handle = rma_op->request->handle;
+    get_pkt->flags = flags;
+    if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
+        get_pkt->lock_type = target_ptr->lock_type;
+
+    comm_ptr = win_ptr->comm_ptr;
+    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
+
+    MPIDI_CH3_PKT_RMA_GET_TARGET_DATATYPE(rma_op->pkt, target_datatype, mpi_errno);
+    if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
+        /* basic datatype on target. simply send the get_pkt. */
+        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+        mpi_errno = MPIDI_CH3_iStartMsg(vc, get_pkt, sizeof(*get_pkt), &req);
+        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+    }
+    else {
+        /* derived datatype on target. fill derived datatype info and
+         * send it along with get_pkt. */
+        MPID_Datatype_get_ptr(target_datatype, dtp);
+
+        /* NOTE(review): if fill_in_derived_dtp_info() fails, the
+         * MPID_Datatype_release() at the end of this branch is skipped -
+         * confirm whether the reference it drops must also be dropped on
+         * that path. */
+        mpi_errno = fill_in_derived_dtp_info(rma_op, dtp);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        /* Set dataloop size in pkt header */
+        MPIDI_CH3_PKT_RMA_SET_DATALOOP_SIZE(rma_op->pkt, dtp->dataloop_size, mpi_errno);
+
+        /* header, then the datatype description, then the dataloop */
+        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_pkt;
+        iov[0].MPID_IOV_LEN = sizeof(*get_pkt);
+        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) & rma_op->dtype_info;
+        iov[1].MPID_IOV_LEN = sizeof(rma_op->dtype_info);
+        iov[2].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) rma_op->dataloop;
+        iov[2].MPID_IOV_LEN = dtp->dataloop_size;
+
+        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+        mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, 3, &req);
+        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+
+        /* release the target datatype */
+        MPID_Datatype_release(dtp);
+    }
+
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+    }
+
+    /* release the request returned by iStartMsg or iStartMsgv */
+    if (req != NULL) {
+        MPID_Request_release(req);
+    }
+
+  fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_GET_OP);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+  fn_fail:
+    /* Nothing was sent, so no response will ever complete the request
+     * created above: drop it from the operation instead of leaving
+     * rma_op->request pointing at it. */
+    if (rma_op->request != NULL) {
+        MPID_Request_release(rma_op->request);
+    }
+    rma_op->request = NULL;
+    /* A send request handed back before the failure was detected is not
+     * released on the normal path above; release it here. */
+    if (req != NULL) {
+        MPID_Request_release(req);
+    }
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+/* issue_cas_op() sends a CAS packet.  The origin and compare values are
+ * copied into the packet itself, so only the packet header is transmitted.
+ * A request recording the result buffer, count and datatype is created and
+ * kept in rma_op->request, and its handle is placed in the packet.
+ *
+ * flags is recorded in the packet; when it contains
+ * MPIDI_CH3_PKT_FLAG_RMA_LOCK the lock type of target_ptr is recorded too.
+ *
+ * Returns MPI_SUCCESS or an MPI error code.  On failure rma_op->request is
+ * released once and reset to NULL. */
+#undef FUNCNAME
+#define FUNCNAME issue_cas_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int issue_cas_op(MPIDI_RMA_Op_t * rma_op,
+                        MPID_Win * win_ptr, MPIDI_RMA_Target_t *target_ptr,
+                        MPIDI_CH3_Pkt_flags_t flags)
+{
+    MPI_Aint len;               /* size in bytes of the origin datatype */
+    MPIDI_VC_t *vc = NULL;
+    MPID_Comm *comm_ptr = win_ptr->comm_ptr;
+    MPIDI_CH3_Pkt_cas_t *cas_pkt = &rma_op->pkt.cas;
+    MPID_Request *rmw_req = NULL;       /* request returned by the send call */
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_ISSUE_CAS_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_CAS_OP);
+
+    /* Create a request for the RMW response.  Store the result buf, count, and
+     * datatype in it, and pass the request's handle in the RMW packet. When the
+     * response comes from the target, it will contain the request handle. */
+    rma_op->request = MPID_Request_create();
+    MPIU_ERR_CHKANDJUMP(rma_op->request == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+
+    /* Set refs on the request to 2: one for the response message, and one for
+     * the partial completion handler */
+    MPIU_Object_set_ref(rma_op->request, 2);
+
+    rma_op->request->dev.user_buf = rma_op->result_addr;
+    rma_op->request->dev.user_count = rma_op->result_count;
+    rma_op->request->dev.datatype = rma_op->result_datatype;
+
+    /* REQUIRE: All datatype arguments must be of the same, builtin
+     * type and counts must be 1. */
+    MPID_Datatype_get_size_macro(rma_op->origin_datatype, len);
+    MPIU_Assert(len <= sizeof(MPIDI_CH3_CAS_Immed_u));  /* presumably the type of the packet's data fields - confirm */
+
+    rma_op->request->dev.target_win_handle = cas_pkt->target_win_handle;
+    rma_op->request->dev.source_win_handle = cas_pkt->source_win_handle;
+
+    cas_pkt->request_handle = rma_op->request->handle;
+    cas_pkt->flags = flags;
+    if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
+        cas_pkt->lock_type = target_ptr->lock_type;
+
+    /* Both operands travel inside the packet; each is len bytes long. */
+    MPIU_Memcpy((void *) &cas_pkt->origin_data, rma_op->origin_addr, len);
+    MPIU_Memcpy((void *) &cas_pkt->compare_data, rma_op->compare_addr, len);
+
+    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
+    MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+    mpi_errno = MPIDI_CH3_iStartMsg(vc, cas_pkt, sizeof(*cas_pkt), &rmw_req);
+    MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+    MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
+    if (rmw_req != NULL) {      /* the send request is not kept by the operation */
+        MPID_Request_release(rmw_req);
+    }
+
+  fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_CAS_OP);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+  fn_fail:
+    if (rma_op->request) {
+        /* NOTE(review): the reference count was set to 2 above and only one
+         * reference is dropped here - confirm that this is intended. */
+        MPID_Request_release(rma_op->request);
+    }
+    rma_op->request = NULL;
+    if (rmw_req) {
+        MPID_Request_release(rmw_req);
+    }
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+/* issue_fop_op() sends a FOP packet header; nothing besides the header is
+ * transmitted by this function.  A response request recording the result
+ * buffer, count and datatype is created, its handle is placed in the packet,
+ * and on success it becomes rma_op->request.
+ *
+ * flags is recorded in the packet; when it contains
+ * MPIDI_CH3_PKT_FLAG_RMA_LOCK the lock type of target_ptr is recorded too.
+ *
+ * Returns MPI_SUCCESS or an MPI error code. */
+#undef FUNCNAME
+#define FUNCNAME issue_fop_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
+                        MPID_Win * win_ptr, MPIDI_RMA_Target_t *target_ptr,
+                        MPIDI_CH3_Pkt_flags_t flags)
+{
+    MPIDI_VC_t *vc = NULL;
+    MPID_Comm *comm_ptr = win_ptr->comm_ptr;
+    MPIDI_CH3_Pkt_fop_t *fop_pkt = &rma_op->pkt.fop;
+    MPID_Request *resp_req = NULL;      /* owned locally until handed to rma_op */
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_ISSUE_FOP_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_FOP_OP);
+
+    rma_op->request = NULL;
+
+    /* Create a request for the FOP response.  Store the response buf, count, and
+     * datatype in it, and pass the request's handle in the FOP packet. When the
+     * response comes from the target, it will contain the request handle. */
+    resp_req = MPID_Request_create();
+    MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+
+    MPIU_Object_set_ref(resp_req, 2);   /* the two references are explained near the end of this function */
+
+    resp_req->dev.user_buf = rma_op->result_addr;
+    resp_req->dev.user_count = rma_op->result_count;
+    resp_req->dev.datatype = rma_op->result_datatype;
+    resp_req->dev.target_win_handle = fop_pkt->target_win_handle;
+    resp_req->dev.source_win_handle = fop_pkt->source_win_handle;
+
+    fop_pkt->request_handle = resp_req->handle;
+
+    fop_pkt->flags = flags;
+    if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK)
+        fop_pkt->lock_type = target_ptr->lock_type;
+
+    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
+
+    /* All origin data is in packet header, issue the header. */
+    MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+    mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_pkt, sizeof(*fop_pkt), &(rma_op->request));
+    MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+    MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
+    /* This operation can generate two requests; one for inbound and one for
+     * outbound data. */
+    if (rma_op->request != NULL) {
+        /* If we have both inbound and outbound requests (here: the request
+         * for the outbound FOP packet and the request for its response),
+         * we need to ensure that the source buffer is
+         * available and that the response data has been received before
+         * informing the origin that this operation is complete.  Because
+         * the update needs to be done atomically at the target, they will
+         * not send back data until it has been received.  Therefore,
+         * completion of the response request implies that the send request
+         * has completed.
+         *
+         * Therefore: refs on the response request are set to two: one is
+         * held by the progress engine and the other by the RMA op
+         * completion code.  Refs on the outbound request are set to one;
+         * it will be completed by the progress engine.
+         */
+
+        MPID_Request_release(rma_op->request);
+        rma_op->request = resp_req;
+    }
+    else {
+        rma_op->request = resp_req;
+    }
+
+    /* For error checking: the response request now belongs to
+     * rma_op->request, so the local pointer is cleared and the cleanup
+     * under fn_fail would not release it. */
+    resp_req = NULL;
+
+ fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_FOP_OP);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+ fn_fail:
+    if (resp_req != NULL) {     /* created but never handed over to rma_op */
+        /* NOTE(review): the reference count was set to 2 above and only one
+         * reference is dropped here - confirm that this is intended. */
+        MPID_Request_release(resp_req);
+    }
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+
+/* issue_rma_op() is the entry point used by ch3u_rma_oplist.c to issue one
+ * RMA operation.  The packet type recorded in the operation selects the
+ * issue_*_op() routine that does the work; a packet type with no matching
+ * routine is reported as "**winInvalidOp".
+ *
+ * win_ptr, target_ptr and flags are passed through unchanged to the
+ * selected routine.
+ *
+ * Returns MPI_SUCCESS or an MPI error code. */
+#undef FUNCNAME
+#define FUNCNAME issue_rma_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int issue_rma_op(MPIDI_RMA_Op_t * op_ptr, MPID_Win * win_ptr,
+                               MPIDI_RMA_Target_t * target_ptr,
+                               MPIDI_CH3_Pkt_flags_t flags)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_ISSUE_RMA_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_RMA_OP);
+
+    /* One issuing routine per packet type. */
+    switch (op_ptr->pkt.type) {
+    case MPIDI_CH3_PKT_PUT:
+        mpi_errno = issue_put_op(op_ptr, win_ptr, target_ptr, flags);
+        break;
+
+    case MPIDI_CH3_PKT_ACCUMULATE:
+        mpi_errno = issue_acc_op(op_ptr, win_ptr, target_ptr, flags);
+        break;
+
+    case MPIDI_CH3_PKT_GET_ACCUM:
+        mpi_errno = issue_get_acc_op(op_ptr, win_ptr, target_ptr, flags);
+        break;
+
+    case MPIDI_CH3_PKT_GET:
+        mpi_errno = issue_get_op(op_ptr, win_ptr, target_ptr, flags);
+        break;
+
+    case MPIDI_CH3_PKT_CAS:
+        mpi_errno = issue_cas_op(op_ptr, win_ptr, target_ptr, flags);
+        break;
+
+    case MPIDI_CH3_PKT_FOP:
+        mpi_errno = issue_fop_op(op_ptr, win_ptr, target_ptr, flags);
+        break;
+
+    default:
+        /* No issuing routine exists for this packet type. */
+        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**winInvalidOp");
+    }
+
+    /* Propagate a failure reported by the selected routine. */
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIU_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_RMA_OP);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+  fn_fail:
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+#endif  /* MPID_RMA_ISSUE_H_INCLUDED */
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index 10db50b..9a016e4 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -10,6 +10,7 @@
 #include "mpid_rma_types.h"
 #include "mpid_rma_oplist.h"
 #include "mpid_rma_shm.h"
+#include "mpid_rma_issue.h"
 
 int MPIDI_CH3I_Issue_rma_op(MPIDI_RMA_Op_t * op_ptr, MPID_Win * win_ptr,
                             MPIDI_CH3_Pkt_flags_t flags);
diff --git a/src/mpid/ch3/src/ch3u_rma_oplist.c b/src/mpid/ch3/src/ch3u_rma_oplist.c
index 9b63cec..fab643a 100644
--- a/src/mpid/ch3/src/ch3u_rma_oplist.c
+++ b/src/mpid/ch3/src/ch3u_rma_oplist.c
@@ -42,61 +42,10 @@ cvars:
 === END_MPI_T_CVAR_INFO_BLOCK ===
 */
 
-static inline int issue_rma_op(MPIDI_RMA_Op_t * op_ptr, MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags);
 static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *target, int *made_progress);
 static inline int issue_ops_win(MPID_Win * win_ptr, int *made_progress);
 
 static int send_flush_msg(int dest, MPID_Win *win_ptr);
-static int send_rma_msg(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags);
-static int send_contig_acc_msg(MPIDI_RMA_Op_t * rma_op,
-                               MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags);
-static int recv_rma_msg(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags);
-static int send_immed_rmw_msg(MPIDI_RMA_Op_t * rma_op,
-                              MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags);
-
-#undef FUNCNAME
-#define FUNCNAME issue_rma_op
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int issue_rma_op(MPIDI_RMA_Op_t * op_ptr, MPID_Win * win_ptr,
-                               MPIDI_CH3_Pkt_flags_t flags)
-{
-    int mpi_errno = MPI_SUCCESS;
-    MPIDI_STATE_DECL(MPID_STATE_ISSUE_RMA_OP);
-
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_RMA_OP);
-
-    switch (op_ptr->pkt.type) {
-    case (MPIDI_CH3_PKT_PUT):
-    case (MPIDI_CH3_PKT_ACCUMULATE):
-    case (MPIDI_CH3_PKT_GET_ACCUM):
-        mpi_errno = send_rma_msg(op_ptr, win_ptr, flags);
-        break;
-    case (MPIDI_CH3_PKT_ACCUM_IMMED):
-        mpi_errno = send_contig_acc_msg(op_ptr, win_ptr, flags);
-        break;
-    case (MPIDI_CH3_PKT_GET):
-        mpi_errno = recv_rma_msg(op_ptr, win_ptr, flags);
-        break;
-    case (MPIDI_CH3_PKT_CAS):
-    case (MPIDI_CH3_PKT_FOP):
-        mpi_errno = send_immed_rmw_msg(op_ptr, win_ptr, flags);
-        break;
-        default:
-            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**winInvalidOp");
-        }
-
-    if (mpi_errno)
-        MPIU_ERR_POP(mpi_errno);
-
-  fn_exit:
-    MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_RMA_OP);
-    return mpi_errno;
-    /* --BEGIN ERROR HANDLING-- */
-  fn_fail:
-    goto fn_exit;
-    /* --END ERROR HANDLING-- */
-}
 
 #undef FUNCNAME
 #define FUNCNAME check_window_state
@@ -801,718 +750,6 @@ int MPIDI_CH3I_RMA_Make_progress_global(int *made_progress)
 }
 
 
-/* create_datatype() creates a new struct datatype for the dtype_info
-   and the dataloop of the target datatype together with the user data */
-#undef FUNCNAME
-#define FUNCNAME create_datatype
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int create_datatype(const MPIDI_RMA_dtype_info * dtype_info,
-                           const void *dataloop, MPI_Aint dataloop_sz,
-                           const void *o_addr, int o_count, MPI_Datatype o_datatype,
-                           MPID_Datatype ** combined_dtp)
-{
-    int mpi_errno = MPI_SUCCESS;
-    /* datatype_set_contents wants an array 'ints' which is the
-     * blocklens array with count prepended to it.  So blocklens
-     * points to the 2nd element of ints to avoid having to copy
-     * blocklens into ints later. */
-    int ints[4];
-    int *blocklens = &ints[1];
-    MPI_Aint displaces[3];
-    MPI_Datatype datatypes[3];
-    const int count = 3;
-    MPI_Datatype combined_datatype;
-    MPIDI_STATE_DECL(MPID_STATE_CREATE_DATATYPE);
-
-    MPIDI_FUNC_ENTER(MPID_STATE_CREATE_DATATYPE);
-
-    /* create datatype */
-    displaces[0] = MPIU_PtrToAint(dtype_info);
-    blocklens[0] = sizeof(*dtype_info);
-    datatypes[0] = MPI_BYTE;
-
-    displaces[1] = MPIU_PtrToAint(dataloop);
-    MPIU_Assign_trunc(blocklens[1], dataloop_sz, int);
-    datatypes[1] = MPI_BYTE;
-
-    displaces[2] = MPIU_PtrToAint(o_addr);
-    blocklens[2] = o_count;
-    datatypes[2] = o_datatype;
-
-    mpi_errno = MPID_Type_struct(count, blocklens, displaces, datatypes, &combined_datatype);
-    if (mpi_errno)
-        MPIU_ERR_POP(mpi_errno);
-
-    ints[0] = count;
-
-    MPID_Datatype_get_ptr(combined_datatype, *combined_dtp);
-    mpi_errno = MPID_Datatype_set_contents(*combined_dtp, MPI_COMBINER_STRUCT, count + 1,       /* ints (cnt,blklen) */
-                                           count,       /* aints (disps) */
-                                           count,       /* types */
-                                           ints, displaces, datatypes);
-    if (mpi_errno)
-        MPIU_ERR_POP(mpi_errno);
-
-    /* Commit datatype */
-
-    MPID_Dataloop_create(combined_datatype,
-                         &(*combined_dtp)->dataloop,
-                         &(*combined_dtp)->dataloop_size,
-                         &(*combined_dtp)->dataloop_depth, MPID_DATALOOP_HOMOGENEOUS);
-
-    /* create heterogeneous dataloop */
-    MPID_Dataloop_create(combined_datatype,
-                         &(*combined_dtp)->hetero_dloop,
-                         &(*combined_dtp)->hetero_dloop_size,
-                         &(*combined_dtp)->hetero_dloop_depth, MPID_DATALOOP_HETEROGENEOUS);
-
-  fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_CREATE_DATATYPE);
-    return mpi_errno;
-  fn_fail:
-    goto fn_exit;
-}
-
-
-#undef FUNCNAME
-#define FUNCNAME send_rma_msg
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int send_rma_msg(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags)
-{
-    MPIDI_CH3_Pkt_put_t *put_pkt = &rma_op->pkt.put;
-    MPIDI_CH3_Pkt_accum_t *accum_pkt = &rma_op->pkt.accum;
-    MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &rma_op->pkt.get_accum;
-    MPID_IOV iov[MPID_IOV_LIMIT];
-    int mpi_errno = MPI_SUCCESS;
-    int origin_dt_derived, target_dt_derived, iovcnt;
-    MPI_Aint origin_type_size;
-    MPIDI_VC_t *vc;
-    MPID_Comm *comm_ptr;
-    MPID_Datatype *target_dtp = NULL, *origin_dtp = NULL;
-    MPI_Datatype target_datatype;
-    MPID_Request *resp_req = NULL;
-    MPIU_CHKPMEM_DECL(1);
-    MPIDI_STATE_DECL(MPID_STATE_SEND_RMA_MSG);
-    MPIDI_STATE_DECL(MPID_STATE_MEMCPY);
-
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_SEND_RMA_MSG);
-
-    rma_op->request = NULL;
-
-    if (rma_op->pkt.type == MPIDI_CH3_PKT_PUT) {
-        put_pkt->flags = flags;
-
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) put_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*put_pkt);
-    }
-    else if (rma_op->pkt.type == MPIDI_CH3_PKT_GET_ACCUM) {
-        /* Create a request for the GACC response.  Store the response buf, count, and
-         * datatype in it, and pass the request's handle in the GACC packet. When the
-         * response comes from the target, it will contain the request handle. */
-        resp_req = MPID_Request_create();
-        MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
-
-        MPIU_Object_set_ref(resp_req, 2);
-
-        resp_req->dev.user_buf = rma_op->result_addr;
-        resp_req->dev.user_count = rma_op->result_count;
-        resp_req->dev.datatype = rma_op->result_datatype;
-        resp_req->dev.target_win_handle = get_accum_pkt->target_win_handle;
-        resp_req->dev.source_win_handle = get_accum_pkt->source_win_handle;
-
-        if (!MPIR_DATATYPE_IS_PREDEFINED(resp_req->dev.datatype)) {
-            MPID_Datatype *result_dtp = NULL;
-            MPID_Datatype_get_ptr(resp_req->dev.datatype, result_dtp);
-            resp_req->dev.datatype_ptr = result_dtp;
-            /* this will cause the datatype to be freed when the
-             * request is freed. */
-        }
-
-        get_accum_pkt->request_handle = resp_req->handle;
-
-        get_accum_pkt->flags = flags;
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*get_accum_pkt);
-    }
-    else {
-        accum_pkt->flags = flags;
-
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) accum_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*accum_pkt);
-    }
-
-    /*    printf("send pkt: type %d, addr %d, count %d, base %d\n", rma_pkt->type,
-     * rma_pkt->addr, rma_pkt->count, win_ptr->base_addrs[rma_op->target_rank]);
-     * fflush(stdout);
-     */
-
-    comm_ptr = win_ptr->comm_ptr;
-    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
-
-    if (!MPIR_DATATYPE_IS_PREDEFINED(rma_op->origin_datatype)) {
-        origin_dt_derived = 1;
-        MPID_Datatype_get_ptr(rma_op->origin_datatype, origin_dtp);
-    }
-    else {
-        origin_dt_derived = 0;
-    }
-
-    MPIDI_CH3_PKT_RMA_GET_TARGET_DATATYPE(rma_op->pkt, target_datatype, mpi_errno);
-    if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
-        target_dt_derived = 1;
-        MPID_Datatype_get_ptr(target_datatype, target_dtp);
-    }
-    else {
-        target_dt_derived = 0;
-    }
-
-    if (target_dt_derived) {
-        /* derived datatype on target. fill derived datatype info */
-        rma_op->dtype_info.is_contig = target_dtp->is_contig;
-        rma_op->dtype_info.max_contig_blocks = target_dtp->max_contig_blocks;
-        rma_op->dtype_info.size = target_dtp->size;
-        rma_op->dtype_info.extent = target_dtp->extent;
-        rma_op->dtype_info.dataloop_size = target_dtp->dataloop_size;
-        rma_op->dtype_info.dataloop_depth = target_dtp->dataloop_depth;
-        rma_op->dtype_info.eltype = target_dtp->eltype;
-        rma_op->dtype_info.dataloop = target_dtp->dataloop;
-        rma_op->dtype_info.ub = target_dtp->ub;
-        rma_op->dtype_info.lb = target_dtp->lb;
-        rma_op->dtype_info.true_ub = target_dtp->true_ub;
-        rma_op->dtype_info.true_lb = target_dtp->true_lb;
-        rma_op->dtype_info.has_sticky_ub = target_dtp->has_sticky_ub;
-        rma_op->dtype_info.has_sticky_lb = target_dtp->has_sticky_lb;
-
-        MPIU_CHKPMEM_MALLOC(rma_op->dataloop, void *, target_dtp->dataloop_size,
-                            mpi_errno, "dataloop");
-
-        MPIDI_FUNC_ENTER(MPID_STATE_MEMCPY);
-        MPIU_Memcpy(rma_op->dataloop, target_dtp->dataloop, target_dtp->dataloop_size);
-        MPIDI_FUNC_EXIT(MPID_STATE_MEMCPY);
-        /* the dataloop can have undefined padding sections, so we need to let
-         * valgrind know that it is OK to pass this data to writev later on */
-        MPL_VG_MAKE_MEM_DEFINED(rma_op->dataloop, target_dtp->dataloop_size);
-
-        if (rma_op->pkt.type == MPIDI_CH3_PKT_PUT) {
-            put_pkt->dataloop_size = target_dtp->dataloop_size;
-        }
-        else if (rma_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE) {
-            accum_pkt->dataloop_size = target_dtp->dataloop_size;
-        }
-        else {
-            get_accum_pkt->dataloop_size = target_dtp->dataloop_size;
-        }
-    }
-
-    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
-
-    if (!target_dt_derived) {
-        /* basic datatype on target */
-        if (!origin_dt_derived) {
-            /* basic datatype on origin */
-            iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) rma_op->origin_addr;
-            iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
-            iovcnt = 2;
-            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-            mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, iovcnt, &rma_op->request);
-            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-        }
-        else {
-            /* derived datatype on origin */
-            rma_op->request = MPID_Request_create();
-            MPIU_ERR_CHKANDJUMP(rma_op->request == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
-
-            MPIU_Object_set_ref(rma_op->request, 2);
-            rma_op->request->kind = MPID_REQUEST_SEND;
-
-            rma_op->request->dev.segment_ptr = MPID_Segment_alloc();
-            MPIU_ERR_CHKANDJUMP1(rma_op->request->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER,
-                                 "**nomem", "**nomem %s", "MPID_Segment_alloc");
-
-            rma_op->request->dev.datatype_ptr = origin_dtp;
-            /* this will cause the datatype to be freed when the request
-             * is freed. */
-            MPID_Segment_init(rma_op->origin_addr, rma_op->origin_count,
-                              rma_op->origin_datatype, rma_op->request->dev.segment_ptr, 0);
-            rma_op->request->dev.segment_first = 0;
-            rma_op->request->dev.segment_size = rma_op->origin_count * origin_type_size;
-
-            rma_op->request->dev.OnFinal = 0;
-            rma_op->request->dev.OnDataAvail = 0;
-
-            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-            mpi_errno =
-                vc->sendNoncontig_fn(vc, rma_op->request, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
-            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-        }
-    }
-    else {
-        /* derived datatype on target */
-        MPID_Datatype *combined_dtp = NULL;
-
-        rma_op->request = MPID_Request_create();
-        if (rma_op->request == NULL) {
-            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
-        }
-
-        MPIU_Object_set_ref(rma_op->request, 2);
-        rma_op->request->kind = MPID_REQUEST_SEND;
-
-        rma_op->request->dev.segment_ptr = MPID_Segment_alloc();
-        MPIU_ERR_CHKANDJUMP1(rma_op->request->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER,
-                             "**nomem", "**nomem %s", "MPID_Segment_alloc");
-
-        /* create a new datatype containing the dtype_info, dataloop, and origin data */
-
-        mpi_errno =
-            create_datatype(&rma_op->dtype_info, rma_op->dataloop, target_dtp->dataloop_size,
-                            rma_op->origin_addr, rma_op->origin_count, rma_op->origin_datatype,
-                            &combined_dtp);
-        if (mpi_errno)
-            MPIU_ERR_POP(mpi_errno);
-
-        rma_op->request->dev.datatype_ptr = combined_dtp;
-        /* combined_datatype will be freed when request is freed */
-
-        MPID_Segment_init(MPI_BOTTOM, 1, combined_dtp->handle, rma_op->request->dev.segment_ptr, 0);
-        rma_op->request->dev.segment_first = 0;
-        rma_op->request->dev.segment_size = combined_dtp->size;
-
-        rma_op->request->dev.OnFinal = 0;
-        rma_op->request->dev.OnDataAvail = 0;
-
-        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno =
-            vc->sendNoncontig_fn(vc, rma_op->request, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
-        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-
-        /* we're done with the datatypes */
-        if (origin_dt_derived)
-            MPID_Datatype_release(origin_dtp);
-        MPID_Datatype_release(target_dtp);
-    }
-
-    /* This operation can generate two requests; one for inbound and one for
-     * outbound data. */
-    if (resp_req != NULL) {
-        if (rma_op->request != NULL) {
-            /* If we have both inbound and outbound requests (i.e. GACC
-             * operation), we need to ensure that the source buffer is
-             * available and that the response data has been received before
-             * informing the origin that this operation is complete.  Because
-             * the update needs to be done atomically at the target, they will
-             * not send back data until it has been received.  Therefore,
-             * completion of the response request implies that the send request
-             * has completed.
-             *
-             * Therefore: refs on the response request are set to two: one is
-             * held by the progress engine and the other by the RMA op
-             * completion code.  Refs on the outbound request are set to one;
-             * it will be completed by the progress engine.
-             */
-
-            MPID_Request_release(rma_op->request);
-            rma_op->request = resp_req;
-
-        }
-        else {
-            rma_op->request = resp_req;
-        }
-
-        /* For error checking */
-        resp_req = NULL;
-    }
-
-  fn_exit:
-    MPIU_CHKPMEM_COMMIT();
-    MPIDI_RMA_FUNC_EXIT(MPID_STATE_SEND_RMA_MSG);
-    return mpi_errno;
-    /* --BEGIN ERROR HANDLING-- */
-  fn_fail:
-    if (resp_req) {
-        MPID_Request_release(resp_req);
-    }
-    if (rma_op->request) {
-        MPIU_CHKPMEM_REAP();
-        if (rma_op->request->dev.datatype_ptr)
-            MPID_Datatype_release(rma_op->request->dev.datatype_ptr);
-        MPID_Request_release(rma_op->request);
-    }
-    rma_op->request = NULL;
-    goto fn_exit;
-    /* --END ERROR HANDLING-- */
-}
-
-/*
- * Use this for contiguous accumulate operations
- */
-#undef FUNCNAME
-#define FUNCNAME send_contig_acc_msg
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int send_contig_acc_msg(MPIDI_RMA_Op_t * rma_op,
-                               MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags)
-{
-    MPIDI_CH3_Pkt_accum_t *accum_pkt = &rma_op->pkt.accum;
-    MPID_IOV iov[MPID_IOV_LIMIT];
-    int mpi_errno = MPI_SUCCESS;
-    int iovcnt;
-    MPI_Aint origin_type_size;
-    MPIDI_VC_t *vc;
-    MPID_Comm *comm_ptr;
-    size_t len;
-    MPIDI_STATE_DECL(MPID_STATE_SEND_CONTIG_ACC_MSG);
-
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_SEND_CONTIG_ACC_MSG);
-
-    rma_op->request = NULL;
-
-    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
-    /* FIXME: Make this size check efficient and match the packet type */
-    MPIU_Assign_trunc(len, rma_op->origin_count * origin_type_size, size_t);
-    if (MPIR_CVAR_CH3_RMA_ACC_IMMED && len <= MPIDI_RMA_IMMED_INTS * sizeof(int)) {
-        MPIDI_CH3_Pkt_accum_immed_t *accumi_pkt = &rma_op->pkt.accum_immed;
-        void *dest = accumi_pkt->data, *src = rma_op->origin_addr;
-
-        accumi_pkt->flags = flags;
-
-        switch (len) {
-        case 1:
-            *(uint8_t *) dest = *(uint8_t *) src;
-            break;
-        case 2:
-            *(uint16_t *) dest = *(uint16_t *) src;
-            break;
-        case 4:
-            *(uint32_t *) dest = *(uint32_t *) src;
-            break;
-        case 8:
-            *(uint64_t *) dest = *(uint64_t *) src;
-            break;
-        default:
-            MPIU_Memcpy(accumi_pkt->data, (void *) rma_op->origin_addr, len);
-        }
-        comm_ptr = win_ptr->comm_ptr;
-        MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
-        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno = MPIDI_CH3_iStartMsg(vc, accumi_pkt, sizeof(*accumi_pkt), &rma_op->request);
-        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-        goto fn_exit;
-    }
-
-    accum_pkt->flags = flags;
-
-    iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) accum_pkt;
-    iov[0].MPID_IOV_LEN = sizeof(*accum_pkt);
-
-    /*    printf("send pkt: type %d, addr %d, count %d, base %d\n", rma_pkt->type,
-     * rma_pkt->addr, rma_pkt->count, win_ptr->base_addrs[rma_op->target_rank]);
-     * fflush(stdout);
-     */
-
-    comm_ptr = win_ptr->comm_ptr;
-    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
-
-
-    /* basic datatype on target */
-    /* basic datatype on origin */
-    /* FIXME: This is still very heavyweight for a small message operation,
-     * such as a single word update */
-    /* One possibility is to use iStartMsg with a buffer that is just large
-     * enough, though note that nemesis has an optimization for this */
-    iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) rma_op->origin_addr;
-    iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
-    iovcnt = 2;
-    MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-    mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, iovcnt, &rma_op->request);
-    MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-    MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-
-  fn_exit:
-    MPIDI_RMA_FUNC_EXIT(MPID_STATE_SEND_CONTIG_ACC_MSG);
-    return mpi_errno;
-    /* --BEGIN ERROR HANDLING-- */
-  fn_fail:
-    if (rma_op->request) {
-        MPID_Request_release(rma_op->request);
-    }
-    rma_op->request = NULL;
-    goto fn_exit;
-    /* --END ERROR HANDLING-- */
-}
-
-
-/*
- * Initiate an immediate RMW accumulate operation
- */
-#undef FUNCNAME
-#define FUNCNAME send_immed_rmw_msg
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int send_immed_rmw_msg(MPIDI_RMA_Op_t * rma_op,
-                              MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags)
-{
-    int mpi_errno = MPI_SUCCESS;
-    MPID_Request *rmw_req = NULL;
-    MPIDI_VC_t *vc;
-    MPID_Comm *comm_ptr;
-    MPI_Aint len;
-    MPIDI_STATE_DECL(MPID_STATE_SEND_IMMED_RMW_MSG);
-
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_SEND_IMMED_RMW_MSG);
-
-    /* Create a request for the RMW response.  Store the origin buf, count, and
-     * datatype in it, and pass the request's handle RMW packet. When the
-     * response comes from the target, it will contain the request handle. */
-    rma_op->request = MPID_Request_create();
-    MPIU_ERR_CHKANDJUMP(rma_op->request == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
-
-    /* Set refs on the request to 2: one for the response message, and one for
-     * the partial completion handler */
-    MPIU_Object_set_ref(rma_op->request, 2);
-
-    rma_op->request->dev.user_buf = rma_op->result_addr;
-    rma_op->request->dev.user_count = rma_op->result_count;
-    rma_op->request->dev.datatype = rma_op->result_datatype;
-
-    /* REQUIRE: All datatype arguments must be of the same, builtin
-     * type and counts must be 1. */
-    MPID_Datatype_get_size_macro(rma_op->origin_datatype, len);
-    comm_ptr = win_ptr->comm_ptr;
-
-    if (rma_op->pkt.type == MPIDI_CH3_PKT_CAS) {
-        MPIDI_CH3_Pkt_cas_t *cas_pkt = &rma_op->pkt.cas;
-
-        MPIU_Assert(len <= sizeof(MPIDI_CH3_CAS_Immed_u));
-
-        rma_op->request->dev.target_win_handle = cas_pkt->target_win_handle;
-        rma_op->request->dev.source_win_handle = cas_pkt->source_win_handle;
-
-        cas_pkt->request_handle = rma_op->request->handle;
-        cas_pkt->flags = flags;
-
-        MPIU_Memcpy((void *) &cas_pkt->origin_data, rma_op->origin_addr, len);
-        MPIU_Memcpy((void *) &cas_pkt->compare_data, rma_op->compare_addr, len);
-
-        MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
-        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno = MPIDI_CH3_iStartMsg(vc, cas_pkt, sizeof(*cas_pkt), &rmw_req);
-        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-
-        if (rmw_req != NULL) {
-            MPID_Request_release(rmw_req);
-        }
-    }
-
-    else if (rma_op->pkt.type == MPIDI_CH3_PKT_FOP) {
-        MPIDI_CH3_Pkt_fop_t *fop_pkt = &rma_op->pkt.fop;
-
-        MPIU_Assert(len <= sizeof(MPIDI_CH3_FOP_Immed_u));
-
-        rma_op->request->dev.target_win_handle = fop_pkt->target_win_handle;
-        rma_op->request->dev.source_win_handle = fop_pkt->source_win_handle;
-
-        fop_pkt->request_handle = rma_op->request->handle;
-        fop_pkt->flags = flags;
-
-        if (len <= sizeof(fop_pkt->origin_data) || fop_pkt->op == MPI_NO_OP) {
-            /* Embed FOP data in the packet header */
-            if (fop_pkt->op != MPI_NO_OP) {
-                MPIU_Memcpy(fop_pkt->origin_data, rma_op->origin_addr, len);
-            }
-
-            MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
-            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-            mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_pkt, sizeof(*fop_pkt), &rmw_req);
-            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-
-            if (rmw_req != NULL) {
-                MPID_Request_release(rmw_req);
-            }
-        }
-        else {
-            /* Data is too big to copy into the FOP header, use an IOV to send it */
-            MPID_IOV iov[MPID_IOV_LIMIT];
-
-            rmw_req = MPID_Request_create();
-            MPIU_ERR_CHKANDJUMP(rmw_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
-            MPIU_Object_set_ref(rmw_req, 1);
-
-            rmw_req->dev.OnFinal = 0;
-            rmw_req->dev.OnDataAvail = 0;
-
-            iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) fop_pkt;
-            iov[0].MPID_IOV_LEN = sizeof(*fop_pkt);
-            iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) rma_op->origin_addr;
-            iov[1].MPID_IOV_LEN = len;  /* count == 1 */
-
-            MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
-            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-            mpi_errno = MPIDI_CH3_iSendv(vc, rmw_req, iov, 2);
-            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-
-            MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-        }
-    }
-    else {
-        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-    }
-
-  fn_exit:
-    MPIDI_RMA_FUNC_EXIT(MPID_STATE_SEND_IMMED_RMW_MSG);
-    return mpi_errno;
-    /* --BEGIN ERROR HANDLING-- */
-  fn_fail:
-    if (rma_op->request) {
-        MPID_Request_release(rma_op->request);
-    }
-    rma_op->request = NULL;
-    if (rmw_req) {
-        MPID_Request_release(rmw_req);
-    }
-    goto fn_exit;
-    /* --END ERROR HANDLING-- */
-}
-
-
-
-#undef FUNCNAME
-#define FUNCNAME recv_rma_msg
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int recv_rma_msg(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags)
-{
-    MPIDI_CH3_Pkt_get_t *get_pkt = &rma_op->pkt.get;
-    int mpi_errno = MPI_SUCCESS;
-    MPIDI_VC_t *vc;
-    MPID_Comm *comm_ptr;
-    MPID_Datatype *dtp;
-    MPI_Datatype target_datatype;
-    MPID_Request *req = NULL;
-    MPID_IOV iov[MPID_IOV_LIMIT];
-    MPIU_CHKPMEM_DECL(1);
-    MPIDI_STATE_DECL(MPID_STATE_RECV_RMA_MSG);
-    MPIDI_STATE_DECL(MPID_STATE_MEMCPY);
-
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_RECV_RMA_MSG);
-
-    /* create a request, store the origin buf, cnt, datatype in it,
-     * and pass a handle to it in the get packet. When the get
-     * response comes from the target, it will contain the request
-     * handle. */
-    rma_op->request = MPID_Request_create();
-    if (rma_op->request == NULL) {
-        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
-    }
-
-    MPIU_Object_set_ref(rma_op->request, 2);
-
-    rma_op->request->dev.user_buf = rma_op->origin_addr;
-    rma_op->request->dev.user_count = rma_op->origin_count;
-    rma_op->request->dev.datatype = rma_op->origin_datatype;
-    rma_op->request->dev.target_win_handle = MPI_WIN_NULL;
-    rma_op->request->dev.source_win_handle = get_pkt->source_win_handle;
-    if (!MPIR_DATATYPE_IS_PREDEFINED(rma_op->request->dev.datatype)) {
-        MPID_Datatype_get_ptr(rma_op->request->dev.datatype, dtp);
-        rma_op->request->dev.datatype_ptr = dtp;
-        /* this will cause the datatype to be freed when the
-         * request is freed. */
-    }
-
-    get_pkt->request_handle = rma_op->request->handle;
-    get_pkt->flags = flags;
-
-/*    printf("send pkt: type %d, addr %d, count %d, base %d\n", rma_pkt->type,
-           rma_pkt->addr, rma_pkt->count, win_ptr->base_addrs[rma_op->target_rank]);
-    fflush(stdout);
-*/
-
-    comm_ptr = win_ptr->comm_ptr;
-    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
-
-    MPIDI_CH3_PKT_RMA_GET_TARGET_DATATYPE(rma_op->pkt, target_datatype, mpi_errno);
-    if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
-        /* basic datatype on target. simply send the get_pkt. */
-        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno = MPIDI_CH3_iStartMsg(vc, get_pkt, sizeof(*get_pkt), &req);
-        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-    }
-    else {
-        /* derived datatype on target. fill derived datatype info and
-         * send it along with get_pkt. */
-
-        MPID_Datatype_get_ptr(target_datatype, dtp);
-        rma_op->dtype_info.is_contig = dtp->is_contig;
-        rma_op->dtype_info.max_contig_blocks = dtp->max_contig_blocks;
-        rma_op->dtype_info.size = dtp->size;
-        rma_op->dtype_info.extent = dtp->extent;
-        rma_op->dtype_info.dataloop_size = dtp->dataloop_size;
-        rma_op->dtype_info.dataloop_depth = dtp->dataloop_depth;
-        rma_op->dtype_info.eltype = dtp->eltype;
-        rma_op->dtype_info.dataloop = dtp->dataloop;
-        rma_op->dtype_info.ub = dtp->ub;
-        rma_op->dtype_info.lb = dtp->lb;
-        rma_op->dtype_info.true_ub = dtp->true_ub;
-        rma_op->dtype_info.true_lb = dtp->true_lb;
-        rma_op->dtype_info.has_sticky_ub = dtp->has_sticky_ub;
-        rma_op->dtype_info.has_sticky_lb = dtp->has_sticky_lb;
-
-        MPIU_CHKPMEM_MALLOC(rma_op->dataloop, void *, dtp->dataloop_size, mpi_errno, "dataloop");
-
-        MPIDI_FUNC_ENTER(MPID_STATE_MEMCPY);
-        MPIU_Memcpy(rma_op->dataloop, dtp->dataloop, dtp->dataloop_size);
-        MPIDI_FUNC_EXIT(MPID_STATE_MEMCPY);
-
-        /* the dataloop can have undefined padding sections, so we need to let
-         * valgrind know that it is OK to pass this data to writev later on */
-        MPL_VG_MAKE_MEM_DEFINED(rma_op->dataloop, dtp->dataloop_size);
-
-        get_pkt->dataloop_size = dtp->dataloop_size;
-
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*get_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) & rma_op->dtype_info;
-        iov[1].MPID_IOV_LEN = sizeof(rma_op->dtype_info);
-        iov[2].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) rma_op->dataloop;
-        iov[2].MPID_IOV_LEN = dtp->dataloop_size;
-
-        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, 3, &req);
-        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-
-        /* release the target datatype */
-        MPID_Datatype_release(dtp);
-    }
-
-    if (mpi_errno != MPI_SUCCESS) {
-        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-    }
-
-    /* release the request returned by iStartMsg or iStartMsgv */
-    if (req != NULL) {
-        MPID_Request_release(req);
-    }
-
-  fn_exit:
-    MPIDI_RMA_FUNC_EXIT(MPID_STATE_RECV_RMA_MSG);
-    return mpi_errno;
-
-    /* --BEGIN ERROR HANDLING-- */
-  fn_fail:
-    MPIU_CHKPMEM_REAP();
-    goto fn_exit;
-    /* --END ERROR HANDLING-- */
-}
-
-
 #undef FUNCNAME
 #define FUNCNAME send_flush_msg
 #undef FCNAME

http://git.mpich.org/mpich.git/commitdiff/e8d4c6d575ee69bcc8e342b7375ce8c061d3dc5b

commit e8d4c6d575ee69bcc8e342b7375ce8c061d3dc5b
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Nov 2 22:47:32 2014 -0600

    Add IMMED area in packet header.
    
    We add an IMMED data area (16 bytes by default) in
    packet header which will contain as much origin
    data as possible. If origin can put all data in
    packet header, then it no longer needs to send
    separate data packet. When target receives the
    packet header, it will first copy data out from
    the IMMED data area. If there is still more
    data coming, it continues to receive following
    packets; if all data is included in header, then
    receiving is done.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index 467cad9..8f1fcf8 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -23,7 +23,7 @@
 #define MPIDI_EAGER_SHORT_SIZE 16
 
 /* This is the number of ints that can be carried within an RMA packet */
-#define MPIDI_RMA_IMMED_INTS 1
+#define MPIDI_RMA_IMMED_BYTES 16
 
 /* Union over all types (integer, logical, and multi-language types) that are
    allowed in a CAS operation.  This is used to allocate enough space in the
@@ -45,9 +45,6 @@ typedef union {
    Fetch-and-op operation.  This can be too large for the packet header, so we
    limit the immediate space in the header to FOP_IMMED_INTS. */
 
-#define MPIDI_RMA_FOP_IMMED_INTS 2
-#define MPIDI_RMA_FOP_RESP_IMMED_INTS 8
-
 /* *INDENT-OFF* */
 /* Indentation turned off because "indent" is getting confused with
  * the lack of a semi-colon in the fields below */
@@ -411,6 +408,8 @@ typedef struct MPIDI_CH3_Pkt_put {
     MPI_Win source_win_handle;  /* Used in the last RMA operation in an
                                  * epoch in the case of passive target rma
                                  * with shared locks. Otherwise set to NULL*/
+    char data[MPIDI_RMA_IMMED_BYTES];
+    size_t immed_len;
 } MPIDI_CH3_Pkt_put_t;
 
 typedef struct MPIDI_CH3_Pkt_get {
@@ -454,6 +453,8 @@ typedef struct MPIDI_CH3_Pkt_accum {
     MPI_Win source_win_handle;  /* Used in the last RMA operation in an
                                  * epoch in the case of passive target rma
                                  * with shared locks. Otherwise set to NULL*/
+    char data[MPIDI_RMA_IMMED_BYTES];
+    size_t immed_len;
 } MPIDI_CH3_Pkt_accum_t;
 
 typedef struct MPIDI_CH3_Pkt_get_accum {
@@ -472,6 +473,8 @@ typedef struct MPIDI_CH3_Pkt_get_accum {
     MPI_Win source_win_handle;  /* Used in the last RMA operation in an
                                  * epoch in the case of passive target rma
                                  * with shared locks. Otherwise set to NULL*/
+    char data[MPIDI_RMA_IMMED_BYTES];
+    size_t immed_len;
 } MPIDI_CH3_Pkt_get_accum_t;
 
 typedef struct MPIDI_CH3_Pkt_get_accum_resp {
@@ -539,13 +542,15 @@ typedef struct MPIDI_CH3_Pkt_fop {
                                  * epoch for decrementing rma op counter in
                                  * active target rma and for unlocking window
                                  * in passive target rma. Otherwise set to NULL*/
-    int origin_data[MPIDI_RMA_FOP_IMMED_INTS];
+    char data[MPIDI_RMA_IMMED_BYTES];
+    int immed_len;
 } MPIDI_CH3_Pkt_fop_t;
 
 typedef struct MPIDI_CH3_Pkt_fop_resp {
     MPIDI_CH3_Pkt_type_t type;
     MPI_Request request_handle;
-    int data[MPIDI_RMA_FOP_RESP_IMMED_INTS];
+    char data[MPIDI_RMA_IMMED_BYTES];
+    int immed_len;
     /* followings are used to decrement ack_counter at orign */
     int target_rank;
     MPI_Win source_win_handle;
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index 93856ab..0f522af 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -465,6 +465,7 @@ typedef struct MPIDI_Request {
     MPI_Op op;
     /* For accumulate, since data is first read into a tmp_buf */
     void *real_user_buf;
+    void *final_user_buf;
     /* For derived datatypes at target */
     struct MPIDI_RMA_dtype_info *dtype_info;
     void *dataloop;
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index e690d05..10db50b 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -307,10 +307,12 @@ static inline int do_accumulate_op(MPID_Request *rreq)
 
     MPIDI_FUNC_ENTER(MPID_STATE_DO_ACCUMULATE_OP);
 
+    MPIU_Assert(rreq->dev.final_user_buf != NULL);
+
     if (rreq->dev.op == MPI_REPLACE)
     {
         /* simply copy the data */
-        mpi_errno = MPIR_Localcopy(rreq->dev.user_buf, rreq->dev.user_count,
+        mpi_errno = MPIR_Localcopy(rreq->dev.final_user_buf, rreq->dev.user_count,
                                    rreq->dev.datatype,
                                    rreq->dev.real_user_buf,
                                    rreq->dev.user_count,
@@ -336,7 +338,7 @@ static inline int do_accumulate_op(MPID_Request *rreq)
 
     if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype))
     {
-        (*uop)(rreq->dev.user_buf, rreq->dev.real_user_buf,
+        (*uop)(rreq->dev.final_user_buf, rreq->dev.real_user_buf,
                &(rreq->dev.user_count), &(rreq->dev.datatype));
     }
     else
@@ -385,7 +387,7 @@ static inline int do_accumulate_op(MPID_Request *rreq)
         for (i=0; i<vec_len; i++)
 	{
             MPIU_Assign_trunc(count, (dloop_vec[i].DLOOP_VECTOR_LEN)/type_size, int);
-            (*uop)((char *)rreq->dev.user_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
+            (*uop)((char *)rreq->dev.final_user_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
                    (char *)rreq->dev.real_user_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
                    &count, &type);
         }
@@ -397,7 +399,7 @@ static inline int do_accumulate_op(MPID_Request *rreq)
  fn_exit:
     /* free the temporary buffer */
     MPIR_Type_get_true_extent_impl(rreq->dev.datatype, &true_lb, &true_extent);
-    MPIU_Free((char *) rreq->dev.user_buf + true_lb);
+    MPIU_Free((char *) rreq->dev.final_user_buf + true_lb);
 
     MPIDI_FUNC_EXIT(MPID_STATE_DO_ACCUMULATE_OP);
 
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index a9201f0..67e97a1 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -371,6 +371,7 @@ int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((u
     tmp_buf = (void *)((char*)tmp_buf - true_lb);
     
     rreq->dev.user_buf = tmp_buf;
+    rreq->dev.final_user_buf = rreq->dev.user_buf;
     rreq->dev.datatype = new_dtp->handle;
     rreq->dev.recv_data_sz = new_dtp->size *
 	rreq->dev.user_count; 
@@ -442,6 +443,7 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((
     tmp_buf = (void *)((char*)tmp_buf - true_lb);
 
     rreq->dev.user_buf = tmp_buf;
+    rreq->dev.final_user_buf = rreq->dev.user_buf;
     rreq->dev.datatype = new_dtp->handle;
     rreq->dev.recv_data_sz = new_dtp->size *
 	rreq->dev.user_count;
diff --git a/src/mpid/ch3/src/ch3u_request.c b/src/mpid/ch3/src/ch3u_request.c
index d7602f3..e229f2f 100644
--- a/src/mpid/ch3/src/ch3u_request.c
+++ b/src/mpid/ch3/src/ch3u_request.c
@@ -92,6 +92,7 @@ MPID_Request * MPID_Request_create(void)
         req->dev.OnDataAvail       = NULL;
         req->dev.OnFinal           = NULL;
         req->dev.user_buf          = NULL;
+        req->dev.final_user_buf    = NULL;
 #ifdef MPIDI_CH3_REQUEST_INIT
 	MPIDI_CH3_REQUEST_INIT(req);
 #endif
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index a110a26..377eb67 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -109,6 +109,7 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
         put_pkt->dataloop_size = 0;
         put_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
         put_pkt->source_win_handle = win_ptr->handle;
+        put_pkt->immed_len = 0;
 
         /* FIXME: For contig and very short operations, use a streamlined op */
         new_ptr->origin_addr = (void *) origin_addr;
@@ -133,6 +134,28 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
             new_ptr->is_dt = 1;
         }
 
+        /* If both origin and target are basic datatype, try to
+           copy origin data to packet header as much as possible. */
+        if (!new_ptr->is_dt) {
+            size_t len;
+            MPI_Aint origin_type_size;
+
+            MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
+            /* length of origin data */
+            MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
+            /* length of origin data that can fit into immed area in pkt header */
+            MPIU_Assign_trunc(put_pkt->immed_len,
+                              MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
+                              size_t);
+
+            if (put_pkt->immed_len > 0) {
+                void *src = new_ptr->origin_addr, *dest = put_pkt->data;
+                /* copy data from origin buffer to immed area in packet header */
+                mpi_errno = immed_copy(src, dest, put_pkt->immed_len);
+                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            }
+        }
+
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
@@ -418,6 +441,7 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
         accum_pkt->op = op;
         accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
         accum_pkt->source_win_handle = win_ptr->handle;
+        accum_pkt->immed_len = 0;
 
         new_ptr->origin_addr = (void *) origin_addr;
         new_ptr->origin_count = origin_count;
@@ -441,6 +465,28 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
             new_ptr->is_dt = 1;
         }
 
+        /* If both origin and target are basic datatype, try to
+           copy origin data to packet header as much as possible. */
+        if (!new_ptr->is_dt) {
+            size_t len;
+            MPI_Aint origin_type_size;
+
+            MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
+            /* length of origin data */
+            MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
+            /* length of origin data that can fit into immed areas in packet header */
+            MPIU_Assign_trunc(accum_pkt->immed_len,
+                              MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
+                              size_t);
+
+            if (accum_pkt->immed_len > 0) {
+                void *src = new_ptr->origin_addr, *dest = accum_pkt->data;
+                /* copy data from origin buffer to immed area in packet header */
+                mpi_errno = immed_copy(src, dest, accum_pkt->immed_len);
+                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            }
+        }
+
  issue_ops:
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
@@ -590,6 +636,7 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
             get_accum_pkt->op = op;
             get_accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
             get_accum_pkt->source_win_handle = win_ptr->handle;
+            get_accum_pkt->immed_len = 0;
 
             new_ptr->origin_addr = (void *) origin_addr;
             new_ptr->origin_count = origin_count;
@@ -616,6 +663,28 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
                 MPID_Datatype_add_ref(dtp);
                 new_ptr->is_dt = 1;
             }
+
+            /* If all buffers are basic datatype, try to copy origin data to
+               packet header as much as possible. */
+            if (!new_ptr->is_dt) {
+                size_t len;
+                MPI_Aint origin_type_size;
+
+                MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
+                /* length of origin data */
+                MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
+                /* length of origin data that can fit into immed area in packet header */
+                MPIU_Assign_trunc(get_accum_pkt->immed_len,
+                                  MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
+                                  size_t);
+
+                if (get_accum_pkt->immed_len > 0) {
+                    void *src = new_ptr->origin_addr, *dest = get_accum_pkt->data;
+                    /* copy data from origin buffer to immed area in packet header */
+                    mpi_errno = immed_copy(src, dest, get_accum_pkt->immed_len);
+                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                }
+            }
         }
 
         mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
@@ -857,6 +926,9 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
         }
         else {
             MPIDI_CH3_Pkt_fop_t *fop_pkt = &(new_ptr->pkt.fop);
+            size_t len;
+            MPI_Aint origin_type_size;
+
             MPIDI_Pkt_init(fop_pkt, MPIDI_CH3_PKT_FOP);
             fop_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
                 win_ptr->disp_units[target_rank] * target_disp;
@@ -864,6 +936,7 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             fop_pkt->op = op;
             fop_pkt->source_win_handle = win_ptr->handle;
             fop_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
+            fop_pkt->immed_len = 0;
 
             new_ptr->origin_addr = (void *) origin_addr;
             new_ptr->origin_count = 1;
@@ -871,6 +944,21 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             new_ptr->result_addr = result_addr;
             new_ptr->result_datatype = datatype;
             new_ptr->target_rank = target_rank;
+
+            MPID_Datatype_get_size_macro(new_ptr->origin_datatype, origin_type_size);
+            /* length of origin data */
+            MPIU_Assign_trunc(len, new_ptr->origin_count * origin_type_size, size_t);
+            /* length of origin data that can fit into immed area in pkt header */
+            MPIU_Assign_trunc(fop_pkt->immed_len,
+                              MPIR_MIN(len, (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size),
+                              size_t);
+
+            if (fop_pkt->immed_len > 0) {
+                void *src = new_ptr->origin_addr, *dest = fop_pkt->data;
+                /* copy data from origin buffer to immed area in packet header */
+                mpi_errno = immed_copy(src, dest, fop_pkt->immed_len);
+                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            }
         }
 
         mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 2e0b647..8daa90a 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -59,6 +59,26 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         MPID_Datatype_get_size_macro(put_pkt->datatype, type_size);
         req->dev.recv_data_sz = type_size * put_pkt->count;
 
+        if (put_pkt->immed_len > 0) {
+            /* See if we can receive some data from packet header. */
+            MPIU_Memcpy(req->dev.user_buf, put_pkt->data, put_pkt->immed_len);
+            req->dev.user_buf = (void*)((char*)req->dev.user_buf + put_pkt->immed_len);
+            req->dev.recv_data_sz -= put_pkt->immed_len;
+        }
+
+        if (req->dev.recv_data_sz == 0) {
+            /* All data received, trigger req handler. */
+
+            *buflen = sizeof(MPIDI_CH3_Pkt_t);
+            mpi_errno = MPIDI_CH3_ReqHandler_PutRecvComplete(vc, req, &complete);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+            if (complete) {
+                *rreqp = NULL;
+                goto fn_exit;
+            }
+        }
+
         mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
         MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
                              "**ch3|postrecv %s", "MPIDI_CH3_PKT_PUT");
@@ -349,10 +369,29 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         }
 
         req->dev.user_buf = tmp_buf;
+        req->dev.final_user_buf = req->dev.user_buf;
 
         MPID_Datatype_get_size_macro(accum_pkt->datatype, type_size);
         req->dev.recv_data_sz = type_size * accum_pkt->count;
 
+        if (accum_pkt->immed_len > 0) {
+            /* See if we can receive some data from packet header. */
+            MPIU_Memcpy(req->dev.user_buf, accum_pkt->data, accum_pkt->immed_len);
+            req->dev.user_buf = (void*)((char*)req->dev.user_buf + accum_pkt->immed_len);
+            req->dev.recv_data_sz -= accum_pkt->immed_len;
+        }
+
+        if (req->dev.recv_data_sz == 0) {
+            /* All data received, trigger req handler. */
+            *buflen = sizeof(MPIDI_CH3_Pkt_t);
+            mpi_errno = MPIDI_CH3_ReqHandler_AccumRecvComplete(vc, req, &complete);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            if (complete) {
+                *rreqp = NULL;
+                goto fn_exit;
+            }
+        }
+
         mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
         MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
                              "**ch3|postrecv %s", "MPIDI_CH3_PKT_ACCUMULATE");
@@ -496,10 +535,31 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         }
 
         req->dev.user_buf = tmp_buf;
+        req->dev.final_user_buf = req->dev.user_buf;
 
         MPID_Datatype_get_size_macro(get_accum_pkt->datatype, type_size);
         req->dev.recv_data_sz = type_size * get_accum_pkt->count;
 
+        if (get_accum_pkt->immed_len > 0) {
+            /* See if we can receive some data from packet header. */
+            MPIU_Memcpy(req->dev.user_buf, get_accum_pkt->data, get_accum_pkt->immed_len);
+            req->dev.user_buf = (void*)((char*)req->dev.user_buf + get_accum_pkt->immed_len);
+            req->dev.recv_data_sz -= get_accum_pkt->immed_len;
+        }
+
+        if (req->dev.recv_data_sz == 0) {
+            /* All data received. */
+
+            *buflen = sizeof(MPIDI_CH3_Pkt_t);
+
+            mpi_errno = MPIDI_CH3_ReqHandler_GaccumRecvComplete(vc, req, &complete);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            if (complete) {
+                *rreqp = NULL;
+                goto fn_exit;
+            }
+        }
+
         mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
         MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
                              "**ch3|postrecv %s", "MPIDI_CH3_PKT_ACCUMULATE");

http://git.mpich.org/mpich.git/commitdiff/1c638a123cef60a81e9f7a91a8dbaa6968a9854c

commit 1c638a123cef60a81e9f7a91a8dbaa6968a9854c
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Oct 28 14:54:14 2014 -0500

    Add useful pkt wrappers.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index 205f2eb..467cad9 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -198,24 +198,24 @@ MPIDI_CH3_PKT_DEFS
 #define MPIDI_CH3_PKT_RMA_GET_TARGET_DATATYPE(pkt_, datatype_, err_)    \
     {                                                                   \
         err_ = MPI_SUCCESS;                                             \
-        switch(pkt_.type) {                                             \
+        switch((pkt_).type) {                                           \
         case (MPIDI_CH3_PKT_PUT):                                       \
-            datatype_ = pkt_.put.datatype;                              \
+            datatype_ = (pkt_).put.datatype;                            \
             break;                                                      \
         case (MPIDI_CH3_PKT_GET):                                       \
-            datatype_ = pkt_.get.datatype;                              \
+            datatype_ = (pkt_).get.datatype;                            \
             break;                                                      \
         case (MPIDI_CH3_PKT_ACCUMULATE):                                \
-            datatype_ = pkt_.accum.datatype;                            \
+            datatype_ = (pkt_).accum.datatype;                          \
             break;                                                      \
         case (MPIDI_CH3_PKT_GET_ACCUM):                                 \
-            datatype_ = pkt_.get_accum.datatype;                        \
+            datatype_ = (pkt_).get_accum.datatype;                      \
             break;                                                      \
         case (MPIDI_CH3_PKT_CAS):                                       \
-            datatype_ = pkt_.cas.datatype;                              \
+            datatype_ = (pkt_).cas.datatype;                            \
             break;                                                      \
         case (MPIDI_CH3_PKT_FOP):                                       \
-            datatype_ = pkt_.fop.datatype;                              \
+            datatype_ = (pkt_).fop.datatype;                            \
             break;                                                      \
         case (MPIDI_CH3_PKT_LOCK_PUT_UNLOCK):                           \
             datatype_ = pkt_.lock_put_unlock.datatype;                  \
@@ -230,10 +230,173 @@ MPIDI_CH3_PKT_DEFS
             datatype_ = pkt_.accum_immed.datatype;                      \
             break;                                                      \
         default:                                                        \
-            MPIU_ERR_SETANDJUMP1(err_, MPI_ERR_OTHER, "**invalidpkt", "**invalidpkt %d", pkt_.type); \
+            MPIU_ERR_SETANDJUMP1(err_, MPI_ERR_OTHER, "**invalidpkt", "**invalidpkt %d", (pkt_).type); \
         }                                                               \
     }
 
+#define MPIDI_CH3_PKT_RMA_GET_LOCK_TYPE(pkt_, lock_type_, err_)         \
+    {                                                                   \
+        err_ = MPI_SUCCESS;                                             \
+        switch((pkt_).type) {                                           \
+        case (MPIDI_CH3_PKT_PUT):                                       \
+            lock_type_ = (pkt_).put.lock_type;                          \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_GET):                                       \
+            lock_type_ = (pkt_).get.lock_type;                          \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_ACCUMULATE):                                \
+            lock_type_ = (pkt_).accum.lock_type;                        \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_GET_ACCUM):                                 \
+            lock_type_ = (pkt_).get_accum.lock_type;                    \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_CAS):                                       \
+            lock_type_ = (pkt_).cas.lock_type;                          \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_FOP):                                       \
+            lock_type_ = (pkt_).fop.lock_type;                          \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_LOCK):                                      \
+            lock_type_ = (pkt_).lock.lock_type;                         \
+            break;                                                      \
+        default:                                                        \
+            MPIU_ERR_SETANDJUMP1(err_, MPI_ERR_OTHER, "**invalidpkt", "**invalidpkt %d", (pkt_).type); \
+        }                                                               \
+    }
+
+#define MPIDI_CH3_PKT_RMA_GET_ORIGIN_RANK(pkt_, origin_rank_, err_)     \
+    {                                                                   \
+        err_ = MPI_SUCCESS;                                             \
+        switch((pkt_).type) {                                           \
+        case (MPIDI_CH3_PKT_PUT):                                       \
+            origin_rank_ = (pkt_).put.origin_rank;                      \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_GET):                                       \
+            origin_rank_ = (pkt_).get.origin_rank;                      \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_ACCUMULATE):                                \
+            origin_rank_ = (pkt_).accum.origin_rank;                    \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_GET_ACCUM):                                 \
+            origin_rank_ = (pkt_).get_accum.origin_rank;                \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_CAS):                                       \
+            origin_rank_ = (pkt_).cas.origin_rank;                      \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_FOP):                                       \
+            origin_rank_ = (pkt_).fop.origin_rank;                      \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_LOCK):                                      \
+            origin_rank_ = (pkt_).lock.origin_rank;                     \
+            break;                                                      \
+        default:                                                        \
+            MPIU_ERR_SETANDJUMP1(err_, MPI_ERR_OTHER, "**invalidpkt", "**invalidpkt %d", (pkt_).type); \
+        }                                                               \
+    }
+
+#define MPIDI_CH3_PKT_RMA_GET_FLAGS(pkt_, flags_, err_)                 \
+    {                                                                   \
+        err_ = MPI_SUCCESS;                                             \
+        switch((pkt_).type) {                                           \
+        case (MPIDI_CH3_PKT_PUT):                                       \
+            flags_ = (pkt_).put.flags;                                  \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_GET):                                       \
+            flags_ = (pkt_).get.flags;                                  \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_ACCUMULATE):                                \
+            flags_ = (pkt_).accum.flags;                                \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_GET_ACCUM):                                 \
+            flags_ = (pkt_).get_accum.flags;                            \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_CAS):                                       \
+            flags_ = (pkt_).cas.flags;                                  \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_FOP):                                       \
+            flags_ = (pkt_).fop.flags;                                  \
+            break;                                                      \
+        default:                                                        \
+            MPIU_ERR_SETANDJUMP1(err_, MPI_ERR_OTHER, "**invalidpkt", "**invalidpkt %d", (pkt_).type); \
+        }                                                               \
+    }
+
+#define MPIDI_CH3_PKT_RMA_UNSET_FLAG(pkt_, flag_, err_)                 \
+    {                                                                   \
+        err_ = MPI_SUCCESS;                                             \
+        switch((pkt_).type) {                                           \
+        case (MPIDI_CH3_PKT_PUT):                                       \
+            (pkt_).put.flags &= ~(flag_);                               \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_GET):                                       \
+            (pkt_).get.flags &= ~(flag_);                               \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_ACCUMULATE):                                \
+            (pkt_).accum.flags &= ~(flag_);                             \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_GET_ACCUM):                                 \
+            (pkt_).get_accum.flags &= ~(flag_);                         \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_CAS):                                       \
+            (pkt_).cas.flags &= ~(flag_);                               \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_FOP):                                       \
+            (pkt_).fop.flags &= ~(flag_);                               \
+            break;                                                      \
+        default:                                                        \
+            MPIU_ERR_SETANDJUMP1(err_, MPI_ERR_OTHER, "**invalidpkt", "**invalidpkt %d", (pkt_).type); \
+        }                                                               \
+    }
+
+#define MPIDI_CH3_PKT_RMA_SET_FLAG(pkt_, flag_, err_)                   \
+    {                                                                   \
+        err_ = MPI_SUCCESS;                                             \
+        switch((pkt_).type) {                                           \
+        case (MPIDI_CH3_PKT_PUT):                                       \
+            (pkt_).put.flags |= (flag_);                                \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_GET):                                       \
+            (pkt_).get.flags |= (flag_);                                \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_ACCUMULATE):                                \
+            (pkt_).accum.flags |= (flag_);                              \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_GET_ACCUM):                                 \
+            (pkt_).get_accum.flags |= (flag_);                          \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_CAS):                                       \
+            (pkt_).cas.flags |= (flag_);                                \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_FOP):                                       \
+            (pkt_).fop.flags |= (flag_);                                \
+            break;                                                      \
+        default:                                                        \
+            MPIU_ERR_SETANDJUMP1(err_, MPI_ERR_OTHER, "**invalidpkt", "**invalidpkt %d", (pkt_).type); \
+        }                                                               \
+    }
+
+#define MPIDI_CH3_PKT_RMA_SET_DATALOOP_SIZE(pkt_, dataloop_size_, err_) \
+    {                                                                   \
+        err_ = MPI_SUCCESS;                                             \
+        switch((pkt_).type) {                                           \
+        case (MPIDI_CH3_PKT_PUT):                                       \
+            (pkt_).put.dataloop_size = (dataloop_size_);                \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_GET):                                       \
+            (pkt_).get.dataloop_size = (dataloop_size_);                \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_ACCUMULATE):                                \
+            (pkt_).accum.dataloop_size = (dataloop_size_);              \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_GET_ACCUM):                                 \
+            (pkt_).get_accum.dataloop_size = (dataloop_size_);          \
+            break;                                                      \
+        default:                                                        \
+            MPIU_ERR_SETANDJUMP1(err_, MPI_ERR_OTHER, "**invalidpkt", "**invalidpkt %d", (pkt_).type); \
+        }                                                               \
+    }
+
+
 typedef struct MPIDI_CH3_Pkt_put {
     MPIDI_CH3_Pkt_type_t type;
     MPIDI_CH3_Pkt_flags_t flags;

http://git.mpich.org/mpich.git/commitdiff/d129eed393874a2076dadf3d28d541b66a3af142

commit d129eed393874a2076dadf3d28d541b66a3af142
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Nov 2 22:43:47 2014 -0600

    code refactoring in operation routines.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index b6a9084..a110a26 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -566,6 +566,17 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
             new_ptr->origin_count = result_count;
             new_ptr->origin_datatype = result_datatype;
             new_ptr->target_rank = target_rank;
+
+            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
+                MPID_Datatype_get_ptr(result_datatype, dtp);
+                MPID_Datatype_add_ref(dtp);
+                new_ptr->is_dt = 1;
+            }
+            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
+                MPID_Datatype_get_ptr(target_datatype, dtp);
+                MPID_Datatype_add_ref(dtp);
+                new_ptr->is_dt = 1;
+            }
         }
 
         else {
@@ -587,30 +598,30 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
             new_ptr->result_count = result_count;
             new_ptr->result_datatype = result_datatype;
             new_ptr->target_rank = target_rank;
+
+            /* if source or target datatypes are derived, increment their
+             * reference counts */
+            if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
+                MPID_Datatype_get_ptr(origin_datatype, dtp);
+                MPID_Datatype_add_ref(dtp);
+                new_ptr->is_dt = 1;
+            }
+            if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
+                MPID_Datatype_get_ptr(result_datatype, dtp);
+                MPID_Datatype_add_ref(dtp);
+                new_ptr->is_dt = 1;
+            }
+            if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
+                MPID_Datatype_get_ptr(target_datatype, dtp);
+                MPID_Datatype_add_ref(dtp);
+                new_ptr->is_dt = 1;
+            }
         }
 
         mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
 
-        /* if source or target datatypes are derived, increment their
-         * reference counts */
-        if (op != MPI_NO_OP && !MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
-            MPID_Datatype_get_ptr(origin_datatype, dtp);
-            MPID_Datatype_add_ref(dtp);
-            new_ptr->is_dt = 1;
-        }
-        if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
-            MPID_Datatype_get_ptr(result_datatype, dtp);
-            MPID_Datatype_add_ref(dtp);
-            new_ptr->is_dt = 1;
-        }
-        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
-            MPID_Datatype_get_ptr(target_datatype, dtp);
-            MPID_Datatype_add_ref(dtp);
-            new_ptr->is_dt = 1;
-        }
-
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
@@ -822,27 +833,45 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
     }
     else {
         MPIDI_RMA_Op_t *new_ptr = NULL;
-        MPIDI_CH3_Pkt_fop_t *fop_pkt = NULL;
 
         /* Append this operation to the RMA ops queue */
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
         if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-        fop_pkt = &(new_ptr->pkt.fop);
-        MPIDI_Pkt_init(fop_pkt, MPIDI_CH3_PKT_FOP);
-        fop_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
-            win_ptr->disp_units[target_rank] * target_disp;
-        fop_pkt->datatype = datatype;
-        fop_pkt->op = op;
-        fop_pkt->source_win_handle = win_ptr->handle;
-        fop_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
+        if (op == MPI_NO_OP) {
+            /* Convert FOP to a Get */
+            MPIDI_CH3_Pkt_get_t *get_pkt = &(new_ptr->pkt.get);
+            MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
+            get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
+                win_ptr->disp_units[target_rank] * target_disp;
+            get_pkt->count = 1;
+            get_pkt->datatype = datatype;
+            get_pkt->dataloop_size = 0;
+            get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
+            get_pkt->source_win_handle = win_ptr->handle;
 
-        new_ptr->origin_addr = (void *) origin_addr;
-        new_ptr->origin_count = 1;
-        new_ptr->origin_datatype = datatype;
-        new_ptr->result_addr = result_addr;
-        new_ptr->result_datatype = datatype;
-        new_ptr->target_rank = target_rank;
+            new_ptr->origin_addr = result_addr;
+            new_ptr->origin_count = 1;
+            new_ptr->origin_datatype = datatype;
+            new_ptr->target_rank = target_rank;
+        }
+        else {
+            MPIDI_CH3_Pkt_fop_t *fop_pkt = &(new_ptr->pkt.fop);
+            MPIDI_Pkt_init(fop_pkt, MPIDI_CH3_PKT_FOP);
+            fop_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
+                win_ptr->disp_units[target_rank] * target_disp;
+            fop_pkt->datatype = datatype;
+            fop_pkt->op = op;
+            fop_pkt->source_win_handle = win_ptr->handle;
+            fop_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
+
+            new_ptr->origin_addr = (void *) origin_addr;
+            new_ptr->origin_count = 1;
+            new_ptr->origin_datatype = datatype;
+            new_ptr->result_addr = result_addr;
+            new_ptr->result_datatype = datatype;
+            new_ptr->target_rank = target_rank;
+        }
 
         mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
         if (mpi_errno)

http://git.mpich.org/mpich.git/commitdiff/b73778ea104768f493165133ea4dc196c54c5d0c

commit b73778ea104768f493165133ea4dc196c54c5d0c
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Fri Oct 31 11:22:32 2014 -0500

    Decrement Active Target counter at target side.
    
    During PSCW, when there are active-message operations
    to be issued in Win_complete, we piggyback an AT_COMPLETE
    flag on them so that when the target receives it, it can
    decrement a counter on the target side and detect completion
    when the target counter reaches zero.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index f79d384..5892eab 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -116,6 +116,7 @@ static inline MPIDI_RMA_Target_t *MPIDI_CH3I_Win_target_alloc(MPID_Win * win_ptr
     e->lock_mode = 0;
     e->outstanding_lock = 0;
     e->disable_flush_local = 0;
+    e->win_complete_flag = 0;
 
     e->sync.sync_flag = MPIDI_RMA_NONE;
     e->sync.outstanding_acks = 0;
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index 27d137a..3047231 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -96,6 +96,7 @@ typedef struct MPIDI_RMA_Target {
     int lock_mode;              /* e.g., MODE_NO_CHECK */
     int outstanding_lock;
     int disable_flush_local;
+    int win_complete_flag;
 
     /* The target structure is free to be cleaned up when all of the
      * following conditions hold true:
diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index 9eafc3c..2e7647e 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -1815,6 +1815,8 @@ int MPIDI_CH3_PktHandler_Flush( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
                                 MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_FlushAck( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
 				    MPIDI_msg_sz_t *, MPID_Request ** );
+int MPIDI_CH3_PktHandler_DecrAtCnt( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
+                                    MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_LockPutUnlock( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
 					MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_LockAccumUnlock( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index 71a0884..205f2eb 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -89,6 +89,7 @@ typedef enum {
     MPIDI_CH3_PKT_UNLOCK,
     MPIDI_CH3_PKT_FLUSH,
     MPIDI_CH3_PKT_FLUSH_ACK,
+    MPIDI_CH3_PKT_DECR_AT_COUNTER,
     MPIDI_CH3_PKT_LOCK_PUT_UNLOCK,      /* optimization for single puts */
     MPIDI_CH3_PKT_LOCK_GET_UNLOCK,      /* optimization for single gets */
     MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK,    /* optimization for single accumulates */
@@ -121,7 +122,7 @@ typedef enum {
     MPIDI_CH3_PKT_FLAG_RMA_UNLOCK = 2,
     MPIDI_CH3_PKT_FLAG_RMA_FLUSH = 4,
     MPIDI_CH3_PKT_FLAG_RMA_REQ_ACK = 8,
-    MPIDI_CH3_PKT_FLAG_RMA_AT_COMPLETE = 16,
+    MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER = 16,
     MPIDI_CH3_PKT_FLAG_RMA_NOCHECK = 32,
     MPIDI_CH3_PKT_FLAG_RMA_SHARED = 64,
     MPIDI_CH3_PKT_FLAG_RMA_EXCLUSIVE = 128,
@@ -432,6 +433,11 @@ typedef struct MPIDI_CH3_Pkt_flush_ack {
                                  * target state at the origin. */
 } MPIDI_CH3_Pkt_flush_ack_t;
 
+typedef struct MPIDI_CH3_Pkt_decr_at_counter {
+    MPIDI_CH3_Pkt_type_t type;
+    MPI_Win target_win_handle;
+} MPIDI_CH3_Pkt_decr_at_counter_t;
+
 typedef struct MPIDI_CH3_Pkt_lock_put_unlock {
     MPIDI_CH3_Pkt_type_t type;
     MPIDI_CH3_Pkt_flags_t flags;
@@ -506,6 +512,7 @@ typedef union MPIDI_CH3_Pkt {
     MPIDI_CH3_Pkt_unlock_t unlock;
     MPIDI_CH3_Pkt_flush_t flush;
     MPIDI_CH3_Pkt_flush_ack_t flush_ack;
+    MPIDI_CH3_Pkt_decr_at_counter_t decr_at_cnt;
     MPIDI_CH3_Pkt_lock_put_unlock_t lock_put_unlock;
     MPIDI_CH3_Pkt_lock_get_unlock_t lock_get_unlock;
     MPIDI_CH3_Pkt_lock_accum_unlock_t lock_accum_unlock;
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index b534fb3..e690d05 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -187,6 +187,46 @@ static inline int MPIDI_CH3I_Send_flush_ack_pkt(MPIDI_VC_t *vc, MPID_Win *win_pt
 
 
 #undef FUNCNAME
+#define FUNCNAME send_decr_at_cnt_msg
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int send_decr_at_cnt_msg(int dst, MPID_Win * win_ptr)
+{
+    MPIDI_CH3_Pkt_t upkt;
+    MPIDI_CH3_Pkt_decr_at_counter_t *decr_at_cnt_pkt = &upkt.decr_at_cnt;
+    MPIDI_VC_t * vc;
+    MPID_Request *request = NULL;
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_SEND_DECR_AT_CNT_MSG);
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_SEND_DECR_AT_CNT_MSG);
+
+    MPIDI_Pkt_init(decr_at_cnt_pkt, MPIDI_CH3_PKT_DECR_AT_COUNTER);
+    decr_at_cnt_pkt->target_win_handle = win_ptr->all_win_handles[dst];
+
+    MPIDI_Comm_get_vc_set_active(win_ptr->comm_ptr, dst, &vc);
+
+    MPIU_THREAD_CS_ENTER(CH3COMM,vc);
+    mpi_errno = MPIDI_CH3_iStartMsg(vc, decr_at_cnt_pkt,
+                                    sizeof(*decr_at_cnt_pkt), &request);
+    MPIU_THREAD_CS_EXIT(CH3COMM,vc);
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|rmamsg" );
+    }
+
+    if (request != NULL) {
+        MPID_Request_release(request);
+    }
+
+  fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_SEND_DECR_AT_CNT_MSG);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+  fn_fail:
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+#undef FUNCNAME
 #define FUNCNAME acquire_local_lock
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_pkt.c b/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
index 3c80f5b..dd91f39 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
@@ -588,6 +588,8 @@ int MPIDI_CH3_PktHandler_Init( MPIDI_CH3_PktHandler_Fcn *pktArray[],
         MPIDI_CH3_PktHandler_Flush;
     pktArray[MPIDI_CH3_PKT_FLUSH_ACK] =
 	MPIDI_CH3_PktHandler_FlushAck;
+    pktArray[MPIDI_CH3_PKT_DECR_AT_COUNTER] =
+        MPIDI_CH3_PktHandler_DecrAtCnt;
     pktArray[MPIDI_CH3_PKT_LOCK_PUT_UNLOCK] = 
 	MPIDI_CH3_PktHandler_LockPutUnlock;
     pktArray[MPIDI_CH3_PKT_LOCK_ACCUM_UNLOCK] =
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 41bea08..a9201f0 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -93,6 +93,13 @@ int MPIDI_CH3_ReqHandler_PutRecvComplete( MPIDI_VC_t *vc,
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         MPIDI_CH3_Progress_signal_completion();
     }
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER) {
+        win_ptr->at_completion_counter--;
+        MPIU_Assert(win_ptr->at_completion_counter >= 0);
+        /* Signal the local process when the op counter reaches 0. */
+        if (win_ptr->at_completion_counter == 0)
+            MPIDI_CH3_Progress_signal_completion();
+    }
 
     mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
                                                rreq->dev.source_win_handle);
@@ -145,6 +152,13 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete( MPIDI_VC_t *vc,
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         MPIDI_CH3_Progress_signal_completion();
     }
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER) {
+        win_ptr->at_completion_counter--;
+        MPIU_Assert(win_ptr->at_completion_counter >= 0);
+        /* Signal the local process when the op counter reaches 0. */
+        if (win_ptr->at_completion_counter == 0)
+            MPIDI_CH3_Progress_signal_completion();
+    }
 
     mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
                                                rreq->dev.source_win_handle);
@@ -720,6 +734,14 @@ int MPIDI_CH3_ReqHandler_FOPComplete( MPIDI_VC_t *vc,
             }
         }
 
+        if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER) {
+            win_ptr->at_completion_counter--;
+            MPIU_Assert(win_ptr->at_completion_counter >= 0);
+            /* Signal the local process when the op counter reaches 0. */
+            if (win_ptr->at_completion_counter == 0)
+                MPIDI_CH3_Progress_signal_completion();
+        }
+
         /* There are additional steps to take if this is a passive
            target RMA or the last operation from the source */
         mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
diff --git a/src/mpid/ch3/src/ch3u_handle_send_req.c b/src/mpid/ch3/src/ch3u_handle_send_req.c
index abe1052..c3ede3f 100644
--- a/src/mpid/ch3/src/ch3u_handle_send_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_send_req.c
@@ -52,6 +52,14 @@ int MPIDI_CH3_ReqHandler_GetSendComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
 
     MPID_Win_get_ptr(sreq->dev.target_win_handle, win_ptr);
 
+    if (sreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER) {
+        win_ptr->at_completion_counter--;
+        MPIU_Assert(win_ptr->at_completion_counter >= 0);
+        /* Signal the local process when the op counter reaches 0. */
+        if (win_ptr->at_completion_counter == 0)
+            MPIDI_CH3_Progress_signal_completion();
+    }
+
     mpi_errno = MPIDI_CH3_Finish_rma_op_target(NULL, win_ptr, FALSE, sreq->dev.flags, MPI_WIN_NULL);
     if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
 
@@ -96,6 +104,14 @@ int MPIDI_CH3_ReqHandler_GaccumLikeSendComplete( MPIDI_VC_t *vc,
     win_ptr->at_completion_counter--;
     MPIU_Assert(win_ptr->at_completion_counter >= 0);
 
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER) {
+        win_ptr->at_completion_counter--;
+        MPIU_Assert(win_ptr->at_completion_counter >= 0);
+        /* Signal the local process when the op counter reaches 0. */
+        if (win_ptr->at_completion_counter == 0)
+            MPIDI_CH3_Progress_signal_completion();
+    }
+
     MPIDI_CH3U_Request_complete(rreq);
     *complete = TRUE;
  fn_exit:
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 89fa944..2e0b647 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -36,24 +36,6 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received put pkt");
 
-    if (put_pkt->count == 0) {
-        /* it's a 0-byte message sent just to decrement the
-         * completion counter. This happens only in
-         * post/start/complete/wait sync model; therefore, no need
-         * to check lock queue. */
-        if (put_pkt->target_win_handle != MPI_WIN_NULL) {
-            MPID_Win_get_ptr(put_pkt->target_win_handle, win_ptr);
-            mpi_errno =
-                MPIDI_CH3_Finish_rma_op_target(NULL, win_ptr, TRUE, put_pkt->flags, MPI_WIN_NULL);
-            if (mpi_errno) {
-                MPIU_ERR_POP(mpi_errno);
-            }
-        }
-        *buflen = sizeof(MPIDI_CH3_Pkt_t);
-        *rreqp = NULL;
-        goto fn_exit;
-    }
-
     MPIU_Assert(put_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(put_pkt->target_win_handle, win_ptr);
     mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, put_pkt->flags);
@@ -757,6 +739,13 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             MPID_Request_release(req);
     }
 
+    if (cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER) {
+        win_ptr->at_completion_counter--;
+        MPIU_Assert(win_ptr->at_completion_counter >= 0);
+        /* Signal the local process when the op counter reaches 0. */
+        if (win_ptr->at_completion_counter == 0)
+            MPIDI_CH3_Progress_signal_completion();
+    }
 
     /* There are additional steps to take if this is a passive
      * target RMA or the last operation from the source */
@@ -1605,6 +1594,38 @@ int MPIDI_CH3_PktHandler_FlushAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
 
 #undef FUNCNAME
+#define FUNCNAME MPIDI_CH3_PktHandler_DecrAtCnt
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPIDI_CH3_PktHandler_DecrAtCnt(MPIDI_VC_t * vc ATTRIBUTE((unused)),
+                                   MPIDI_CH3_Pkt_t * pkt,
+                                   MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
+{   /* Packet handler for MPIDI_CH3_PKT_DECR_AT_COUNTER: decrements the addressed window's at_completion_counter. */
+    MPIDI_CH3_Pkt_decr_at_counter_t *decr_at_cnt_pkt = &pkt->decr_at_cnt;
+    MPID_Win *win_ptr;
+    int mpi_errno = MPI_SUCCESS;        /* not assigned again in this body, so MPI_SUCCESS is what gets returned */
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_DECRATCNT);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_DECRATCNT);
+
+    MPID_Win_get_ptr(decr_at_cnt_pkt->target_win_handle, win_ptr);      /* resolve the window named by the packet's target_win_handle */
+
+    win_ptr->at_completion_counter--;
+    MPIU_Assert(win_ptr->at_completion_counter >= 0);   /* the counter must not go negative */
+
+    *buflen = sizeof(MPIDI_CH3_Pkt_t);  /* NOTE(review): presumably the number of bytes consumed (header only) -- confirm against the caller */
+    *rreqp = NULL;                      /* no request is handed back to the caller */
+    MPIDI_CH3_Progress_signal_completion();     /* called unconditionally, not only when the counter reaches 0 */
+
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_DECRATCNT);
+    return mpi_errno;
+   fn_fail:      /* NOTE(review): nothing visible in this function jumps to this label */
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
 #define FUNCNAME MPIDI_CH3_PktHandler_Unlock
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)

http://git.mpich.org/mpich.git/commitdiff/6578785d10e4e10fb0a46cf3f1f7ba731112f591

commit 6578785d10e4e10fb0a46cf3f1f7ba731112f591
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sat Nov 1 19:02:21 2014 -0500

    Detect remote completion by FLUSH / FLUSH_ACK messages.
    
    When the origin wants to do a FLUSH sync, if there are
    active-message operations that are going to be issued,
    we piggyback the FLUSH message on the last operation;
    if there are no such operations, we just send a single FLUSH packet.
    
    If the last operation is a write op (PUT, ACC) or only
    a single FLUSH packet is sent, after the target receives it,
    the target will send back a single FLUSH_ACK packet;
    if the last operation contains a read action (GET, GACC, FOP,
    CAS), after the target receives it, the target will piggyback a
    FLUSH_ACK flag on the response packet.
    
    After origin receives the FLUSH_ACK packet or response packet
    with FLUSH_ACK flag, it will decrement the counter which
    indicates number of outgoing sync messages (FLUSH / UNLOCK).
    When that counter reaches zero, origin can know that remote
    completion is achieved.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index dec6fa9..71a0884 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -124,7 +124,8 @@ typedef enum {
     MPIDI_CH3_PKT_FLAG_RMA_AT_COMPLETE = 16,
     MPIDI_CH3_PKT_FLAG_RMA_NOCHECK = 32,
     MPIDI_CH3_PKT_FLAG_RMA_SHARED = 64,
-    MPIDI_CH3_PKT_FLAG_RMA_EXCLUSIVE = 128
+    MPIDI_CH3_PKT_FLAG_RMA_EXCLUSIVE = 128,
+    MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK = 256
 } MPIDI_CH3_Pkt_flags_t;
 
 typedef struct MPIDI_CH3_Pkt_send {
@@ -268,6 +269,10 @@ typedef struct MPIDI_CH3_Pkt_get {
 typedef struct MPIDI_CH3_Pkt_get_resp {
     MPIDI_CH3_Pkt_type_t type;
     MPI_Request request_handle;
+    /* followings are used to decrement ack_counter at origin */
+    int target_rank;
+    MPI_Win source_win_handle;
+    MPIDI_CH3_Pkt_flags_t flags;
 } MPIDI_CH3_Pkt_get_resp_t;
 
 typedef struct MPIDI_CH3_Pkt_accum {
@@ -308,6 +313,10 @@ typedef struct MPIDI_CH3_Pkt_get_accum {
 typedef struct MPIDI_CH3_Pkt_get_accum_resp {
     MPIDI_CH3_Pkt_type_t type;
     MPI_Request request_handle;
+    /* followings are used to decrement ack_counter at origin */
+    int target_rank;
+    MPI_Win source_win_handle;
+    MPIDI_CH3_Pkt_flags_t flags;
 } MPIDI_CH3_Pkt_get_accum_resp_t;
 
 typedef struct MPIDI_CH3_Pkt_accum_immed {
@@ -348,6 +357,10 @@ typedef struct MPIDI_CH3_Pkt_cas_resp {
     MPIDI_CH3_Pkt_type_t type;
     MPI_Request request_handle;
     MPIDI_CH3_CAS_Immed_u data;
+    /* followings are used to decrement ack_counter at orign */
+    int target_rank;
+    MPI_Win source_win_handle;
+    MPIDI_CH3_Pkt_flags_t flags;
 } MPIDI_CH3_Pkt_cas_resp_t;
 
 typedef struct MPIDI_CH3_Pkt_fop {
@@ -369,6 +382,10 @@ typedef struct MPIDI_CH3_Pkt_fop_resp {
     MPIDI_CH3_Pkt_type_t type;
     MPI_Request request_handle;
     int data[MPIDI_RMA_FOP_RESP_IMMED_INTS];
+    /* followings are used to decrement ack_counter at orign */
+    int target_rank;
+    MPI_Win source_win_handle;
+    MPIDI_CH3_Pkt_flags_t flags;
 } MPIDI_CH3_Pkt_fop_resp_t;
 
 typedef struct MPIDI_CH3_Pkt_lock {
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index 00664ba..b534fb3 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -227,6 +227,34 @@ static inline int acquire_local_lock(MPID_Win * win_ptr, int lock_type)
 
 
 #undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_RMA_Handle_flush_ack
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_RMA_Handle_flush_ack(MPID_Win * win_ptr, int target_rank)
+{   /* Account for one flush acknowledgement from target_rank on win_ptr; returns mpi_errno, which stays MPI_SUCCESS unless the target lookup fails. */
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_RMA_Target_t *t;
+
+    mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, target_rank, &t);   /* look up the per-target record for target_rank */
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    if (t == NULL) {    /* no target record was produced: charge the ack to the window-wide outstanding_unlocks counter */
+        win_ptr->outstanding_unlocks--;
+        MPIU_Assert(win_ptr->outstanding_unlocks >= 0);
+    }
+    else {              /* otherwise decrement that target's own outstanding_acks counter */
+        t->sync.outstanding_acks--;
+        MPIU_Assert(t->sync.outstanding_acks >= 0);
+    }
+
+ fn_exit:
+    return mpi_errno;
+ fn_fail:               /* presumably reached via MPIU_ERR_POP above -- confirm against the macro definition */
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
 #define FUNCNAME do_accumulate_op
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 1c8ed6d..41bea08 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -88,6 +88,12 @@ int MPIDI_CH3_ReqHandler_PutRecvComplete( MPIDI_VC_t *vc,
 
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) {
+        mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, rreq->dev.source_win_handle);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        MPIDI_CH3_Progress_signal_completion();
+    }
+
     mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
                                                rreq->dev.source_win_handle);
     if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
@@ -134,6 +140,12 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete( MPIDI_VC_t *vc,
         MPIU_ERR_POP(mpi_errno);
     }
 
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) {
+        mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, rreq->dev.source_win_handle);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        MPIDI_CH3_Progress_signal_completion();
+    }
+
     mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
                                                rreq->dev.source_win_handle);
     if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
@@ -178,6 +190,9 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *vc,
     get_accum_resp_pkt->request_handle = rreq->dev.resp_request_handle;
     get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
     get_accum_resp_pkt->source_win_handle = rreq->dev.source_win_handle;
+    get_accum_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
+        get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
 
     MPID_Datatype_get_size_macro(rreq->dev.datatype, type_size);
 
@@ -459,10 +474,13 @@ int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *vc,
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_get_resp_t * get_resp_pkt = &upkt.get_resp;
     MPID_Request * sreq;
+    MPID_Win *win_ptr;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GETDERIVEDDTRECVCOMPLETE);
     
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GETDERIVEDDTRECVCOMPLETE);
                 
+    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
+
     /* create derived datatype */
     create_derived_datatype(rreq, &new_dtp);
     MPIU_Free(rreq->dev.dtype_info);
@@ -485,6 +503,11 @@ int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *vc,
     
     MPIDI_Pkt_init(get_resp_pkt, MPIDI_CH3_PKT_GET_RESP);
     get_resp_pkt->request_handle = rreq->dev.request_handle;    
+    get_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
+    get_resp_pkt->source_win_handle = rreq->dev.source_win_handle;
+    get_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
+        get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
     
     sreq->dev.segment_ptr = MPID_Segment_alloc( );
     MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
@@ -628,6 +651,11 @@ int MPIDI_CH3_ReqHandler_FOPComplete( MPIDI_VC_t *vc,
     fop_resp_pkt->request_handle = rreq->dev.request_handle;
 
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
+    fop_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
+    fop_resp_pkt->source_win_handle = rreq->dev.source_win_handle;
+    fop_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
+    if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
+        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
 
     /* Copy original data into the send buffer.  If data will fit in the
        header, use that.  Otherwise allocate a temporary buffer.  */
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 97d2f10..89fa944 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -219,6 +219,11 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
         MPIDI_Pkt_init(get_resp_pkt, MPIDI_CH3_PKT_GET_RESP);
         get_resp_pkt->request_handle = get_pkt->request_handle;
+        get_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
+        if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
+            get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+        get_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
+        get_resp_pkt->source_win_handle = get_pkt->source_win_handle;
 
         iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
         iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
@@ -703,6 +708,11 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIDI_Pkt_init(cas_resp_pkt, MPIDI_CH3_PKT_CAS_RESP);
     cas_resp_pkt->request_handle = cas_pkt->request_handle;
+    cas_resp_pkt->source_win_handle = cas_pkt->source_win_handle;
+    cas_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
+    cas_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
+    if (cas_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
+        cas_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
 
     /* Copy old value into the response packet */
     MPID_Datatype_get_size_macro(cas_pkt->datatype, len);
@@ -777,12 +787,22 @@ int MPIDI_CH3_PktHandler_CASResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     MPIDI_CH3_Pkt_cas_resp_t *cas_resp_pkt = &pkt->cas_resp;
     MPID_Request *req;
     MPI_Aint len;
+    MPID_Win *win_ptr;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_CASRESP);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_CASRESP);
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received CAS response pkt");
 
+    MPID_Win_get_ptr(cas_resp_pkt->source_win_handle, win_ptr);
+
+    /* decrement ack_counter on this target */
+    if (cas_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
+        int target_rank = cas_resp_pkt->target_rank;
+        mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
+
     MPID_Request_get_ptr(cas_resp_pkt->request_handle, req);
     MPID_Datatype_get_size_macro(req->dev.datatype, len);
 
@@ -837,6 +857,10 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     req->dev.target_win_handle = fop_pkt->target_win_handle;
     req->dev.request_handle = fop_pkt->request_handle;
     req->dev.flags = fop_pkt->flags;
+    /* fop_pkt->source_win_handle is set in MPIDI_Fetch_and_op,
+       here we pass it to receiving request, so that after receiving
+       is finished, we can pass it to sending back pkt. */
+    req->dev.source_win_handle = fop_pkt->source_win_handle;
 
     MPID_Datatype_get_size_macro(req->dev.datatype, len);
     MPIU_Assert(len <= sizeof(MPIDI_CH3_FOP_Immed_u));
@@ -906,12 +930,22 @@ int MPIDI_CH3_PktHandler_FOPResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     MPID_Request *req;
     int complete = 0;
     MPI_Aint len;
+    MPID_Win *win_ptr;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_FOPRESP);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_FOPRESP);
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received FOP response pkt");
 
+    MPID_Win_get_ptr(fop_resp_pkt->source_win_handle, win_ptr);
+
+    /* decrement ack_counter */
+    if (fop_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
+        int target_rank = fop_resp_pkt->target_rank;
+        mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+
     MPID_Request_get_ptr(fop_resp_pkt->request_handle, req);
     MPID_Datatype_get_size_macro(req->dev.datatype, len);
 
@@ -967,12 +1001,22 @@ int MPIDI_CH3_PktHandler_Get_AccumResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPIDI_msg_sz_t data_len;
     int mpi_errno = MPI_SUCCESS;
     MPI_Aint type_size;
+    MPID_Win *win_ptr;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_GET_ACCUM_RESP);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_GET_ACCUM_RESP);
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received Get-Accumulate response pkt");
 
+    MPID_Win_get_ptr(get_accum_resp_pkt->source_win_handle, win_ptr);
+
+    /* decrement ack_counter on target */
+    if (get_accum_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
+        int target_rank = get_accum_resp_pkt->target_rank;
+        mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+
     data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
     data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
@@ -1444,12 +1488,22 @@ int MPIDI_CH3_PktHandler_GetResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     MPIDI_msg_sz_t data_len;
     int mpi_errno = MPI_SUCCESS;
     MPI_Aint type_size;
+    MPID_Win *win_ptr;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_GETRESP);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_GETRESP);
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received get response pkt");
 
+    MPID_Win_get_ptr(get_resp_pkt->source_win_handle, win_ptr);
+
+    /* decrement ack_counter on target */
+    if (get_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
+        int target_rank = get_resp_pkt->target_rank;
+        mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    }
+
     data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
     data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
@@ -1513,6 +1567,8 @@ int MPIDI_CH3_PktHandler_FlushAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 {
     MPIDI_CH3_Pkt_flush_ack_t *flush_ack_pkt = &pkt->flush_ack;
     MPID_Win *win_ptr = NULL;
+    int mpi_errno = MPI_SUCCESS;
+    int target_rank = flush_ack_pkt->target_rank;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_FLUSHACK);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_FLUSHACK);
@@ -1522,6 +1578,11 @@ int MPIDI_CH3_PktHandler_FlushAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     *buflen = sizeof(MPIDI_CH3_Pkt_t);
 
     MPID_Win_get_ptr(flush_ack_pkt->source_win_handle, win_ptr);
+
+    /* decrement ack_counter on target */
+    mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
     MPIU_Assert(win_ptr->targets[flush_ack_pkt->target_rank].remote_lock_state !=
                 MPIDI_CH3_WIN_LOCK_NONE);
 
@@ -1538,6 +1599,8 @@ int MPIDI_CH3_PktHandler_FlushAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_FLUSHACK);
  fn_exit:
     return MPI_SUCCESS;
+ fn_fail:
+    goto fn_exit;
 }
 
 
@@ -1594,6 +1657,11 @@ int MPIDI_CH3_PktHandler_Flush(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     *buflen = sizeof(MPIDI_CH3_Pkt_t);
     *rreqp = NULL;
 
+    MPID_Win_get_ptr(flush_pkt->target_win_handle, win_ptr);
+
+    mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, flush_pkt->source_win_handle);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
     /* This is a flush request packet */
     if (flush_pkt->target_win_handle != MPI_WIN_NULL) {
         MPID_Request *req = NULL;

http://git.mpich.org/mpich.git/commitdiff/fe15ea266ef2e8a2d53565b88ab3ce4d4b9e02b6

commit fe15ea266ef2e8a2d53565b88ab3ce4d4b9e02b6
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Oct 28 10:50:54 2014 -0500

    Separate request handler of PUT, ACC, GACC and rename them.
    
    Separate final request handler of PUT, ACC, GACC into three.
    Separate derived DT request handler of ACC and GACC into two.
    
    Renaming request handlers as follows:
    
    (1) Normal request handler: it is triggered on target side
        when all data from origin is received.
    
        It includes:
    
        ReqHandler_PutRecvComplete --- for PUT
        ReqHandler_AccumRecvComplete --- for ACC
        ReqHandler_GaccumRecvComplete --- for GACC
    
    (2) Derived DT request handler: it is triggered on target
        side when all derived DT info is received.
    
        It includes:
    
        ReqHandler_PutDerivedDTRecvComplete --- for PUT
        ReqHandler_AccumDerivedDTRecvComplete --- for ACC
        ReqHandler_GaccumDerivedDTRecvComplete --- for GACC
    
    (3) Response request handler: it is triggered on the target
        side when sending the response back has finished in
        GET-like operations.
    
        It includes:
    
        ReqHandler_GetSendComplete --- for GET
        ReqHandler_GaccumLikeSendComplete --- for GACC, FOP, CAS
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index dbb02c9..9eafc3c 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -1889,27 +1889,33 @@ int MPIDI_CH3_ReqHandler_UnpackSRBufReloadIOV( MPIDI_VC_t *, MPID_Request *,
 					       int * );
 int MPIDI_CH3_ReqHandler_UnpackSRBufComplete( MPIDI_VC_t *, MPID_Request *,
 					      int * );
-int MPIDI_CH3_ReqHandler_PutRespDerivedDTComplete( MPIDI_VC_t *, 
+int MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete( MPIDI_VC_t *,
 						   MPID_Request *, int * );
-int MPIDI_CH3_ReqHandler_PutAccumRespComplete( MPIDI_VC_t *, MPID_Request *,
-					       int * );
-int MPIDI_CH3_ReqHandler_AccumRespDerivedDTComplete( MPIDI_VC_t *, 
+int MPIDI_CH3_ReqHandler_PutRecvComplete( MPIDI_VC_t *, MPID_Request *,
+                                          int * );
+int MPIDI_CH3_ReqHandler_AccumRecvComplete( MPIDI_VC_t *, MPID_Request *,
+                                            int * );
+int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *, MPID_Request *,
+                                             int * );
+int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete( MPIDI_VC_t *,
 						     MPID_Request *,
 						     int * );
-int MPIDI_CH3_ReqHandler_GetAccumRespComplete( MPIDI_VC_t *vc, 
-                                               MPID_Request *rreq, 
-                                               int *complete );
+int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete( MPIDI_VC_t *,
+                                                      MPID_Request *,
+                                                      int * );
 int MPIDI_CH3_ReqHandler_SinglePutAccumComplete( MPIDI_VC_t *, MPID_Request *,
 						 int * );
-int MPIDI_CH3_ReqHandler_GetRespDerivedDTComplete( MPIDI_VC_t *, 
+int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *,
 						   MPID_Request *, int * );
 int MPIDI_CH3_ReqHandler_FOPComplete( MPIDI_VC_t *, MPID_Request *, int * );
 
 /* Send Handlers */
 int MPIDI_CH3_ReqHandler_SendReloadIOV( MPIDI_VC_t *vc, MPID_Request *sreq, 
 					int *complete );
-int MPIDI_CH3_ReqHandler_GetSendRespComplete( MPIDI_VC_t *, MPID_Request *,
-					      int * );
+int MPIDI_CH3_ReqHandler_GetSendComplete( MPIDI_VC_t *, MPID_Request *,
+                                          int * );
+int MPIDI_CH3_ReqHandler_GaccumLikeSendComplete( MPIDI_VC_t *, MPID_Request *,
+                                                 int * );
 /* Thread Support */
 #ifdef MPICH_IS_THREADED
 #if MPIU_THREAD_GRANULARITY == MPIU_THREAD_GRANULARITY_GLOBAL
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index d82fb53..1c8ed6d 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -73,110 +73,177 @@ int MPIDI_CH3_ReqHandler_RecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
 }
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_ReqHandler_PutAccumRespComplete
+#define FUNCNAME MPIDI_CH3_ReqHandler_PutRecvComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_PutAccumRespComplete( MPIDI_VC_t *vc, 
-					       MPID_Request *rreq, 
-					       int *complete )
+int MPIDI_CH3_ReqHandler_PutRecvComplete( MPIDI_VC_t *vc,
+                                          MPID_Request *rreq,
+                                          int *complete )
 {
     int mpi_errno = MPI_SUCCESS;
-    int get_acc_flag = 0;
     MPID_Win *win_ptr;
-    MPIU_CHKPMEM_DECL(1);
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTACCUMRESPCOMPLETE);
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTRECVCOMPLETE);
 
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTACCUMRESPCOMPLETE);
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTRECVCOMPLETE);
 
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
-    /* Perform get in get-accumulate */
-    if (rreq->dev.resp_request_handle != MPI_REQUEST_NULL) {
-        MPI_Aint type_size;
-        MPIDI_CH3_Pkt_t upkt;
-        MPIDI_CH3_Pkt_get_accum_resp_t *get_accum_resp_pkt = &upkt.get_accum_resp;
-        MPID_Request *resp_req;
-        MPID_IOV iov[MPID_IOV_LIMIT];
+    mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
+                                               rreq->dev.source_win_handle);
+    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
 
-        MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
-        get_accum_resp_pkt->request_handle = rreq->dev.resp_request_handle;
+    /* mark data transfer as complete and decrement CC */
+    MPIDI_CH3U_Request_complete(rreq);
+    *complete = TRUE;
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTRECVCOMPLETE);
+    return MPI_SUCCESS;
 
-        MPID_Datatype_get_size_macro(rreq->dev.datatype, type_size);
+    /* --BEGIN ERROR HANDLING-- */
+ fn_fail:
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
 
-        /* Copy data into a temporary buffer */
-        resp_req = MPID_Request_create();
-        MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
-        MPIU_Object_set_ref(resp_req, 1);
 
-        MPIU_CHKPMEM_MALLOC(resp_req->dev.user_buf, void *, rreq->dev.user_count * type_size,
-                            mpi_errno, "GACC resp. buffer");
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3_ReqHandler_AccumRecvComplete
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPIDI_CH3_ReqHandler_AccumRecvComplete( MPIDI_VC_t *vc,
+                                            MPID_Request *rreq,
+                                            int *complete )
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPID_Win *win_ptr;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRECVCOMPLETE);
 
-        if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype)) {
-            MPIU_Memcpy(resp_req->dev.user_buf, rreq->dev.real_user_buf, 
-                        rreq->dev.user_count * type_size);
-        } else {
-            MPID_Segment *seg = MPID_Segment_alloc();
-            MPI_Aint last = type_size * rreq->dev.user_count;
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRECVCOMPLETE);
 
-            MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment");
-            MPID_Segment_init(rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype, seg, 0);
-            MPID_Segment_pack(seg, 0, &last, resp_req->dev.user_buf);
-            MPID_Segment_free(seg);
-        }
+    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
-        resp_req->dev.OnFinal = MPIDI_CH3_ReqHandler_GetAccumRespComplete;
-        resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetAccumRespComplete;
-        resp_req->dev.target_win_handle = rreq->dev.target_win_handle;
-        resp_req->dev.flags = rreq->dev.flags;
+    MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RESP);
 
-        /* here we increment the Active Target counter to guarantee the GET-like
-           operation are completed when counter reaches zero. */
-        win_ptr->at_completion_counter++;
+    if (win_ptr->shm_allocated == TRUE)
+        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
+    /* accumulate data from tmp_buf into user_buf */
+    mpi_errno = do_accumulate_op(rreq);
+    if (win_ptr->shm_allocated == TRUE)
+        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+    if (mpi_errno) {
+        MPIU_ERR_POP(mpi_errno);
+    }
+
+    mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
+                                               rreq->dev.source_win_handle);
+    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+
+    /* mark data transfer as complete and decrement CC */
+    MPIDI_CH3U_Request_complete(rreq);
+    *complete = TRUE;
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRECVCOMPLETE);
+    return MPI_SUCCESS;
 
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
+    /* --BEGIN ERROR HANDLING-- */
+ fn_fail:
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
 
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)resp_req->dev.user_buf;
-        iov[1].MPID_IOV_LEN = type_size*rreq->dev.user_count;
 
-        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
-        mpi_errno = MPIDI_CH3_iSendv(vc, resp_req, iov, 2);
-        MPIU_THREAD_CS_EXIT(CH3COMM,vc);
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3_ReqHandler_GaccumRecvComplete
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *vc,
+                                             MPID_Request *rreq,
+                                             int *complete )
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPID_Win *win_ptr;
+    MPI_Aint type_size;
+    MPIDI_CH3_Pkt_t upkt;
+    MPIDI_CH3_Pkt_get_accum_resp_t *get_accum_resp_pkt = &upkt.get_accum_resp;
+    MPID_Request *resp_req;
+    MPID_IOV iov[MPID_IOV_LIMIT];
+    MPIU_CHKPMEM_DECL(1);
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMRECVCOMPLETE);
 
-        MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMRECVCOMPLETE);
+
+    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
+
+    MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
+    get_accum_resp_pkt->request_handle = rreq->dev.resp_request_handle;
+    get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
+    get_accum_resp_pkt->source_win_handle = rreq->dev.source_win_handle;
 
-        /* Mark get portion as handled */
-        rreq->dev.resp_request_handle = MPI_REQUEST_NULL;
+    MPID_Datatype_get_size_macro(rreq->dev.datatype, type_size);
 
-        get_acc_flag = 1;
+    /* Copy data into a temporary buffer */
+    resp_req = MPID_Request_create();
+    MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+    MPIU_Object_set_ref(resp_req, 1);
+
+    MPIU_CHKPMEM_MALLOC(resp_req->dev.user_buf, void *, rreq->dev.user_count * type_size,
+                        mpi_errno, "GACC resp. buffer");
+
+    if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype)) {
+        MPIU_Memcpy(resp_req->dev.user_buf, rreq->dev.real_user_buf,
+                    rreq->dev.user_count * type_size);
+    } else {
+        MPID_Segment *seg = MPID_Segment_alloc();
+        MPI_Aint last = type_size * rreq->dev.user_count;
+
+        MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment");
+        MPID_Segment_init(rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype, seg, 0);
+        MPID_Segment_pack(seg, 0, &last, resp_req->dev.user_buf);
+        MPID_Segment_free(seg);
     }
 
-    if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RESP) {
+    resp_req->dev.OnFinal = MPIDI_CH3_ReqHandler_GaccumLikeSendComplete;
+    resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumLikeSendComplete;
+    resp_req->dev.target_win_handle = rreq->dev.target_win_handle;
+    resp_req->dev.flags = rreq->dev.flags;
 
-	if (win_ptr->shm_allocated == TRUE)
-	    MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-	/* accumulate data from tmp_buf into user_buf */
-	mpi_errno = do_accumulate_op(rreq);
-	if (win_ptr->shm_allocated == TRUE)
-	    MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+    /* Increment the Active Target counter here to guarantee that GET-like
+       operations are completed when the counter reaches zero. */
+    win_ptr->at_completion_counter++;
 
-	if (mpi_errno) {
-	    MPIU_ERR_POP(mpi_errno);
-	}
+    iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
+    iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
+
+    iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)resp_req->dev.user_buf;
+    iov[1].MPID_IOV_LEN = type_size*rreq->dev.user_count;
+
+    MPIU_THREAD_CS_ENTER(CH3COMM,vc);
+    mpi_errno = MPIDI_CH3_iSendv(vc, resp_req, iov, 2);
+    MPIU_THREAD_CS_EXIT(CH3COMM,vc);
+
+    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
+    /* Mark get portion as handled */
+    rreq->dev.resp_request_handle = MPI_REQUEST_NULL;
+
+    MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);
+
+    if (win_ptr->shm_allocated == TRUE)
+        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
+    /* accumulate data from tmp_buf into user_buf */
+    mpi_errno = do_accumulate_op(rreq);
+    if (win_ptr->shm_allocated == TRUE)
+        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+    if (mpi_errno) {
+        MPIU_ERR_POP(mpi_errno);
     }
     
-    if (!get_acc_flag) {
-    mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
-                                               rreq->dev.source_win_handle);
-    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-    }
-
     /* mark data transfer as complete and decrement CC */
     MPIDI_CH3U_Request_complete(rreq);
     *complete = TRUE;
  fn_exit:
     MPIU_CHKPMEM_COMMIT();
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTACCUMRESPCOMPLETE);
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMRECVCOMPLETE);
     return MPI_SUCCESS;
 
     /* --BEGIN ERROR HANDLING-- */
@@ -187,18 +254,18 @@ int MPIDI_CH3_ReqHandler_PutAccumRespComplete( MPIDI_VC_t *vc,
 }
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_ReqHandler_PutRespDerivedDTComplete
+#define FUNCNAME MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_PutRespDerivedDTComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)), 
+int MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
 						   MPID_Request *rreq, 
 						   int *complete )
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Datatype *new_dtp = NULL;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTRESPDERIVEDDTCOMPLETE);
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTDERIVEDDTRECVCOMPLETE);
     
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTRESPDERIVEDDTCOMPLETE);
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTDERIVEDDTRECVCOMPLETE);
                 
     /* create derived datatype */
     create_derived_datatype(rreq, &new_dtp);
@@ -229,19 +296,19 @@ int MPIDI_CH3_ReqHandler_PutRespDerivedDTComplete( MPIDI_VC_t *vc ATTRIBUTE((unu
 			    "**ch3|loadrecviov");
     }
     if (!rreq->dev.OnDataAvail) 
-	rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutAccumRespComplete;
+	rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutRecvComplete;
     
     *complete = FALSE;
  fn_fail:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTRESPDERIVEDDTCOMPLETE);
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTDERIVEDDTRECVCOMPLETE);
     return mpi_errno;
 }
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_ReqHandler_AccumRespDerivedDTComplete
+#define FUNCNAME MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_AccumRespDerivedDTComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)), 
+int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
 						     MPID_Request *rreq, 
 						     int *complete )
 {
@@ -249,9 +316,9 @@ int MPIDI_CH3_ReqHandler_AccumRespDerivedDTComplete( MPIDI_VC_t *vc ATTRIBUTE((u
     MPID_Datatype *new_dtp = NULL;
     MPI_Aint true_lb, true_extent, extent;
     void *tmp_buf;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRESPDERIVEDDTCOMPLETE);
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);
     
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRESPDERIVEDDTCOMPLETE);
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);
     
     /* create derived datatype */
     create_derived_datatype(rreq, &new_dtp);
@@ -299,58 +366,91 @@ int MPIDI_CH3_ReqHandler_AccumRespDerivedDTComplete( MPIDI_VC_t *vc ATTRIBUTE((u
 			    "**ch3|loadrecviov");
     }
     if (!rreq->dev.OnDataAvail)
-	rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutAccumRespComplete;
+	rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumRecvComplete;
     
     *complete = FALSE;
  fn_fail:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRESPDERIVEDDTCOMPLETE);
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);
     return mpi_errno;
 }
 
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_ReqHandler_GetAccumRespComplete
+#define FUNCNAME MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_GetAccumRespComplete( MPIDI_VC_t *vc, 
-                                               MPID_Request *rreq, 
-                                               int *complete )
+int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
+                                                      MPID_Request *rreq,
+                                                      int *complete )
 {
     int mpi_errno = MPI_SUCCESS;
-    MPID_Win *win_ptr;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GETACCUMRESPCOMPLETE);
-    
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GETACCUMRESPCOMPLETE);
-    if (rreq->dev.user_buf != NULL)
-        MPIU_Free(rreq->dev.user_buf);
+    MPID_Datatype *new_dtp = NULL;
+    MPI_Aint true_lb, true_extent, extent;
+    void *tmp_buf;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMDERIVEDDTRECVCOMPLETE);
 
-    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMDERIVEDDTRECVCOMPLETE);
 
-    mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
-                                               MPI_WIN_NULL);
-    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+    /* create derived datatype */
+    create_derived_datatype(rreq, &new_dtp);
 
-    /* here we decrement the Active Target counter to guarantee the GET-like
-       operation are completed when counter reaches zero. */
-    win_ptr->at_completion_counter--;
-    MPIU_Assert(win_ptr->at_completion_counter >= 0);
+    /* update new request to get the data */
+    MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);
 
-    MPIDI_CH3U_Request_complete(rreq);
-    *complete = TRUE;
- fn_exit:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GETACCUMRESPCOMPLETE);
-    return mpi_errno;
+    /* first need to allocate tmp_buf to recv the data into */
+
+    MPIR_Type_get_true_extent_impl(new_dtp->handle, &true_lb, &true_extent);
+    MPID_Datatype_get_extent_macro(new_dtp->handle, extent);
 
+    tmp_buf = MPIU_Malloc(rreq->dev.user_count *
+			  (MPIR_MAX(extent,true_extent)));
+    if (!tmp_buf) {
+	MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %d",
+		    rreq->dev.user_count * MPIR_MAX(extent,true_extent));
+    }
+
+    /* adjust for potential negative lower bound in datatype */
+    tmp_buf = (void *)((char*)tmp_buf - true_lb);
+
+    rreq->dev.user_buf = tmp_buf;
+    rreq->dev.datatype = new_dtp->handle;
+    rreq->dev.recv_data_sz = new_dtp->size *
+	rreq->dev.user_count;
+    rreq->dev.datatype_ptr = new_dtp;
+    /* this will cause the datatype to be freed when the
+       request is freed. free dtype_info here. */
+    MPIU_Free(rreq->dev.dtype_info);
+
+    rreq->dev.segment_ptr = MPID_Segment_alloc( );
+    MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
+
+    MPID_Segment_init(rreq->dev.user_buf,
+		      rreq->dev.user_count,
+		      rreq->dev.datatype,
+		      rreq->dev.segment_ptr, 0);
+    rreq->dev.segment_first = 0;
+    rreq->dev.segment_size = rreq->dev.recv_data_sz;
+
+    mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
+    if (mpi_errno != MPI_SUCCESS) {
+	MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
+			    "**ch3|loadrecviov");
+    }
+    if (!rreq->dev.OnDataAvail)
+	rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumRecvComplete;
+
+    *complete = FALSE;
  fn_fail:
-    goto fn_exit;
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMDERIVEDDTRECVCOMPLETE);
+    return mpi_errno;
 }
 
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3_ReqHandler_GetRespDerivedDTComplete
+#define FUNCNAME MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_GetRespDerivedDTComplete( MPIDI_VC_t *vc, 
+int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *vc,
 						   MPID_Request *rreq, 
 						   int *complete )
 {
@@ -359,9 +459,9 @@ int MPIDI_CH3_ReqHandler_GetRespDerivedDTComplete( MPIDI_VC_t *vc,
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_get_resp_t * get_resp_pkt = &upkt.get_resp;
     MPID_Request * sreq;
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GETRESPDERIVEDDTCOMPLETE);
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GETDERIVEDDTRECVCOMPLETE);
     
-    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GETRESPDERIVEDDTCOMPLETE);
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GETDERIVEDDTRECVCOMPLETE);
                 
     /* create derived datatype */
     create_derived_datatype(rreq, &new_dtp);
@@ -373,8 +473,8 @@ int MPIDI_CH3_ReqHandler_GetRespDerivedDTComplete( MPIDI_VC_t *vc,
     
     sreq->kind = MPID_REQUEST_SEND;
     MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_GET_RESP);
-    sreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetSendRespComplete;
-    sreq->dev.OnFinal     = MPIDI_CH3_ReqHandler_GetSendRespComplete;
+    sreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetSendComplete;
+    sreq->dev.OnFinal     = MPIDI_CH3_ReqHandler_GetSendComplete;
     sreq->dev.user_buf = rreq->dev.user_buf;
     sreq->dev.user_count = rreq->dev.user_count;
     sreq->dev.datatype = new_dtp->handle;
@@ -415,7 +515,7 @@ int MPIDI_CH3_ReqHandler_GetRespDerivedDTComplete( MPIDI_VC_t *vc,
     *complete = TRUE;
     
  fn_fail:
-    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GETRESPDERIVEDDTCOMPLETE);
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GETDERIVEDDTRECVCOMPLETE);
     return mpi_errno;
 }
 
@@ -541,7 +641,7 @@ int MPIDI_CH3_ReqHandler_FOPComplete( MPIDI_VC_t *vc,
 
         resp_req->dev.target_win_handle = rreq->dev.target_win_handle;
         resp_req->dev.flags = rreq->dev.flags;
-        resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetAccumRespComplete;
+        resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumLikeSendComplete;
 
         /* here we increment the Active Target counter to guarantee the GET-like
            operation are completed when counter reaches zero. */
@@ -578,7 +678,7 @@ int MPIDI_CH3_ReqHandler_FOPComplete( MPIDI_VC_t *vc,
                    (it is initialized to NULL by lower layer) */
                 resp_req->dev.target_win_handle = rreq->dev.target_win_handle;
                 resp_req->dev.flags = rreq->dev.flags;
-                resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetAccumRespComplete;
+                resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumLikeSendComplete;
 
                 /* here we increment the Active Target counter to guarantee the GET-like
                    operation are completed when counter reaches zero. */
@@ -696,10 +796,19 @@ int MPIDI_CH3_ReqHandler_UnpackSRBufComplete( MPIDI_VC_t *vc,
 
     MPIDI_CH3U_Request_unpack_srbuf(rreq);
 
-    if ((MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_PUT_RESP) ||
-	(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RESP))
+    if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_PUT_RESP)
+    {
+	mpi_errno = MPIDI_CH3_ReqHandler_PutRecvComplete(
+	    vc, rreq, complete );
+    }
+    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RESP)
+    {
+	mpi_errno = MPIDI_CH3_ReqHandler_AccumRecvComplete(
+	    vc, rreq, complete );
+    }
+    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RESP)
     {
-	mpi_errno = MPIDI_CH3_ReqHandler_PutAccumRespComplete( 
+	mpi_errno = MPIDI_CH3_ReqHandler_GaccumRecvComplete(
 	    vc, rreq, complete );
     }
     else {
diff --git a/src/mpid/ch3/src/ch3u_handle_send_req.c b/src/mpid/ch3/src/ch3u_handle_send_req.c
index fd732bb..abe1052 100644
--- a/src/mpid/ch3/src/ch3u_handle_send_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_send_req.c
@@ -43,7 +43,7 @@ int MPIDI_CH3U_Handle_send_req(MPIDI_VC_t * vc, MPID_Request * sreq,
  */
 /* ----------------------------------------------------------------------- */
 
-int MPIDI_CH3_ReqHandler_GetSendRespComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)), 
+int MPIDI_CH3_ReqHandler_GetSendComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
 					      MPID_Request *sreq, 
 					      int *complete )
 {
@@ -70,6 +70,42 @@ int MPIDI_CH3_ReqHandler_GetSendRespComplete( MPIDI_VC_t *vc ATTRIBUTE((unused))
     goto fn_exit;
 }
 
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3_ReqHandler_GaccumLikeSendComplete
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPIDI_CH3_ReqHandler_GaccumLikeSendComplete( MPIDI_VC_t *vc,
+                                                 MPID_Request *rreq,
+                                                 int *complete )
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPID_Win *win_ptr;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMLIKESENDCOMPLETE);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMLIKESENDCOMPLETE);
+    /* This function is triggered when the process of sending back a
+       GACC/FOP/CAS response has finished. Only GACC uses user_buf; FOP
+       and CAS can fit all their data in the response packet. */
+    if (rreq->dev.user_buf != NULL)
+        MPIU_Free(rreq->dev.user_buf);
+
+    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
+
+    /* Decrement the Active Target counter here to guarantee that GET-like
+       operations are completed when the counter reaches zero. */
+    win_ptr->at_completion_counter--;
+    MPIU_Assert(win_ptr->at_completion_counter >= 0);
+
+    MPIDI_CH3U_Request_complete(rreq);
+    *complete = TRUE;
+ fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMLIKESENDCOMPLETE);
+    return mpi_errno;
+
+ fn_fail:
+    goto fn_exit;
+}
+
 int MPIDI_CH3_ReqHandler_SendReloadIOV( MPIDI_VC_t *vc ATTRIBUTE((unused)), MPID_Request *sreq, 
 					int *complete )
 {
diff --git a/src/mpid/ch3/src/ch3u_request.c b/src/mpid/ch3/src/ch3u_request.c
index 7caa636..d7602f3 100644
--- a/src/mpid/ch3/src/ch3u_request.c
+++ b/src/mpid/ch3/src/ch3u_request.c
@@ -91,6 +91,7 @@ MPID_Request * MPID_Request_create(void)
         req->dev.user_buf          = NULL;
         req->dev.OnDataAvail       = NULL;
         req->dev.OnFinal           = NULL;
+        req->dev.user_buf          = NULL;
 #ifdef MPIDI_CH3_REQUEST_INIT
 	MPIDI_CH3_REQUEST_INIT(req);
 #endif
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 03e34c5..97d2f10 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -84,14 +84,14 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
          * post_data_receive reset the handler.  There should
          * be a cleaner way to do this */
         if (!req->dev.OnDataAvail) {
-            req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutAccumRespComplete;
+            req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutRecvComplete;
         }
 
         /* return the number of bytes processed in this function */
         *buflen = sizeof(MPIDI_CH3_Pkt_t) + data_len;
 
         if (complete) {
-            mpi_errno = MPIDI_CH3_ReqHandler_PutAccumRespComplete(vc, req, &complete);
+            mpi_errno = MPIDI_CH3_ReqHandler_PutRecvComplete(vc, req, &complete);
             if (mpi_errno)
                 MPIU_ERR_POP(mpi_errno);
             if (complete) {
@@ -104,7 +104,7 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         /* derived datatype */
         MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PUT_RESP_DERIVED_DT);
         req->dev.datatype = MPI_DATATYPE_NULL;
-        req->dev.OnFinal = MPIDI_CH3_ReqHandler_PutAccumRespComplete;
+        req->dev.OnFinal = MPIDI_CH3_ReqHandler_PutRecvComplete;
 
         req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
             MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
@@ -132,7 +132,7 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                 sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) + put_pkt->dataloop_size;
 
             /* All dtype data has been received, call req handler */
-            mpi_errno = MPIDI_CH3_ReqHandler_PutRespDerivedDTComplete(vc, req, &complete);
+            mpi_errno = MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete(vc, req, &complete);
             MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
                                  "**ch3|postrecv %s", "MPIDI_CH3_PKT_PUT");
             if (complete) {
@@ -149,7 +149,7 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
             *buflen = sizeof(MPIDI_CH3_Pkt_t);
 
-            req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutRespDerivedDTComplete;
+            req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete;
         }
 
     }
@@ -213,8 +213,8 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         MPIDI_CH3_Pkt_get_resp_t *get_resp_pkt = &upkt.get_resp;
 
         MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_RESP);
-        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetSendRespComplete;
-        req->dev.OnFinal = MPIDI_CH3_ReqHandler_GetSendRespComplete;
+        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetSendComplete;
+        req->dev.OnFinal = MPIDI_CH3_ReqHandler_GetSendComplete;
         req->kind = MPID_REQUEST_SEND;
 
         MPIDI_Pkt_init(get_resp_pkt, MPIDI_CH3_PKT_GET_RESP);
@@ -245,7 +245,7 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         /* derived datatype. first get the dtype_info and dataloop. */
 
         MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_RESP_DERIVED_DT);
-        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetRespDerivedDTComplete;
+        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete;
         req->dev.OnFinal = 0;
         req->dev.user_buf = get_pkt->addr;
         req->dev.user_count = get_pkt->count;
@@ -278,7 +278,7 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                 sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) + get_pkt->dataloop_size;
 
             /* All dtype data has been received, call req handler */
-            mpi_errno = MPIDI_CH3_ReqHandler_GetRespDerivedDTComplete(vc, req, &complete);
+            mpi_errno = MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete(vc, req, &complete);
             MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
                                  "**ch3|postrecv %s", "MPIDI_CH3_PKT_GET");
             if (complete)
@@ -373,13 +373,13 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
          * post_data_receive reset the handler.  There should
          * be a cleaner way to do this */
         if (!req->dev.OnDataAvail) {
-            req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutAccumRespComplete;
+            req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumRecvComplete;
         }
         /* return the number of bytes processed in this function */
         *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
 
         if (complete) {
-            mpi_errno = MPIDI_CH3_ReqHandler_PutAccumRespComplete(vc, req, &complete);
+            mpi_errno = MPIDI_CH3_ReqHandler_AccumRecvComplete(vc, req, &complete);
             if (mpi_errno)
                 MPIU_ERR_POP(mpi_errno);
             if (complete) {
@@ -390,9 +390,9 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     }
     else {
         MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RESP_DERIVED_DT);
-        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumRespDerivedDTComplete;
+        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete;
         req->dev.datatype = MPI_DATATYPE_NULL;
-        req->dev.OnFinal = MPIDI_CH3_ReqHandler_PutAccumRespComplete;
+        req->dev.OnFinal = MPIDI_CH3_ReqHandler_AccumRecvComplete;
 
         req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
             MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
@@ -417,7 +417,7 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                 sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) + accum_pkt->dataloop_size;
 
             /* All dtype data has been received, call req handler */
-            mpi_errno = MPIDI_CH3_ReqHandler_AccumRespDerivedDTComplete(vc, req, &complete);
+            mpi_errno = MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(vc, req, &complete);
             MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
                                  "**ch3|postrecv %s", "MPIDI_CH3_ACCUMULATE");
             if (complete) {
@@ -520,13 +520,13 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
          * post_data_receive reset the handler.  There should
          * be a cleaner way to do this */
         if (!req->dev.OnDataAvail) {
-            req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutAccumRespComplete;
+            req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumRecvComplete;
         }
         /* return the number of bytes processed in this function */
         *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
 
         if (complete) {
-            mpi_errno = MPIDI_CH3_ReqHandler_PutAccumRespComplete(vc, req, &complete);
+            mpi_errno = MPIDI_CH3_ReqHandler_GaccumRecvComplete(vc, req, &complete);
             if (mpi_errno)
                 MPIU_ERR_POP(mpi_errno);
             if (complete) {
@@ -537,9 +537,9 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     }
     else {
         MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP_DERIVED_DT);
-        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumRespDerivedDTComplete;
+        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete;
         req->dev.datatype = MPI_DATATYPE_NULL;
-        req->dev.OnFinal = MPIDI_CH3_ReqHandler_PutAccumRespComplete;
+        req->dev.OnFinal = MPIDI_CH3_ReqHandler_GaccumRecvComplete;
 
         req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
             MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
@@ -564,7 +564,7 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                 sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) + get_accum_pkt->dataloop_size;
 
             /* All dtype data has been received, call req handler */
-            mpi_errno = MPIDI_CH3_ReqHandler_AccumRespDerivedDTComplete(vc, req, &complete);
+            mpi_errno = MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(vc, req, &complete);
             MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
                                  "**ch3|postrecv %s", "MPIDI_CH3_ACCUMULATE");
             if (complete) {
@@ -734,7 +734,7 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                (it is initialized to NULL by lower layer) */
             req->dev.target_win_handle = cas_pkt->target_win_handle;
             req->dev.flags = cas_pkt->flags;
-            req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetAccumRespComplete;
+            req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumLikeSendComplete;
 
             /* here we increment the Active Target counter to guarantee the GET-like
                operation are completed when counter reaches zero. */

http://git.mpich.org/mpich.git/commitdiff/c0094faadcede19c63a6d700e9230ead67780f06

commit c0094faadcede19c63a6d700e9230ead67780f06
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sat Nov 1 19:00:27 2014 -0500

    Split shared RMA packet structures.
    
    Previously several RMA packet types shared the same structure,
    which was misleading for coding. Here we make different
    RMA packet types use different packet data structures.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index caf1f47..dec6fa9 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -204,9 +204,11 @@ MPIDI_CH3_PKT_DEFS
             datatype_ = pkt_.get.datatype;                              \
             break;                                                      \
         case (MPIDI_CH3_PKT_ACCUMULATE):                                \
-        case (MPIDI_CH3_PKT_GET_ACCUM):                                 \
             datatype_ = pkt_.accum.datatype;                            \
             break;                                                      \
+        case (MPIDI_CH3_PKT_GET_ACCUM):                                 \
+            datatype_ = pkt_.get_accum.datatype;                        \
+            break;                                                      \
         case (MPIDI_CH3_PKT_CAS):                                       \
             datatype_ = pkt_.cas.datatype;                              \
             break;                                                      \
@@ -271,7 +273,6 @@ typedef struct MPIDI_CH3_Pkt_get_resp {
 typedef struct MPIDI_CH3_Pkt_accum {
     MPIDI_CH3_Pkt_type_t type;
     MPIDI_CH3_Pkt_flags_t flags;
-    MPI_Request request_handle; /* For get_accumulate response */
     void *addr;
     int count;
     MPI_Datatype datatype;
@@ -286,6 +287,24 @@ typedef struct MPIDI_CH3_Pkt_accum {
                                  * with shared locks. Otherwise set to NULL*/
 } MPIDI_CH3_Pkt_accum_t;
 
+typedef struct MPIDI_CH3_Pkt_get_accum {
+    MPIDI_CH3_Pkt_type_t type;
+    MPIDI_CH3_Pkt_flags_t flags;
+    MPI_Request request_handle; /* For get_accumulate response */
+    void *addr;
+    int count;
+    MPI_Datatype datatype;
+    int dataloop_size;          /* for derived datatypes */
+    MPI_Op op;
+    MPI_Win target_win_handle;  /* Used in the last RMA operation in each
+                                 * epoch for decrementing rma op counter in
+                                 * active target rma and for unlocking window
+                                 * in passive target rma. Otherwise set to NULL*/
+    MPI_Win source_win_handle;  /* Used in the last RMA operation in an
+                                 * epoch in the case of passive target rma
+                                 * with shared locks. Otherwise set to NULL*/
+} MPIDI_CH3_Pkt_get_accum_t;
+
 typedef struct MPIDI_CH3_Pkt_get_accum_resp {
     MPIDI_CH3_Pkt_type_t type;
     MPI_Request request_handle;
@@ -362,6 +381,26 @@ typedef struct MPIDI_CH3_Pkt_lock {
     int origin_rank;
 } MPIDI_CH3_Pkt_lock_t;
 
+typedef struct MPIDI_CH3_Pkt_unlock {
+    MPIDI_CH3_Pkt_type_t type;
+    int lock_type;
+    MPI_Win target_win_handle;
+    MPI_Win source_win_handle;
+    int target_rank;            /* Used in unluck/flush response to look up the
+                                 * target state at the origin. */
+    int origin_rank;
+} MPIDI_CH3_Pkt_unlock_t;
+
+typedef struct MPIDI_CH3_Pkt_flush {
+    MPIDI_CH3_Pkt_type_t type;
+    int lock_type;
+    MPI_Win target_win_handle;
+    MPI_Win source_win_handle;
+    int target_rank;            /* Used in unluck/flush response to look up the
+                                 * target state at the origin. */
+    int origin_rank;
+} MPIDI_CH3_Pkt_flush_t;
+
 typedef struct MPIDI_CH3_Pkt_lock_granted {
     MPIDI_CH3_Pkt_type_t type;
     MPI_Win source_win_handle;
@@ -369,9 +408,12 @@ typedef struct MPIDI_CH3_Pkt_lock_granted {
                                  * target state at the origin. */
 } MPIDI_CH3_Pkt_lock_granted_t;
 
-typedef MPIDI_CH3_Pkt_lock_granted_t MPIDI_CH3_Pkt_flush_ack_t;
-typedef MPIDI_CH3_Pkt_lock_t MPIDI_CH3_Pkt_unlock_t;
-typedef MPIDI_CH3_Pkt_lock_t MPIDI_CH3_Pkt_flush_t;
+typedef struct MPIDI_CH3_Pkt_flush_ack {
+    MPIDI_CH3_Pkt_type_t type;
+    MPI_Win source_win_handle;
+    int target_rank;            /* Used in flush_ack response to look up the
+                                 * target state at the origin. */
+} MPIDI_CH3_Pkt_flush_ack_t;
 
 typedef struct MPIDI_CH3_Pkt_lock_put_unlock {
     MPIDI_CH3_Pkt_type_t type;
@@ -441,6 +483,7 @@ typedef union MPIDI_CH3_Pkt {
     MPIDI_CH3_Pkt_get_resp_t get_resp;
     MPIDI_CH3_Pkt_accum_t accum;
     MPIDI_CH3_Pkt_accum_immed_t accum_immed;
+    MPIDI_CH3_Pkt_get_accum_t get_accum;
     MPIDI_CH3_Pkt_lock_t lock;
     MPIDI_CH3_Pkt_lock_granted_t lock_granted;
     MPIDI_CH3_Pkt_unlock_t unlock;
diff --git a/src/mpid/ch3/src/ch3u_rma_oplist.c b/src/mpid/ch3/src/ch3u_rma_oplist.c
index 7f8b1c1..9b63cec 100644
--- a/src/mpid/ch3/src/ch3u_rma_oplist.c
+++ b/src/mpid/ch3/src/ch3u_rma_oplist.c
@@ -883,6 +883,7 @@ static int send_rma_msg(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr, MPIDI_CH3_P
 {
     MPIDI_CH3_Pkt_put_t *put_pkt = &rma_op->pkt.put;
     MPIDI_CH3_Pkt_accum_t *accum_pkt = &rma_op->pkt.accum;
+    MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &rma_op->pkt.get_accum;
     MPID_IOV iov[MPID_IOV_LIMIT];
     int mpi_errno = MPI_SUCCESS;
     int origin_dt_derived, target_dt_derived, iovcnt;
@@ -918,8 +919,8 @@ static int send_rma_msg(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr, MPIDI_CH3_P
         resp_req->dev.user_buf = rma_op->result_addr;
         resp_req->dev.user_count = rma_op->result_count;
         resp_req->dev.datatype = rma_op->result_datatype;
-        resp_req->dev.target_win_handle = accum_pkt->target_win_handle;
-        resp_req->dev.source_win_handle = accum_pkt->source_win_handle;
+        resp_req->dev.target_win_handle = get_accum_pkt->target_win_handle;
+        resp_req->dev.source_win_handle = get_accum_pkt->source_win_handle;
 
         if (!MPIR_DATATYPE_IS_PREDEFINED(resp_req->dev.datatype)) {
             MPID_Datatype *result_dtp = NULL;
@@ -929,12 +930,11 @@ static int send_rma_msg(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr, MPIDI_CH3_P
              * request is freed. */
         }
 
-        /* Note: Get_accumulate uses the same packet type as accumulate */
-        accum_pkt->request_handle = resp_req->handle;
+        get_accum_pkt->request_handle = resp_req->handle;
 
-        accum_pkt->flags = flags;
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) accum_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*accum_pkt);
+        get_accum_pkt->flags = flags;
+        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_pkt;
+        iov[0].MPID_IOV_LEN = sizeof(*get_accum_pkt);
     }
     else {
         accum_pkt->flags = flags;
@@ -998,9 +998,12 @@ static int send_rma_msg(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr, MPIDI_CH3_P
         if (rma_op->pkt.type == MPIDI_CH3_PKT_PUT) {
             put_pkt->dataloop_size = target_dtp->dataloop_size;
         }
-        else {
+        else if (rma_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE) {
             accum_pkt->dataloop_size = target_dtp->dataloop_size;
         }
+        else {
+            get_accum_pkt->dataloop_size = target_dtp->dataloop_size;
+        }
     }
 
     MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index af24f11..b6a9084 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -569,16 +569,16 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
         }
 
         else {
-            MPIDI_CH3_Pkt_accum_t *accum_pkt = &(new_ptr->pkt.accum);
-            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
-            accum_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
+            MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &(new_ptr->pkt.get_accum);
+            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
+            get_accum_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
                 win_ptr->disp_units[target_rank] * target_disp;
-            accum_pkt->count = target_count;
-            accum_pkt->datatype = target_datatype;
-            accum_pkt->dataloop_size = 0;
-            accum_pkt->op = op;
-            accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
-            accum_pkt->source_win_handle = win_ptr->handle;
+            get_accum_pkt->count = target_count;
+            get_accum_pkt->datatype = target_datatype;
+            get_accum_pkt->dataloop_size = 0;
+            get_accum_pkt->op = op;
+            get_accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
+            get_accum_pkt->source_win_handle = win_ptr->handle;
 
             new_ptr->origin_addr = (void *) origin_addr;
             new_ptr->origin_count = origin_count;
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index d58e8fe..03e34c5 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -456,7 +456,7 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                                        MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
 {
-    MPIDI_CH3_Pkt_accum_t *accum_pkt = &pkt->accum;
+    MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &pkt->get_accum;
     MPID_Request *req = NULL;
     MPI_Aint true_lb, true_extent, extent;
     void *tmp_buf = NULL;
@@ -472,9 +472,9 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received accumulate pkt");
 
-    MPIU_Assert(accum_pkt->target_win_handle != MPI_WIN_NULL);
-    MPID_Win_get_ptr(accum_pkt->target_win_handle, win_ptr);
-    mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, accum_pkt->flags);
+    MPIU_Assert(get_accum_pkt->target_win_handle != MPI_WIN_NULL);
+    MPID_Win_get_ptr(get_accum_pkt->target_win_handle, win_ptr);
+    mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, get_accum_pkt->flags);
 
     data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
     data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
@@ -483,35 +483,35 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPIU_Object_set_ref(req, 1);
     *rreqp = req;
 
-    req->dev.user_count = accum_pkt->count;
-    req->dev.op = accum_pkt->op;
-    req->dev.real_user_buf = accum_pkt->addr;
-    req->dev.target_win_handle = accum_pkt->target_win_handle;
-    req->dev.source_win_handle = accum_pkt->source_win_handle;
-    req->dev.flags = accum_pkt->flags;
+    req->dev.user_count = get_accum_pkt->count;
+    req->dev.op = get_accum_pkt->op;
+    req->dev.real_user_buf = get_accum_pkt->addr;
+    req->dev.target_win_handle = get_accum_pkt->target_win_handle;
+    req->dev.source_win_handle = get_accum_pkt->source_win_handle;
+    req->dev.flags = get_accum_pkt->flags;
 
-    req->dev.resp_request_handle = accum_pkt->request_handle;
+    req->dev.resp_request_handle = get_accum_pkt->request_handle;
 
-    if (MPIR_DATATYPE_IS_PREDEFINED(accum_pkt->datatype)) {
+    if (MPIR_DATATYPE_IS_PREDEFINED(get_accum_pkt->datatype)) {
         MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);
-        req->dev.datatype = accum_pkt->datatype;
+        req->dev.datatype = get_accum_pkt->datatype;
 
-        MPIR_Type_get_true_extent_impl(accum_pkt->datatype, &true_lb, &true_extent);
-        MPID_Datatype_get_extent_macro(accum_pkt->datatype, extent);
+        MPIR_Type_get_true_extent_impl(get_accum_pkt->datatype, &true_lb, &true_extent);
+        MPID_Datatype_get_extent_macro(get_accum_pkt->datatype, extent);
 
         /* Predefined types should always have zero lb */
         MPIU_Assert(true_lb == 0);
 
-        tmp_buf = MPIU_Malloc(accum_pkt->count * (MPIR_MAX(extent, true_extent)));
+        tmp_buf = MPIU_Malloc(get_accum_pkt->count * (MPIR_MAX(extent, true_extent)));
         if (!tmp_buf) {
             MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                 accum_pkt->count * MPIR_MAX(extent, true_extent));
+                                 get_accum_pkt->count * MPIR_MAX(extent, true_extent));
         }
 
         req->dev.user_buf = tmp_buf;
 
-        MPID_Datatype_get_size_macro(accum_pkt->datatype, type_size);
-        req->dev.recv_data_sz = type_size * accum_pkt->count;
+        MPID_Datatype_get_size_macro(get_accum_pkt->datatype, type_size);
+        req->dev.recv_data_sz = type_size * get_accum_pkt->count;
 
         mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
         MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
@@ -548,20 +548,20 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                                  "MPIDI_RMA_dtype_info");
         }
 
-        req->dev.dataloop = MPIU_Malloc(accum_pkt->dataloop_size);
+        req->dev.dataloop = MPIU_Malloc(get_accum_pkt->dataloop_size);
         if (!req->dev.dataloop) {
             MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                 accum_pkt->dataloop_size);
+                                 get_accum_pkt->dataloop_size);
         }
 
-        if (data_len >= sizeof(MPIDI_RMA_dtype_info) + accum_pkt->dataloop_size) {
+        if (data_len >= sizeof(MPIDI_RMA_dtype_info) + get_accum_pkt->dataloop_size) {
             /* copy all of dtype_info and dataloop */
             MPIU_Memcpy(req->dev.dtype_info, data_buf, sizeof(MPIDI_RMA_dtype_info));
             MPIU_Memcpy(req->dev.dataloop, data_buf + sizeof(MPIDI_RMA_dtype_info),
-                        accum_pkt->dataloop_size);
+                        get_accum_pkt->dataloop_size);
 
             *buflen =
-                sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) + accum_pkt->dataloop_size;
+                sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) + get_accum_pkt->dataloop_size;
 
             /* All dtype data has been received, call req handler */
             mpi_errno = MPIDI_CH3_ReqHandler_AccumRespDerivedDTComplete(vc, req, &complete);
@@ -576,7 +576,7 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dtype_info;
             req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
             req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
-            req->dev.iov[1].MPID_IOV_LEN = accum_pkt->dataloop_size;
+            req->dev.iov[1].MPID_IOV_LEN = get_accum_pkt->dataloop_size;
             req->dev.iov_count = 2;
             *buflen = sizeof(MPIDI_CH3_Pkt_t);
         }

http://git.mpich.org/mpich.git/commitdiff/bfbb10489eeff8fd251da576b613d153ffa33fe4

commit bfbb10489eeff8fd251da576b613d153ffa33fe4
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sat Nov 1 18:57:03 2014 -0500

    Separate pkt handler of ACC and GACC into two handlers.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index 875e97b..dbb02c9 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -462,6 +462,8 @@ extern MPIDI_Process_t MPIDI_Process;
 #define MPIDI_REQUEST_TYPE_ACCUM_RESP_DERIVED_DT 10
 #define MPIDI_REQUEST_TYPE_PT_SINGLE_PUT 11
 #define MPIDI_REQUEST_TYPE_PT_SINGLE_ACCUM 12
+#define MPIDI_REQUEST_TYPE_GET_ACCUM_RESP 13
+#define MPIDI_REQUEST_TYPE_GET_ACCUM_RESP_DERIVED_DT 14
 
 
 #define MPIDI_Request_get_type(req_)						\
@@ -1785,6 +1787,8 @@ int MPIDI_CH3_PktHandler_Put( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
 			      MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_Accumulate( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
 				     MPIDI_msg_sz_t *, MPID_Request ** );
+int MPIDI_CH3_PktHandler_GetAccumulate( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *,
+                                        MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_Accumulate_Immed( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
 				     MPIDI_msg_sz_t *, MPID_Request ** );
 int MPIDI_CH3_PktHandler_CAS( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, 
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_pkt.c b/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
index 755f5e2..3c80f5b 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_pkt.c
@@ -605,7 +605,7 @@ int MPIDI_CH3_PktHandler_Init( MPIDI_CH3_PktHandler_Fcn *pktArray[],
     pktArray[MPIDI_CH3_PKT_FOP_RESP] =
         MPIDI_CH3_PktHandler_FOPResp;
     pktArray[MPIDI_CH3_PKT_GET_ACCUM] =
-        MPIDI_CH3_PktHandler_Accumulate;
+        MPIDI_CH3_PktHandler_GetAccumulate;
     pktArray[MPIDI_CH3_PKT_GET_ACCUM_RESP] =
         MPIDI_CH3_PktHandler_Get_AccumResp;
     /* End of default RMA operations */
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index a0df5bb..d58e8fe 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -343,12 +343,7 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     req->dev.source_win_handle = accum_pkt->source_win_handle;
     req->dev.flags = accum_pkt->flags;
 
-    if (accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM) {
-        req->dev.resp_request_handle = accum_pkt->request_handle;
-    }
-    else {
-        req->dev.resp_request_handle = MPI_REQUEST_NULL;
-    }
+    req->dev.resp_request_handle = MPI_REQUEST_NULL;
 
     if (MPIR_DATATYPE_IS_PREDEFINED(accum_pkt->datatype)) {
         MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RESP);
@@ -454,6 +449,153 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
 }
 
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3_PktHandler_GetAccumulate
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
+                                       MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
+{
+    MPIDI_CH3_Pkt_accum_t *accum_pkt = &pkt->accum;
+    MPID_Request *req = NULL;
+    MPI_Aint true_lb, true_extent, extent;
+    void *tmp_buf = NULL;
+    int complete = 0;
+    char *data_buf = NULL;
+    MPIDI_msg_sz_t data_len;
+    MPID_Win *win_ptr;
+    int mpi_errno = MPI_SUCCESS;
+    MPI_Aint type_size;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_GETACCUMULATE);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_GETACCUMULATE);
+
+    MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "received accumulate pkt");
+
+    MPIU_Assert(accum_pkt->target_win_handle != MPI_WIN_NULL);
+    MPID_Win_get_ptr(accum_pkt->target_win_handle, win_ptr);
+    mpi_errno = MPIDI_CH3_Start_rma_op_target(win_ptr, accum_pkt->flags);
+
+    data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
+    data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
+
+    req = MPID_Request_create();
+    MPIU_Object_set_ref(req, 1);
+    *rreqp = req;
+
+    req->dev.user_count = accum_pkt->count;
+    req->dev.op = accum_pkt->op;
+    req->dev.real_user_buf = accum_pkt->addr;
+    req->dev.target_win_handle = accum_pkt->target_win_handle;
+    req->dev.source_win_handle = accum_pkt->source_win_handle;
+    req->dev.flags = accum_pkt->flags;
+
+    req->dev.resp_request_handle = accum_pkt->request_handle;
+
+    if (MPIR_DATATYPE_IS_PREDEFINED(accum_pkt->datatype)) {
+        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);
+        req->dev.datatype = accum_pkt->datatype;
+
+        MPIR_Type_get_true_extent_impl(accum_pkt->datatype, &true_lb, &true_extent);
+        MPID_Datatype_get_extent_macro(accum_pkt->datatype, extent);
+
+        /* Predefined types should always have zero lb */
+        MPIU_Assert(true_lb == 0);
+
+        tmp_buf = MPIU_Malloc(accum_pkt->count * (MPIR_MAX(extent, true_extent)));
+        if (!tmp_buf) {
+            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
+                                 accum_pkt->count * MPIR_MAX(extent, true_extent));
+        }
+
+        req->dev.user_buf = tmp_buf;
+
+        MPID_Datatype_get_size_macro(accum_pkt->datatype, type_size);
+        req->dev.recv_data_sz = type_size * accum_pkt->count;
+
+        mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
+        MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                             "**ch3|postrecv %s", "MPIDI_CH3_PKT_ACCUMULATE");
+        /* FIXME:  Only change the handling of completion if
+         * post_data_receive reset the handler.  There should
+         * be a cleaner way to do this */
+        if (!req->dev.OnDataAvail) {
+            req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutAccumRespComplete;
+        }
+        /* return the number of bytes processed in this function */
+        *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
+
+        if (complete) {
+            mpi_errno = MPIDI_CH3_ReqHandler_PutAccumRespComplete(vc, req, &complete);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+            if (complete) {
+                *rreqp = NULL;
+                goto fn_exit;
+            }
+        }
+    }
+    else {
+        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP_DERIVED_DT);
+        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumRespDerivedDTComplete;
+        req->dev.datatype = MPI_DATATYPE_NULL;
+        req->dev.OnFinal = MPIDI_CH3_ReqHandler_PutAccumRespComplete;
+
+        req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
+            MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
+        if (!req->dev.dtype_info) {
+            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
+                                 "MPIDI_RMA_dtype_info");
+        }
+
+        req->dev.dataloop = MPIU_Malloc(accum_pkt->dataloop_size);
+        if (!req->dev.dataloop) {
+            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
+                                 accum_pkt->dataloop_size);
+        }
+
+        if (data_len >= sizeof(MPIDI_RMA_dtype_info) + accum_pkt->dataloop_size) {
+            /* copy all of dtype_info and dataloop */
+            MPIU_Memcpy(req->dev.dtype_info, data_buf, sizeof(MPIDI_RMA_dtype_info));
+            MPIU_Memcpy(req->dev.dataloop, data_buf + sizeof(MPIDI_RMA_dtype_info),
+                        accum_pkt->dataloop_size);
+
+            *buflen =
+                sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) + accum_pkt->dataloop_size;
+
+            /* All dtype data has been received, call req handler */
+            mpi_errno = MPIDI_CH3_ReqHandler_AccumRespDerivedDTComplete(vc, req, &complete);
+            MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                                 "**ch3|postrecv %s", "MPIDI_CH3_ACCUMULATE");
+            if (complete) {
+                *rreqp = NULL;
+                goto fn_exit;
+            }
+        }
+        else {
+            req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dtype_info;
+            req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
+            req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
+            req->dev.iov[1].MPID_IOV_LEN = accum_pkt->dataloop_size;
+            req->dev.iov_count = 2;
+            *buflen = sizeof(MPIDI_CH3_Pkt_t);
+        }
+
+    }
+
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                             "**ch3|postrecv %s", "MPIDI_CH3_PKT_ACCUMULATE");
+    }
+
+  fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_GETACCUMULATE);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
 /* Special accumulate for short data items entirely within the packet */
 #undef FUNCNAME
 #define FUNCNAME MPIDI_CH3_PktHandler_Accumulate_Immed

http://git.mpich.org/mpich.git/commitdiff/38b20e57086e7a0b87aa3fd5a4b29f793c0b789d

commit 38b20e57086e7a0b87aa3fd5a4b29f793c0b789d
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Nov 2 22:27:49 2014 -0600

    Rewrite all synchronization routines.
    
    We use new algorithms for RMA synchronization
    functions and RMA epochs. The old implementation
    uses a lazy-issuing algorithm, which queues up
    all operations and issues them at the end. This
    forbids opportunities to do hardware RMA operations
    and can use up all memory resources when we
    queue up a large number of operations.
    
    Here we use a new algorithm, which will initialize
    the synchronization at the beginning, and issue operations
    as soon as the synchronization is finished.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/include/mpid_nem_inline.h b/src/mpid/ch3/channels/nemesis/include/mpid_nem_inline.h
index 7f21c6d..f858ac3 100644
--- a/src/mpid/ch3/channels/nemesis/include/mpid_nem_inline.h
+++ b/src/mpid/ch3/channels/nemesis/include/mpid_nem_inline.h
@@ -16,6 +16,8 @@
 
 extern int MPID_nem_lmt_shm_pending;
 extern MPID_nem_cell_ptr_t MPID_nem_prefetched_cell;
+extern int num_active_issued_win;
+extern int num_passive_win;
 
 static inline int MPID_nem_mpich_send_header (void* buf, int size, MPIDI_VC_t *vc, int *again);
 static inline int MPID_nem_mpich_sendv (MPID_IOV **iov, int *n_iov, MPIDI_VC_t *vc, int *again);
@@ -39,7 +41,8 @@ static inline void MPID_nem_mpich_send_seg (MPID_Segment *segment, MPIDI_msg_sz_
     (!MPID_nem_local_lmt_pending &&             \
      !MPIDI_CH3I_shm_active_send &&             \
      !MPIDI_CH3I_Sendq_head(MPIDI_CH3I_shm_sendq) &&       \
-     !MPIDU_Sched_are_pending())
+     !MPIDU_Sched_are_pending() &&              \
+     !num_active_issued_win && !num_passive_win)
 
 #undef FUNCNAME
 #define FUNCNAME MPID_nem_mpich_send_header
diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_progress.c b/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
index 3df4f8e..569cfa1 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
@@ -472,11 +472,13 @@ int MPIDI_CH3I_Progress (MPID_Progress_state *progress_state, int is_blocking)
 #endif /* HAVE_LIBHCOLL */
 
         /* make progress on RMA */
+        if (num_active_issued_win > 0 || num_passive_win > 0) {
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_global(&made_progress);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
         if (made_progress)
             MPIDI_CH3_Progress_signal_completion();
+        }
 
         /* in the case of progress_wait, bail out if anything completed (CC-1) */
         if (is_blocking) {
diff --git a/src/mpid/ch3/channels/sock/include/mpidi_ch3_impl.h b/src/mpid/ch3/channels/sock/include/mpidi_ch3_impl.h
index 3d34a4b..e1689c9 100644
--- a/src/mpid/ch3/channels/sock/include/mpidi_ch3_impl.h
+++ b/src/mpid/ch3/channels/sock/include/mpidi_ch3_impl.h
@@ -10,6 +10,8 @@
 #include "mpidimpl.h"
 #include "ch3usock.h"
 
+extern int num_active_issued_win;
+extern int num_passive_win;
 
 /* This is all socket connection definitions */
 
diff --git a/src/mpid/ch3/channels/sock/src/ch3_progress.c b/src/mpid/ch3/channels/sock/src/ch3_progress.c
index c0cab1a..ead5e47 100644
--- a/src/mpid/ch3/channels/sock/src/ch3_progress.c
+++ b/src/mpid/ch3/channels/sock/src/ch3_progress.c
@@ -96,9 +96,11 @@ static int MPIDI_CH3i_Progress_test(void)
 #endif /* HAVE_LIBHCOLL */
 
     /* make progress on RMA */
+    if (num_active_issued_win > 0 || num_passive_win > 0) {
     mpi_errno = MPIDI_CH3I_RMA_Make_progress_global(&made_progress);
     if (mpi_errno)
         MPIU_ERR_POP(mpi_errno);
+    }
 
     mpi_errno = MPIDU_Sock_wait(MPIDI_CH3I_sock_set, 0, &event);
 
@@ -209,6 +211,7 @@ static int MPIDI_CH3i_Progress_wait(MPID_Progress_state * progress_state)
 #endif /* HAVE_LIBHCOLL */
 
         /* make progress on RMA */
+        if (num_active_issued_win > 0 || num_passive_win > 0) {
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_global(&made_progress);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
@@ -216,6 +219,7 @@ static int MPIDI_CH3i_Progress_wait(MPID_Progress_state * progress_state)
             MPIDI_CH3_Progress_signal_completion();
             break;
         }
+        }
 
 #       ifdef MPICH_IS_THREADED
 
diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 1bb0a54..f79d384 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -18,6 +18,8 @@ int MPIDI_CH3I_RMA_Make_progress_win(MPID_Win * win_ptr, int *made_progress);
 
 extern struct MPIDI_RMA_Op *global_rma_op_pool, *global_rma_op_pool_tail, *global_rma_op_pool_start;
 extern struct MPIDI_RMA_Target *global_rma_target_pool, *global_rma_target_pool_tail, *global_rma_target_pool_start;
+extern int num_active_issued_win;
+extern int num_passive_win;
 
 /* MPIDI_CH3I_Win_op_alloc(): get a new op element from op pool and
  * initialize it. If we cannot get one, return NULL. */
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index aba695e..93856ab 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -353,14 +353,6 @@ struct MPIDI_Win_target_state {
     struct MPIDI_RMA_Op *at_rma_ops_list_tail;                           \
     enum MPIDI_Win_epoch_states epoch_state;                             \
     int epoch_count;                                                     \
-    int fence_issued;   /* Indicates if fence has been called, and if an \
-                           active target fence epoch is possible. This   \
-                           is maintained separately from the epoch state;\
-                           this state must be updated collectively (in   \
-                           fence) to ensure that the fence state across  \
-                           all processes remains consistent. */          \
-    MPID_Group *start_group_ptr; /* group passed in MPI_Win_start */     \
-    int start_assert;   /* assert passed to MPI_Win_start */             \
     int shm_allocated; /* flag: TRUE iff this window has a shared memory \
                           region associated with it */                   \
     struct MPIDI_RMA_Op *op_pool_start; /* start pointer used for freeing */\
@@ -383,6 +375,16 @@ struct MPIDI_Win_target_state {
     int active_req_cnt; /* keep track of number of active requests in    \
                            current epoch, i.e., number of issued but     \
                            incomplete RMA operations. */                 \
+    MPI_Request fence_sync_req;                                          \
+    MPI_Request *start_req;                                              \
+    int *start_ranks_in_win_grp;                                         \
+    int start_grp_size;                                                  \
+    int lock_all_assert;                                                 \
+    int lock_epoch_count; /* number of lock access epochs on this process */ \
+    int outstanding_locks; /* when issuing multiple lock requests in     \
+                            MPI_WIN_LOCK_ALL, this counter keeps track   \
+                            of number of locks not being granted yet. */ \
+    int outstanding_unlocks;                                             \
 
 #ifdef MPIDI_CH3_WIN_DECL
 #define MPID_DEV_WIN_DECL \
diff --git a/src/mpid/ch3/src/ch3u_rma_oplist.c b/src/mpid/ch3/src/ch3u_rma_oplist.c
index 01afed9..7f8b1c1 100644
--- a/src/mpid/ch3/src/ch3u_rma_oplist.c
+++ b/src/mpid/ch3/src/ch3u_rma_oplist.c
@@ -124,6 +124,9 @@ static inline int check_window_state(MPID_Win *win_ptr, int *made_progress, int
             MPID_Request_release(fence_req_ptr);
             win_ptr->fence_sync_req = MPI_REQUEST_NULL;
 
+            num_active_issued_win--;
+            MPIU_Assert(num_active_issued_win >= 0);
+
             (*made_progress) = 1;
         }
         else {
@@ -137,6 +140,9 @@ static inline int check_window_state(MPID_Win *win_ptr, int *made_progress, int
                we do not create PSCW requests on window. */
             win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;
 
+            num_active_issued_win--;
+            MPIU_Assert(num_active_issued_win >= 0);
+
             (*made_progress) = 1;
         }
         else {
@@ -157,6 +163,9 @@ static inline int check_window_state(MPID_Win *win_ptr, int *made_progress, int
             MPIU_Assert(i == win_ptr->start_grp_size);
             win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;
 
+            num_active_issued_win--;
+            MPIU_Assert(num_active_issued_win >= 0);
+
             (*made_progress) = 1;
 
             MPIU_Free(win_ptr->start_req);
diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index 13c88ff..be20bd2 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -363,239 +363,134 @@ static inline int rma_list_gc(MPID_Win * win_ptr,
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
 {
-    int mpi_errno = MPI_SUCCESS;
-    int comm_size;
-    int *rma_target_proc, *nops_to_proc, i, total_op_count, *curr_ops_cnt;
-    MPIDI_RMA_Op_t *curr_ptr;
-    MPIDI_RMA_Ops_list_t *ops_list;
-    MPIDI_RMA_Ops_list_t *ops_list_tail;
-    MPID_Comm *comm_ptr;
-    MPID_Progress_state progress_state;
+    int i, made_progress = 0;
+    int local_completed = 0, remote_completed = 0;
+    MPIDI_RMA_Target_t *curr_target = NULL;
     int errflag = FALSE;
-    MPIU_CHKLMEM_DECL(3);
+    int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FENCE);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FENCE);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_NONE &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_FENCE,
+    MPIU_ERR_CHKANDJUMP((win_ptr->states.access_state != MPIDI_RMA_NONE &&
+                         win_ptr->states.access_state != MPIDI_RMA_FENCE_ISSUED &&
+                         win_ptr->states.access_state != MPIDI_RMA_FENCE_GRANTED) ||
+                        win_ptr->states.exposure_state != MPIDI_RMA_NONE,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    /* Note that the NOPRECEDE and NOSUCCEED must be specified by all processes
-     * in the window's group if any specify it */
-    if (assert & MPI_MODE_NOPRECEDE) {
-        /* Error: Operations were issued and the user claimed NOPRECEDE */
-        MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_FENCE,
-                            mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
-
-        win_ptr->fence_issued = (assert & MPI_MODE_NOSUCCEED) ? 0 : 1;
-        goto shm_barrier;
-    }
-
-    if (win_ptr->fence_issued == 0) {
-        /* win_ptr->fence_issued == 0 means either this is the very first
-         * call to fence or the preceding fence had the
-         * MPI_MODE_NOSUCCEED assert.
-         *
-         * If this fence has MPI_MODE_NOSUCCEED, do nothing and return.
-         * Otherwise just increment the fence count and return. */
-
-        if (!(assert & MPI_MODE_NOSUCCEED))
-            win_ptr->fence_issued = 1;
-    }
-    else {
-        int nRequest = 0;
-        int nRequestNew = 0;
-
-        /* Ensure ordering of load/store operations. */
-        if (win_ptr->shm_allocated == TRUE) {
-            OPA_read_write_barrier();
-        }
+    win_ptr->posted_ops_cnt = 0;
 
-        /* This is the second or later fence. Do all the preceding RMA ops. */
-        comm_ptr = win_ptr->comm_ptr;
-        /* First inform every process whether it is a target of RMA
-         * ops from this process */
-        comm_size = comm_ptr->local_size;
-
-        MPIU_CHKLMEM_MALLOC(rma_target_proc, int *, comm_size * sizeof(int),
-                            mpi_errno, "rma_target_proc");
-        for (i = 0; i < comm_size; i++)
-            rma_target_proc[i] = 0;
-
-        /* keep track of no. of ops to each proc. Needed for knowing
-         * whether or not to decrement the completion counter. The
-         * completion counter is decremented only on the last
-         * operation. */
-        MPIU_CHKLMEM_MALLOC(nops_to_proc, int *, comm_size * sizeof(int),
-                            mpi_errno, "nops_to_proc");
-        for (i = 0; i < comm_size; i++)
-            nops_to_proc[i] = 0;
-
-        /* Note, active target uses the following ops list, and passive
-         * target uses win_ptr->targets[..] */
-        ops_list = &win_ptr->at_rma_ops_list;
-        ops_list_tail = &win_ptr->at_rma_ops_list_tail;
-
-        /* set rma_target_proc[i] to 1 if rank i is a target of RMA
-         * ops from this process */
-        total_op_count = 0;
-        curr_ptr = MPIDI_CH3I_RMA_Ops_head(ops_list);
-        while (curr_ptr != NULL) {
-            total_op_count++;
-            rma_target_proc[curr_ptr->target_rank] = 1;
-            nops_to_proc[curr_ptr->target_rank]++;
-            curr_ptr = curr_ptr->next;
+    if (assert & MPI_MODE_NOPRECEDE) {
+        if (assert & MPI_MODE_NOSUCCEED) {
+            goto fn_exit;
         }
+        else {
+            /* An IBARRIER issued by an earlier MPI_WIN_FENCE with
+               MODE_NOPRECEDE may still be incomplete; we let the progress
+               engine delete its request once it has completed. */
+            if (win_ptr->fence_sync_req != MPI_REQUEST_NULL) {
+                MPID_Request *req_ptr;
+                MPID_Request_get_ptr(win_ptr->fence_sync_req, req_ptr);
+                MPID_Request_release(req_ptr);
+                win_ptr->fence_sync_req = MPI_REQUEST_NULL;
+                win_ptr->states.access_state = MPIDI_RMA_NONE;
+            }
 
-        MPIU_CHKLMEM_MALLOC(curr_ops_cnt, int *, comm_size * sizeof(int),
-                            mpi_errno, "curr_ops_cnt");
-        for (i = 0; i < comm_size; i++)
-            curr_ops_cnt[i] = 0;
-        /* do a reduce_scatter_block (with MPI_SUM) on rma_target_proc.
-         * As a result,
-         * each process knows how many other processes will be doing
-         * RMA ops on its window */
-
-        /* first initialize the completion counter. */
-        win_ptr->at_completion_counter += comm_size;
-
-        mpi_errno = MPIR_Reduce_scatter_block_impl(MPI_IN_PLACE, rma_target_proc, 1,
-                                                   MPI_INT, MPI_SUM, comm_ptr, &errflag);
-        /* result is stored in rma_target_proc[0] */
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
-        }
-        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+            if (win_ptr->shm_allocated == TRUE) {
+                MPID_Comm *node_comm_ptr = win_ptr->comm_ptr->node_comm;
 
-        /* Ensure ordering of load/store operations. */
-        if (win_ptr->shm_allocated == TRUE) {
-            OPA_read_write_barrier();
-        }
+                /* Ensure ordering of load/store operations. */
+                OPA_read_write_barrier();
 
-        /* Set the completion counter */
-        /* FIXME: MT: this needs to be done atomically because other
-         * procs have the address and could decrement it. */
-        win_ptr->at_completion_counter -= comm_size;
-        win_ptr->at_completion_counter += rma_target_proc[0];
-
-        i = 0;
-        curr_ptr = MPIDI_CH3I_RMA_Ops_head(ops_list);
-        while (curr_ptr != NULL) {
-            MPIDI_CH3_Pkt_flags_t flags = MPIDI_CH3_PKT_FLAG_NONE;
+                mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
+                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
-            /* The completion counter at the target is decremented only on
-             * the last RMA operation. */
-            if (curr_ops_cnt[curr_ptr->target_rank] == nops_to_proc[curr_ptr->target_rank] - 1) {
-                flags = MPIDI_CH3_PKT_FLAG_RMA_AT_COMPLETE;
+                /* Ensure ordering of load/store operations. */
+                OPA_read_write_barrier();
             }
 
-            mpi_errno = MPIDI_CH3I_Issue_rma_op(curr_ptr, win_ptr, flags);
-            if (mpi_errno)
-                MPIU_ERR_POP(mpi_errno);
+            mpi_errno = MPIR_Ibarrier_impl(win_ptr->comm_ptr, &(win_ptr->fence_sync_req));
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-            i++;
-            curr_ops_cnt[curr_ptr->target_rank]++;
-            /* If the request is null, we can remove it immediately */
-            if (!curr_ptr->request) {
-                MPIDI_CH3I_RMA_Ops_free_and_next(win_ptr, ops_list, ops_list_tail, &curr_ptr);
-            }
-            else {
-                nRequest++;
-                curr_ptr = curr_ptr->next;
-                /* The test on the difference is to reduce the number
-                 * of times the partial complete routine is called. Without
-                 * this, significant overhead is added once the
-                 * number of requests exceeds the threshold, since the
-                 * number that are completed in a call may be small. */
-                if (nRequest > MPIR_CVAR_CH3_RMA_NREQUEST_THRESHOLD &&
-                    nRequest - nRequestNew > MPIR_CVAR_CH3_RMA_NREQUEST_NEW_THRESHOLD) {
-                    int nDone = 0;
-                    mpi_errno = poke_progress_engine();
-                    if (mpi_errno != MPI_SUCCESS)
-                        MPIU_ERR_POP(mpi_errno);
+            win_ptr->states.access_state = MPIDI_RMA_FENCE_ISSUED;
+            num_active_issued_win++;
 
-                    mpi_errno = rma_list_gc(win_ptr, ops_list, ops_list_tail, curr_ptr, &nDone);
-                    if (mpi_errno != MPI_SUCCESS)
-                        MPIU_ERR_POP(mpi_errno);
-                    /* if (nDone > 0) printf("nDone = %d\n", nDone); */
-                    nRequest -= nDone;
-                    nRequestNew = nRequest;
-                }
-            }
+            goto fn_exit;
         }
+    }
 
-        /* We replaced a loop over an array of requests with a list of the
-         * incomplete requests.  The reason to do
-         * that is for long lists - processing the entire list until
-         * all are done introduces a potentially n^2 time.  In
-         * testing with test/mpi/perf/manyrma.c , the number of iterations
-         * within the "while (total_op_count) was O(total_op_count).
-         *
-         * Another alternative is to create a more compressed list (storing
-         * only the necessary information, reducing the number of cache lines
-         * needed while looping through the requests.
-         */
-        if (total_op_count) {
-            mpi_errno = rma_list_complete(win_ptr, ops_list, ops_list_tail);
+    if (win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED) {
+        while (win_ptr->states.access_state != MPIDI_RMA_FENCE_GRANTED) {
+            mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
         }
+    }
 
-        /* MT: avoid processing unissued operations enqueued by other threads
-           in rma_list_complete() */
-        curr_ptr = MPIDI_CH3I_RMA_Ops_head(ops_list);
-        if (curr_ptr && !curr_ptr->request)
-            goto finish_up;
-        MPIU_Assert(MPIDI_CH3I_RMA_Ops_isempty(ops_list));
+    /* Set sync_flag in target structs. */
+    for (i = 0; i < win_ptr->num_slots; i++) {
+        curr_target = win_ptr->slots[i].target_list;
+        while (curr_target != NULL) {
 
- finish_up:
-	/* wait for all operations from other processes to finish */
-        if (win_ptr->at_completion_counter) {
-            MPID_Progress_start(&progress_state);
-            while (win_ptr->at_completion_counter) {
-                mpi_errno = MPID_Progress_wait(&progress_state);
-                /* --BEGIN ERROR HANDLING-- */
-                if (mpi_errno != MPI_SUCCESS) {
-                    MPID_Progress_end(&progress_state);
-                    MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
-                }
-                /* --END ERROR HANDLING-- */
+            /* set sync_flag in sync struct */
+            if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
+                curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
+                curr_target->sync.have_remote_incomplete_ops = 0;
+                curr_target->sync.outstanding_acks++;
             }
-            MPID_Progress_end(&progress_state);
+            curr_target = curr_target->next;
         }
+    }
 
-        if (assert & MPI_MODE_NOSUCCEED) {
-            win_ptr->fence_issued = 0;
+    /* Issue out all operations. */
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    /* Wait for remote completion. */
+    do {
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr,
+                                                   &local_completed,
+                                                   &remote_completed);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (!remote_completed) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
         }
+    } while (!remote_completed);
 
-        win_ptr->epoch_state = MPIDI_EPOCH_NONE;
-    }
+    /* Cleanup all targets on window. */
+    mpi_errno = MPIDI_CH3I_RMA_Cleanup_targets_win(win_ptr);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
- shm_barrier:
-    if (!(assert & MPI_MODE_NOSUCCEED)) {
-        /* In a FENCE without MPI_MODE_NOSUCCEED (which means this FENCE
-           might start a new Active epoch), if SHM is allocated, perform
-           a barrier among processes on the same node, to prevent one
-           process modifying another process's memory before that process
-           starts an epoch. */
+    MPIU_Assert(win_ptr->non_empty_slots == 0);
 
-        if (win_ptr->shm_allocated == TRUE) {
-            MPID_Comm *node_comm_ptr = win_ptr->comm_ptr->node_comm;
+    /* Ensure ordering of load/store operations. */
+    if (win_ptr->shm_allocated == TRUE) {
+        OPA_read_write_barrier();
+    }
 
-            /* Ensure ordering of load/store operations. */
-            OPA_read_write_barrier();
+    mpi_errno = MPIR_Barrier_impl(win_ptr->comm_ptr, &errflag);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
-            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
-            if (mpi_errno) {goto fn_fail;}
+    /* Ensure ordering of load/store operations. */
+    if (win_ptr->shm_allocated == TRUE) {
+        OPA_read_write_barrier();
+    }
 
-            /* Ensure ordering of load/store operations. */
-            OPA_read_write_barrier();
-        }
+    if (assert & MPI_MODE_NOSUCCEED) {
+        win_ptr->states.access_state = MPIDI_RMA_NONE;
     }
+    else {
+        win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
+    }
+
+    /* There should be no active requests. */
+    MPIU_Assert(win_ptr->active_req_cnt == 0);
 
   fn_exit:
-    MPIU_CHKLMEM_FREEALL();
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FENCE);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
@@ -605,106 +500,107 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
 }
 
 
+static int fill_ranks_in_win_grp(MPID_Win *win_ptr, MPID_Group *group_ptr, int *ranks_in_win_grp)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int i, *ranks_in_grp;
+    MPID_Group *win_grp_ptr;
+    MPIU_CHKLMEM_DECL(1);
+    MPIDI_STATE_DECL(MPID_STATE_FILL_RANKS_IN_WIN_GRP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_FILL_RANKS_IN_WIN_GRP);
+
+    MPIU_CHKLMEM_MALLOC(ranks_in_grp, int *, group_ptr->size * sizeof(int),
+                        mpi_errno, "ranks_in_grp");
+    for (i = 0; i < group_ptr->size; i++) ranks_in_grp[i] = i;
+
+    mpi_errno = MPIR_Comm_group_impl(win_ptr->comm_ptr, &win_grp_ptr);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIR_Group_translate_ranks_impl(group_ptr, group_ptr->size,
+                                                ranks_in_grp, win_grp_ptr, ranks_in_win_grp);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    mpi_errno = MPIR_Group_free_impl(win_grp_ptr);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+  fn_exit:
+    MPIU_CHKLMEM_FREEALL();
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_FILL_RANKS_IN_WIN_GRP);
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+
 #undef FUNCNAME
 #define FUNCNAME MPIDI_Win_post
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_Win_post(MPID_Group * post_grp_ptr, int assert, MPID_Win * win_ptr)
 {
+    int *post_ranks_in_win_grp;
     int mpi_errno = MPI_SUCCESS;
-    MPID_Group *win_grp_ptr;
-    int i, post_grp_size, *ranks_in_post_grp, *ranks_in_win_grp, dst, rank;
-    MPID_Comm *win_comm_ptr;
-    MPIU_CHKLMEM_DECL(4);
+    MPIU_CHKLMEM_DECL(3);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_POST);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_POST);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_NONE &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_START,
+    /* Note that here we cannot tell whether this exposure epoch overlaps with
+       an exposure epoch of FENCE (which is not allowed), since a FENCE epoch
+       may end without resetting the window state. We can only detect whether
+       this exposure epoch overlaps with another exposure epoch of PSCW. */
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.exposure_state != MPIDI_RMA_NONE,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    /* Track access epoch state */
-    if (win_ptr->epoch_state == MPIDI_EPOCH_START)
-        win_ptr->epoch_state = MPIDI_EPOCH_PSCW;
-    else
-        win_ptr->epoch_state = MPIDI_EPOCH_POST;
+    win_ptr->states.exposure_state = MPIDI_RMA_PSCW_EXPO;
 
-    /* Even though we would want to reset the fence counter to keep
-     * the user from using the previous fence to mark the beginning of
-     * a fence epoch if he switched from fence to lock-unlock
-     * synchronization, we cannot do this because fence_issued must be
-     * updated collectively */
-
-    post_grp_size = post_grp_ptr->size;
+    win_ptr->at_completion_counter += post_grp_ptr->size;
 
     /* Ensure ordering of load/store operations. */
     if (win_ptr->shm_allocated == TRUE) {
         OPA_read_write_barrier();
     }
 
-    /* initialize the completion counter */
-    win_ptr->at_completion_counter += post_grp_size;
-
     if ((assert & MPI_MODE_NOCHECK) == 0) {
         MPI_Request *req;
         MPI_Status *status;
+        int i, post_grp_size, dst, rank;
+        MPID_Comm *win_comm_ptr;
 
         /* NOCHECK not specified. We need to notify the source
          * processes that Post has been called. */
 
-        /* We need to translate the ranks of the processes in
-         * post_group to ranks in win_ptr->comm_ptr, so that we
-         * can do communication */
-
-        MPIU_CHKLMEM_MALLOC(ranks_in_post_grp, int *,
-                            post_grp_size * sizeof(int), mpi_errno, "ranks_in_post_grp");
-        MPIU_CHKLMEM_MALLOC(ranks_in_win_grp, int *,
-                            post_grp_size * sizeof(int), mpi_errno, "ranks_in_win_grp");
-
-        for (i = 0; i < post_grp_size; i++) {
-            ranks_in_post_grp[i] = i;
-        }
-
+        post_grp_size = post_grp_ptr->size;
         win_comm_ptr = win_ptr->comm_ptr;
-
-        mpi_errno = MPIR_Comm_group_impl(win_comm_ptr, &win_grp_ptr);
-        if (mpi_errno)
-            MPIU_ERR_POP(mpi_errno);
-
-
-        mpi_errno = MPIR_Group_translate_ranks_impl(post_grp_ptr, post_grp_size, ranks_in_post_grp,
-                                                    win_grp_ptr, ranks_in_win_grp);
-        if (mpi_errno)
-            MPIU_ERR_POP(mpi_errno);
-
         rank = win_ptr->comm_ptr->rank;
 
-        MPIU_CHKLMEM_MALLOC(req, MPI_Request *, post_grp_size * sizeof(MPI_Request), mpi_errno,
-                            "req");
-        MPIU_CHKLMEM_MALLOC(status, MPI_Status *, post_grp_size * sizeof(MPI_Status), mpi_errno,
-                            "status");
+        MPIU_CHKLMEM_MALLOC(post_ranks_in_win_grp, int *,
+                            post_grp_size * sizeof(int), mpi_errno, "post_ranks_in_win_grp");
+        mpi_errno = fill_ranks_in_win_grp(win_ptr, post_grp_ptr, post_ranks_in_win_grp);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        MPIU_CHKLMEM_MALLOC(req, MPI_Request *, post_grp_size * sizeof(MPI_Request),
+                            mpi_errno, "req");
+        MPIU_CHKLMEM_MALLOC(status, MPI_Status *, post_grp_size * sizeof(MPI_Status),
+                            mpi_errno, "status");
 
         /* Send a 0-byte message to the source processes */
         for (i = 0; i < post_grp_size; i++) {
-            dst = ranks_in_win_grp[i];
+            dst = post_ranks_in_win_grp[i];
 
-            /* FIXME: Short messages like this shouldn't normally need a
-             * request - this should consider using the ch3 call to send
-             * a short message and return a request only if the message is
-             * not delivered. */
             if (dst != rank) {
                 MPID_Request *req_ptr;
                 mpi_errno = MPID_Isend(&i, 0, MPI_INT, dst, SYNC_POST_TAG, win_comm_ptr,
                                        MPID_CONTEXT_INTRA_PT2PT, &req_ptr);
-                if (mpi_errno)
-                    MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
                 req[i] = req_ptr->handle;
             }
             else {
                 req[i] = MPI_REQUEST_NULL;
             }
         }
+
         mpi_errno = MPIR_Waitall_impl(post_grp_size, req, status);
         if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS)
             MPIU_ERR_POP(mpi_errno);
@@ -719,10 +615,6 @@ int MPIDI_Win_post(MPID_Group * post_grp_ptr, int assert, MPID_Win * win_ptr)
             }
         }
         /* --END ERROR HANDLING-- */
-
-        mpi_errno = MPIR_Group_free_impl(win_grp_ptr);
-        if (mpi_errno)
-            MPIU_ERR_POP(mpi_errno);
     }
 
   fn_exit:
@@ -736,186 +628,124 @@ int MPIDI_Win_post(MPID_Group * post_grp_ptr, int assert, MPID_Win * win_ptr)
 }
 
 
-static int recv_post_msgs(MPID_Win * win_ptr, int *ranks_in_win_grp, int local)
+#undef FUNCNAME
+#define FUNCNAME MPIDI_Win_start
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPIDI_Win_start(MPID_Group * group_ptr, int assert, MPID_Win * win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
-    int start_grp_size, src, rank, i, j;
-    MPI_Request *req;
-    MPI_Status *status;
-    MPID_Comm *comm_ptr = win_ptr->comm_ptr;
-    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
     MPIU_CHKLMEM_DECL(2);
-    MPIDI_STATE_DECL(MPID_STATE_RECV_POST_MSGS);
+    MPIU_CHKPMEM_DECL(2);
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_START);
 
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_RECV_POST_MSGS);
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_START);
 
-    /* Wait for 0-byte messages from processes either on the same node
-     * or not (depending on the "local" parameter), so we know they
-     * have entered post. */
+    /* Note that here we cannot tell whether this access epoch overlaps with
+       an access epoch of FENCE (which is not allowed), since a FENCE epoch
+       may end without resetting the window state. We can only detect whether
+       this access epoch overlaps with another access epoch of PSCW or
+       Passive Target. */
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_NONE &&
+                        win_ptr->states.access_state != MPIDI_RMA_FENCE_ISSUED &&
+                        win_ptr->states.access_state != MPIDI_RMA_FENCE_GRANTED,
+                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    start_grp_size = win_ptr->start_group_ptr->size;
+    win_ptr->start_grp_size = group_ptr->size;
 
-    rank = win_ptr->comm_ptr->rank;
-    MPIU_CHKLMEM_MALLOC(req, MPI_Request *, start_grp_size * sizeof(MPI_Request), mpi_errno, "req");
-    MPIU_CHKLMEM_MALLOC(status, MPI_Status *, start_grp_size * sizeof(MPI_Status), mpi_errno,
-                        "status");
+    if ((assert & MPI_MODE_NOCHECK) == 0) {
+        int i, intra_cnt, inter_cnt;
+        MPI_Request *intra_start_req = NULL;
+        MPI_Status *intra_start_status = NULL;
+        MPID_Comm *comm_ptr = win_ptr->comm_ptr;
+        int rank = comm_ptr->rank;
+
+        /* wait for messages from local processes */
+        MPIU_CHKPMEM_MALLOC(win_ptr->start_ranks_in_win_grp, int *, win_ptr->start_grp_size * sizeof(int),
+                            mpi_errno, "win_ptr->start_ranks_in_win_grp");
+        mpi_errno = fill_ranks_in_win_grp(win_ptr, group_ptr, win_ptr->start_ranks_in_win_grp);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
-    j = 0;
-    for (i = 0; i < start_grp_size; i++) {
-        src = ranks_in_win_grp[i];
+        /* post IRECVs */
+        MPIU_CHKPMEM_MALLOC(win_ptr->start_req, MPI_Request *,
+                            win_ptr->start_grp_size * sizeof(MPI_Request),
+                            mpi_errno, "win_ptr->start_req");
 
-        if (src == rank)
-            continue;
+        if (win_ptr->shm_allocated == TRUE) {
+            int node_comm_size = comm_ptr->node_comm->local_size;
+            MPIU_CHKLMEM_MALLOC(intra_start_req, MPI_Request *,
+                                node_comm_size * sizeof(MPI_Request),
+                                mpi_errno, "intra_start_req");
+            MPIU_CHKLMEM_MALLOC(intra_start_status, MPI_Status *,
+                                node_comm_size * sizeof(MPI_Status),
+                                mpi_errno, "intra_start_status");
+        }
 
-        if (local && win_ptr->shm_allocated == TRUE) {
+        intra_cnt = 0;
+        for (i = 0; i < win_ptr->start_grp_size; i++) {
             MPID_Request *req_ptr;
+            MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
+            int src = win_ptr->start_ranks_in_win_grp[i];
 
-            MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
-            MPIDI_Comm_get_vc(win_ptr->comm_ptr, src, &target_vc);
+            if (src != rank) {
+                MPIDI_Comm_get_vc(comm_ptr, rank, &orig_vc);
+                MPIDI_Comm_get_vc(comm_ptr, src, &target_vc);
 
-            if (orig_vc->node_id == target_vc->node_id) {
                 mpi_errno = MPID_Irecv(NULL, 0, MPI_INT, src, SYNC_POST_TAG,
                                        comm_ptr, MPID_CONTEXT_INTRA_PT2PT, &req_ptr);
-                if (mpi_errno)
-                    MPIU_ERR_POP(mpi_errno);
-                req[j++] = req_ptr->handle;
-            }
-        }
-        else if (!local) {
-            MPID_Request *req_ptr;
+                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-            MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
-            MPIDI_Comm_get_vc(win_ptr->comm_ptr, src, &target_vc);
-
-            if (win_ptr->shm_allocated != TRUE ||
-                orig_vc->node_id != target_vc->node_id) {
-                mpi_errno = MPID_Irecv(NULL, 0, MPI_INT, src, SYNC_POST_TAG,
-                                       comm_ptr, MPID_CONTEXT_INTRA_PT2PT, &req_ptr);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-                req[j++] = req_ptr->handle;
+                if (win_ptr->shm_allocated == TRUE &&
+                    orig_vc->node_id == target_vc->node_id) {
+                    intra_start_req[intra_cnt++] = req_ptr->handle;
+                    win_ptr->start_req[i] = MPI_REQUEST_NULL;
+                }
+                else {
+                    win_ptr->start_req[i] = req_ptr->handle;
+                    inter_cnt++;
+                }
+            }
+            else {
+                win_ptr->start_req[i] = MPI_REQUEST_NULL;
             }
         }
-    }
 
-    if (j) {
-        mpi_errno = MPIR_Waitall_impl(j, req, status);
-        if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS)
-            MPIU_ERR_POP(mpi_errno);
-        /* --BEGIN ERROR HANDLING-- */
-        if (mpi_errno == MPI_ERR_IN_STATUS) {
-            for (i = 0; i < j; i++) {
-                if (status[i].MPI_ERROR != MPI_SUCCESS) {
-                    mpi_errno = status[i].MPI_ERROR;
-                    MPIU_ERR_POP(mpi_errno);
+        /* for targets on SHM, wait until their IRECVs have finished */
+        if (intra_cnt) {
+            mpi_errno = MPIR_Waitall_impl(intra_cnt, intra_start_req, intra_start_status);
+            if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS)
+                MPIU_ERR_POP(mpi_errno);
+            /* --BEGIN ERROR HANDLING-- */
+            if (mpi_errno == MPI_ERR_IN_STATUS) {
+                for (i = 0; i < intra_cnt; i++) {
+                    if (intra_start_status[i].MPI_ERROR != MPI_SUCCESS) {
+                        mpi_errno = intra_start_status[i].MPI_ERROR;
+                        MPIU_ERR_POP(mpi_errno);
+                    }
                 }
             }
+            /* --END ERROR HANDLING-- */
         }
-        /* --END ERROR HANDLING-- */
-    }
-
-  fn_fail:
-    MPIU_CHKLMEM_FREEALL();
-    MPIDI_RMA_FUNC_EXIT(MPID_STATE_RECV_POST_MSGS);
-    return mpi_errno;
-}
 
-static int fill_ranks_in_win_grp(MPID_Win * win_ptr, int *ranks_in_win_grp)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int i, *ranks_in_start_grp;
-    MPID_Group *win_grp_ptr;
-    MPIU_CHKLMEM_DECL(2);
-    MPIDI_STATE_DECL(MPID_STATE_FILL_RANKS_IN_WIN_GRP);
-
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_FILL_RANKS_IN_WIN_GRP);
-
-    MPIU_CHKLMEM_MALLOC(ranks_in_start_grp, int *, win_ptr->start_group_ptr->size * sizeof(int),
-                        mpi_errno, "ranks_in_start_grp");
-
-    for (i = 0; i < win_ptr->start_group_ptr->size; i++)
-        ranks_in_start_grp[i] = i;
-
-    mpi_errno = MPIR_Comm_group_impl(win_ptr->comm_ptr, &win_grp_ptr);
-    if (mpi_errno) {
-        MPIU_ERR_POP(mpi_errno);
+        if (win_ptr->shm_allocated == TRUE) {
+            /* Ensure ordering of load/store operations */
+            OPA_read_write_barrier();
+        }
     }
 
-    mpi_errno =
-        MPIR_Group_translate_ranks_impl(win_ptr->start_group_ptr, win_ptr->start_group_ptr->size,
-                                        ranks_in_start_grp, win_grp_ptr, ranks_in_win_grp);
-    if (mpi_errno)
-        MPIU_ERR_POP(mpi_errno);
+    win_ptr->states.access_state = MPIDI_RMA_PSCW_ISSUED;
+    num_active_issued_win++;
 
-    mpi_errno = MPIR_Group_free_impl(win_grp_ptr);
-    if (mpi_errno)
-        MPIU_ERR_POP(mpi_errno);
+    MPIU_Assert(win_ptr->posted_ops_cnt == 0);
+    MPIU_Assert(win_ptr->active_req_cnt == 0);
 
-  fn_fail:
-    MPIU_CHKLMEM_FREEALL();
-    MPIDI_RMA_FUNC_EXIT(MPID_STATE_FILL_RANKS_IN_WIN_GRP);
-    return mpi_errno;
-}
-
-
-#undef FUNCNAME
-#define FUNCNAME MPIDI_Win_start
-#undef FCNAME
-#define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_Win_start(MPID_Group * group_ptr, int assert, MPID_Win * win_ptr)
-{
-    int mpi_errno = MPI_SUCCESS;
-    int *ranks_in_win_grp;
-    MPIU_CHKLMEM_DECL(1);
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_START);
-
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_START);
-
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_NONE &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_POST,
-                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
-
-    /* Track access epoch state */
-    if (win_ptr->epoch_state == MPIDI_EPOCH_POST)
-        win_ptr->epoch_state = MPIDI_EPOCH_PSCW;
-    else
-        win_ptr->epoch_state = MPIDI_EPOCH_START;
-
-    /* Even though we would want to reset the fence counter to keep
-     * the user from using the previous fence to mark the beginning of
-     * a fence epoch if he switched from fence to lock-unlock
-     * synchronization, we cannot do this because fence_issued must be
-     * updated collectively */
-
-    win_ptr->start_group_ptr = group_ptr;
-    MPIR_Group_add_ref(group_ptr);
-    win_ptr->start_assert = assert;
-
-    /* wait for messages from local processes */
-    MPIU_CHKLMEM_MALLOC(ranks_in_win_grp, int *, win_ptr->start_group_ptr->size * sizeof(int),
-                        mpi_errno, "ranks_in_win_grp");
-
-    mpi_errno = fill_ranks_in_win_grp(win_ptr, ranks_in_win_grp);
-    if (mpi_errno)
-        MPIU_ERR_POP(mpi_errno);
-
-    /* If MPI_MODE_NOCHECK was not specified, we need to check if
-       Win_post was called on the target processes on SHM window.
-       Wait for a 0-byte sync message from each target process. */
-    if ((win_ptr->start_assert & MPI_MODE_NOCHECK) == 0)
-    {
-        mpi_errno = recv_post_msgs(win_ptr, ranks_in_win_grp, 1);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-    }
-
-    /* Ensure ordering of load/store operations */
-    if (win_ptr->shm_allocated == TRUE) {
-        OPA_read_write_barrier();
-    }
-
-  fn_fail:
+ fn_exit:
     MPIU_CHKLMEM_FREEALL();
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_START);
     return mpi_errno;
+ fn_fail:
+    MPIU_CHKPMEM_REAP();
+    goto fn_exit;
 }
 
 
@@ -927,197 +757,98 @@ int MPIDI_Win_start(MPID_Group * group_ptr, int assert, MPID_Win * win_ptr)
 int MPIDI_Win_complete(MPID_Win * win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
-    int comm_size, *nops_to_proc, new_total_op_count;
-    int i, j, dst, total_op_count, *curr_ops_cnt;
-    MPIDI_RMA_Op_t *curr_ptr;
-    MPIDI_RMA_Ops_list_t *ops_list;
-    MPIDI_RMA_Ops_list_t *ops_list_tail;
-    MPID_Comm *comm_ptr;
-    int start_grp_size, *ranks_in_win_grp, rank;
-    int nRequest = 0;
-    int nRequestNew = 0;
-    MPIU_CHKLMEM_DECL(6);
+    int i, dst, rank = win_ptr->comm_ptr->rank;
+    int local_completed = 0, remote_completed = 0;
+    MPID_Comm *win_comm_ptr = win_ptr->comm_ptr;
+    MPIDI_RMA_Target_t *curr_target;
+    int made_progress;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_COMPLETE);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_COMPLETE);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_PSCW &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_START,
+    /* Access epochs on the same window must be disjoint. */
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_PSCW_ISSUED &&
+                        win_ptr->states.access_state != MPIDI_RMA_PSCW_GRANTED,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    /* Track access epoch state */
-    if (win_ptr->epoch_state == MPIDI_EPOCH_PSCW)
-        win_ptr->epoch_state = MPIDI_EPOCH_POST;
-    else
-        win_ptr->epoch_state = MPIDI_EPOCH_NONE;
-
-    comm_ptr = win_ptr->comm_ptr;
-    comm_size = comm_ptr->local_size;
-
-    /* Ensure ordering of load/store operations. */
-    if (win_ptr->shm_allocated == TRUE) {
-        OPA_read_write_barrier();
-    }
-
-    /* Translate the ranks of the processes in
-     * start_group to ranks in win_ptr->comm_ptr */
-
-    start_grp_size = win_ptr->start_group_ptr->size;
-
-    MPIU_CHKLMEM_MALLOC(ranks_in_win_grp, int *, start_grp_size * sizeof(int),
-                        mpi_errno, "ranks_in_win_grp");
-
-    mpi_errno = fill_ranks_in_win_grp(win_ptr, ranks_in_win_grp);
-    if (mpi_errno)
-        MPIU_ERR_POP(mpi_errno);
-
-    rank = win_ptr->comm_ptr->rank;
-
-    /* If MPI_MODE_NOCHECK was not specified, we need to check if
-     * Win_post was called on the target processes. Wait for a 0-byte sync
-     * message from each target process */
-    if ((win_ptr->start_assert & MPI_MODE_NOCHECK) == 0) {
-        /* wait for messages from non-local processes */
-        mpi_errno = recv_post_msgs(win_ptr, ranks_in_win_grp, 0);
-        if (mpi_errno)
-            MPIU_ERR_POP(mpi_errno);
-    }
-
-    /* keep track of no. of ops to each proc. Needed for knowing
-     * whether or not to decrement the completion counter. The
-     * completion counter is decremented only on the last
-     * operation. */
-
-    /* Note, active target uses the following ops list, and passive
-     * target uses win_ptr->targets[..] */
-    ops_list = &win_ptr->at_rma_ops_list;
-    ops_list_tail = &win_ptr->at_rma_ops_list_tail;
-
-    MPIU_CHKLMEM_MALLOC(nops_to_proc, int *, comm_size * sizeof(int), mpi_errno, "nops_to_proc");
-    for (i = 0; i < comm_size; i++)
-        nops_to_proc[i] = 0;
-
-    total_op_count = 0;
-    curr_ptr = MPIDI_CH3I_RMA_Ops_head(ops_list);
-    while (curr_ptr != NULL) {
-        nops_to_proc[curr_ptr->target_rank]++;
-        total_op_count++;
-        curr_ptr = curr_ptr->next;
+    if (win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED) {
+        while (win_ptr->states.access_state != MPIDI_RMA_PSCW_GRANTED) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
     }
 
-    /* We allocate a few extra requests because if there are no RMA
-     * ops to a target process, we need to send a 0-byte message just
-     * to decrement the completion counter. */
-
-    MPIU_CHKLMEM_MALLOC(curr_ops_cnt, int *, comm_size * sizeof(int), mpi_errno, "curr_ops_cnt");
-    for (i = 0; i < comm_size; i++)
-        curr_ops_cnt[i] = 0;
-
-    i = 0;
-    curr_ptr = MPIDI_CH3I_RMA_Ops_head(ops_list);
-    while (curr_ptr != NULL) {
-        MPIDI_CH3_Pkt_flags_t flags = MPIDI_CH3_PKT_FLAG_NONE;
-
-        /* The completion counter at the target is decremented only on
-         * the last RMA operation. */
-        if (curr_ops_cnt[curr_ptr->target_rank] == nops_to_proc[curr_ptr->target_rank] - 1) {
-            flags = MPIDI_CH3_PKT_FLAG_RMA_AT_COMPLETE;
+    for (i = 0; i < win_ptr->start_grp_size; i++) {
+        dst = win_ptr->start_ranks_in_win_grp[i];
+        if (dst == rank) {
+            win_ptr->at_completion_counter--;
+            MPIU_Assert(win_ptr->at_completion_counter >= 0);
+            continue;
         }
 
-        mpi_errno = MPIDI_CH3I_Issue_rma_op(curr_ptr, win_ptr, flags);
-        if (mpi_errno)
-            MPIU_ERR_POP(mpi_errno);
+        if (win_comm_ptr->local_size <= win_ptr->num_slots)
+            curr_target = win_ptr->slots[dst].target_list;
+        else {
+            curr_target = win_ptr->slots[dst % win_ptr->num_slots].target_list;
+            while (curr_target != NULL && curr_target->target_rank != dst)
+                curr_target = curr_target->next;
+        }
 
-        i++;
-        curr_ops_cnt[curr_ptr->target_rank]++;
-        /* If the request is null, we can remove it immediately */
-        if (!curr_ptr->request) {
-            MPIDI_CH3I_RMA_Ops_free_and_next(win_ptr, ops_list, ops_list_tail, &curr_ptr);
+        if (curr_target != NULL) {
+            /* set sync_flag in sync struct */
+            if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
+                curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
+                curr_target->sync.have_remote_incomplete_ops = 0;
+                curr_target->sync.outstanding_acks++;
+            }
+            curr_target->win_complete_flag = 1;
         }
         else {
-            nRequest++;
-            curr_ptr = curr_ptr->next;
-            if (nRequest > MPIR_CVAR_CH3_RMA_NREQUEST_THRESHOLD &&
-                nRequest - nRequestNew > MPIR_CVAR_CH3_RMA_NREQUEST_NEW_THRESHOLD) {
-                int nDone = 0;
-                mpi_errno = poke_progress_engine();
-                if (mpi_errno != MPI_SUCCESS)
-                    MPIU_ERR_POP(mpi_errno);
-                mpi_errno = rma_list_gc(win_ptr, ops_list, ops_list_tail, curr_ptr, &nDone);
-                if (mpi_errno != MPI_SUCCESS)
-                    MPIU_ERR_POP(mpi_errno);
-                nRequest -= nDone;
-                nRequestNew = nRequest;
-            }
+            /* FIXME: do we need to wait for remote completion? */
+            mpi_errno = send_decr_at_cnt_msg(dst, win_ptr);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
         }
     }
 
-    /* If the start_group included some processes that did not end up
-     * becoming targets of  RMA operations from this process, we need
-     * to send a dummy message to those processes just to decrement
-     * the completion counter */
+    /* issue out all operations */
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-    j = i;
-    new_total_op_count = total_op_count;
-    for (i = 0; i < start_grp_size; i++) {
-        dst = ranks_in_win_grp[i];
-        if (dst == rank) {
-            /* FIXME: MT: this has to be done atomically */
-            win_ptr->at_completion_counter -= 1;
+    /* wait until all slots are empty */
+    do {
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
+                                                   &remote_completed);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (!remote_completed) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
         }
-        else if (nops_to_proc[dst] == 0) {
-            MPIDI_CH3_Pkt_t upkt;
-            MPIDI_CH3_Pkt_put_t *put_pkt = &upkt.put;
-            MPIDI_VC_t *vc;
-            MPID_Request *request;
-
-            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
-            put_pkt->flags = MPIDI_CH3_PKT_FLAG_RMA_AT_COMPLETE;
-            put_pkt->addr = NULL;
-            put_pkt->count = 0;
-            put_pkt->datatype = MPI_INT;
-            put_pkt->target_win_handle = win_ptr->all_win_handles[dst];
-            put_pkt->source_win_handle = win_ptr->handle;
-
-            MPIDI_Comm_get_vc_set_active(comm_ptr, dst, &vc);
-
-            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-            mpi_errno = MPIDI_CH3_iStartMsg(vc, put_pkt, sizeof(*put_pkt), &request);
-            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-            if (mpi_errno != MPI_SUCCESS) {
-                MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-            }
-            /* In the unlikely event that a request is returned (the message
-             * is not sent yet), add it to the list of pending operations */
-            if (request) {
-                MPIDI_RMA_Op_t *new_ptr = NULL;
+    } while (!remote_completed);
 
-                mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(win_ptr, ops_list, ops_list_tail, &new_ptr);
-                if (mpi_errno) {
-                    MPIU_ERR_POP(mpi_errno);
-                }
+    /* Cleanup all targets on this window. */
+    mpi_errno = MPIDI_CH3I_RMA_Cleanup_targets_win(win_ptr);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-                new_ptr->request = request;
-            }
-            j++;
-            new_total_op_count++;
-        }
-    }
+    MPIU_Assert(win_ptr->non_empty_slots == 0);
 
-    if (new_total_op_count) {
-        mpi_errno = rma_list_complete(win_ptr, ops_list, ops_list_tail);
-        if (mpi_errno != MPI_SUCCESS)
-            MPIU_ERR_POP(mpi_errno);
+    /* Ensure ordering of load/store operations. */
+    if (win_ptr->shm_allocated == TRUE) {
+        OPA_read_write_barrier();
     }
 
-    MPIU_Assert(MPIDI_CH3I_RMA_Ops_isempty(ops_list));
+    /* free start group stored in window */
+    MPIU_Free(win_ptr->start_ranks_in_win_grp);
+    win_ptr->start_ranks_in_win_grp = NULL;
+
+    win_ptr->posted_ops_cnt = 0;
+    MPIU_Assert(win_ptr->active_req_cnt == 0);
+    MPIU_Assert(win_ptr->start_req == NULL);
 
-    /* free the group stored in window */
-    MPIR_Group_release(win_ptr->start_group_ptr);
-    win_ptr->start_group_ptr = NULL;
+    win_ptr->states.access_state = MPIDI_RMA_NONE;
 
   fn_exit:
-    MPIU_CHKLMEM_FREEALL();
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_COMPLETE);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
@@ -1135,37 +866,18 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
 int MPIDI_Win_wait(MPID_Win * win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
-
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_WAIT);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_WAIT);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_PSCW &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_POST,
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.exposure_state != MPIDI_RMA_PSCW_EXPO,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    /* Track access epoch state */
-    if (win_ptr->epoch_state == MPIDI_EPOCH_PSCW)
-        win_ptr->epoch_state = MPIDI_EPOCH_START;
-    else
-        win_ptr->epoch_state = MPIDI_EPOCH_NONE;
-
     /* wait for all operations from other processes to finish */
-    if (win_ptr->at_completion_counter) {
-        MPID_Progress_state progress_state;
-
-        MPID_Progress_start(&progress_state);
-        while (win_ptr->at_completion_counter) {
-            mpi_errno = MPID_Progress_wait(&progress_state);
-            /* --BEGIN ERROR HANDLING-- */
-            if (mpi_errno != MPI_SUCCESS) {
-                MPID_Progress_end(&progress_state);
-                MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_WAIT);
-                return mpi_errno;
-            }
-            /* --END ERROR HANDLING-- */
-        }
-        MPID_Progress_end(&progress_state);
+    while (win_ptr->at_completion_counter) {
+        mpi_errno = wait_progress_engine();
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     /* Ensure ordering of load/store operations. */
@@ -1173,6 +885,8 @@ int MPIDI_Win_wait(MPID_Win * win_ptr)
         OPA_read_write_barrier();
     }
 
+    win_ptr->states.exposure_state = MPIDI_RMA_NONE;
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_WAIT);
     return mpi_errno;
@@ -1190,33 +904,26 @@ int MPIDI_Win_wait(MPID_Win * win_ptr)
 int MPIDI_Win_test(MPID_Win * win_ptr, int *flag)
 {
     int mpi_errno = MPI_SUCCESS;
-
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_TEST);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_TEST);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_PSCW &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_POST,
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.exposure_state != MPIDI_RMA_PSCW_EXPO,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
     mpi_errno = MPID_Progress_test();
     if (mpi_errno != MPI_SUCCESS) {
-        MPIU_ERR_POP(mpi_errno);
+	MPIU_ERR_POP(mpi_errno);
     }
 
     *flag = (win_ptr->at_completion_counter) ? 0 : 1;
-
     if (*flag) {
-        /* Track access epoch state */
-        if (win_ptr->epoch_state == MPIDI_EPOCH_PSCW)
-            win_ptr->epoch_state = MPIDI_EPOCH_START;
-        else
-            win_ptr->epoch_state = MPIDI_EPOCH_NONE;
-
         /* Ensure ordering of load/store operations. */
         if (win_ptr->shm_allocated == TRUE) {
             OPA_read_write_barrier();
         }
+
+        win_ptr->states.exposure_state = MPIDI_RMA_NONE;
     }
 
   fn_exit:
@@ -1235,90 +942,86 @@ int MPIDI_Win_test(MPID_Win * win_ptr, int *flag)
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_Win_lock(int lock_type, int dest, int assert, MPID_Win * win_ptr)
 {
+    int made_progress = 0;
+    int shm_target = FALSE;
+    int rank = win_ptr->comm_ptr->rank;
+    MPIDI_RMA_Target_t *target = NULL;
+    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
     int mpi_errno = MPI_SUCCESS;
-    struct MPIDI_Win_target_state *target_state;
-    MPIDI_VC_t *orig_vc, *target_vc;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_LOCK);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_LOCK);
 
-    MPIU_UNREFERENCED_ARG(assert);
-
-    /* Even though we would want to reset the fence counter to keep
-     * the user from using the previous fence to mark the beginning of
-     * a fence epoch if he switched from fence to lock-unlock
-     * synchronization, we cannot do this because fence_issued must be
-     * updated collectively */
+    /* Note that here we cannot detect whether this access epoch overlaps
+       with a FENCE access epoch (which is not allowed), because a FENCE
+       epoch may end without resetting the window state. We can only detect
+       whether this access epoch overlaps with another PSCW or Passive
+       Target access epoch. */
+    if (win_ptr->lock_epoch_count == 0) {
+        MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_NONE &&
+                            win_ptr->states.access_state != MPIDI_RMA_FENCE_ISSUED &&
+                            win_ptr->states.access_state != MPIDI_RMA_FENCE_GRANTED,
+                            mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+    }
+    else {
+        MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_NONE &&
+                            win_ptr->states.access_state != MPIDI_RMA_FENCE_ISSUED &&
+                            win_ptr->states.access_state != MPIDI_RMA_FENCE_GRANTED &&
+                            win_ptr->states.access_state != MPIDI_RMA_PER_TARGET,
+                            mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+    }
 
-    if (dest == MPI_PROC_NULL)
-        goto fn_exit;
+    if (dest != MPI_PROC_NULL) {
+        /* check if we lock the same target window more than once. */
+        mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, dest, &target);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        MPIU_ERR_CHKANDJUMP(target != NULL, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+    }
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_NONE &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_LOCK,
-                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+    /* Error handling is finished. */
 
-    target_state = &win_ptr->targets[dest];
+    if (win_ptr->lock_epoch_count == 0) {
+        win_ptr->states.access_state = MPIDI_RMA_PER_TARGET;
+        num_passive_win++;
+    }
+    win_ptr->lock_epoch_count++;
 
-    /* Check if a lock has already been issued */
-    MPIU_ERR_CHKANDJUMP(target_state->remote_lock_state != MPIDI_CH3_WIN_LOCK_NONE,
-                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+    if (dest == MPI_PROC_NULL)
+        goto fn_exit;
 
-    /* Track access epoch state */
-    if (win_ptr->epoch_state != MPIDI_EPOCH_LOCK_ALL) {
-        win_ptr->epoch_count++;
-        win_ptr->epoch_state = MPIDI_EPOCH_LOCK;
+    if (win_ptr->shm_allocated == TRUE) {
+        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
+        MPIDI_Comm_get_vc(win_ptr->comm_ptr, dest, &target_vc);
+        if (orig_vc->node_id == target_vc->node_id)
+            shm_target = TRUE;
     }
 
-    target_state->remote_lock_state = MPIDI_CH3_WIN_LOCK_CALLED;
-    target_state->remote_lock_mode = lock_type;
-    target_state->remote_lock_assert = assert;
+    /* Create a new target. */
+    mpi_errno = MPIDI_CH3I_Win_create_target(win_ptr, dest, &target);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-    if (dest == win_ptr->comm_ptr->rank) {
-        /* The target is this process itself. We must block until the lock
-         * is acquired.  Once it is acquired, local puts, gets, accumulates
-         * will be done directly without queueing. */
-        mpi_errno = acquire_local_lock(win_ptr, lock_type);
-        if (mpi_errno) {
+    /* Store lock_state (CALLED/ISSUED/GRANTED), lock_type (SHARED/EXCLUSIVE),
+       lock_mode (MODE_NOCHECK). */
+    if (assert & MPI_MODE_NOCHECK)
+        target->access_state = MPIDI_RMA_LOCK_GRANTED;
+    else
+        target->access_state = MPIDI_RMA_LOCK_CALLED;
+    target->lock_type = lock_type;
+    target->lock_mode = assert;
+
+    /* If the destination is this process or a process on SHM, acquire the
+       lock and wait until it is granted. */
+    if (!(assert & MPI_MODE_NOCHECK) && (dest == rank || shm_target)) {
+        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, dest, &made_progress);
+        if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
-        }
-    }
-    else if (win_ptr->shm_allocated == TRUE) {
-        /* Lock must be taken immediately for shared memory windows because of
-         * load/store access */
-
-        if (win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
-            /* check if target is local and shared memory is allocated on window,
-             * if so, we directly send lock request and wait for lock reply. */
-
-            /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
-             * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
-             * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
-             * which is only set to TRUE when SHM region is allocated in nemesis.
-             * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
-             */
-            MPIDI_Comm_get_vc(win_ptr->comm_ptr, win_ptr->comm_ptr->rank, &orig_vc);
-            MPIDI_Comm_get_vc(win_ptr->comm_ptr, dest, &target_vc);
-        }
-
-        if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
-            orig_vc->node_id == target_vc->node_id) {
-            mpi_errno = send_lock_msg(dest, lock_type, win_ptr);
-            if (mpi_errno) {
-                MPIU_ERR_POP(mpi_errno);
-            }
 
-            mpi_errno = wait_for_lock_granted(win_ptr, dest);
-            if (mpi_errno) {
+        while (target->access_state != MPIDI_RMA_LOCK_GRANTED) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
-            }
         }
     }
-    else if (MPIR_CVAR_CH3_RMA_LOCK_IMMED && ((assert & MPI_MODE_NOCHECK) == 0)) {
-        /* TODO: Make this mode of operation available through an assert
-         * argument or info key. */
-        mpi_errno = send_lock_msg(dest, lock_type, win_ptr);
-        MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
-    }
 
     /* Ensure ordering of load/store operations. */
     if (win_ptr->shm_allocated == TRUE) {
@@ -1338,172 +1041,95 @@ int MPIDI_Win_lock(int lock_type, int dest, int assert, MPID_Win * win_ptr)
 #define FUNCNAME MPIDI_Win_unlock
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_Win_unlock(int dest, MPID_Win * win_ptr)
+int MPIDI_Win_unlock(int dest, MPID_Win *win_ptr)
 {
+    int made_progress = 0;
+    int local_completed = 0, remote_completed = 0;
+    MPIDI_RMA_Target_t *target = NULL;
+    enum MPIDI_RMA_sync_types sync_flag;
     int mpi_errno = MPI_SUCCESS;
-    int single_op_opt = 0;
-    MPIDI_RMA_Op_t *rma_op;
-    int wait_for_rma_done_pkt = 0;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_UNLOCK);
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_UNLOCK);
 
-    if (dest == MPI_PROC_NULL)
-        goto fn_exit;
-
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_LOCK &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_LOCK_ALL,
-                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_UNLOCK);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->targets[dest].remote_lock_state == MPIDI_CH3_WIN_LOCK_NONE,
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_PER_TARGET,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    /* Track access epoch state */
-    if (win_ptr->epoch_state == MPIDI_EPOCH_LOCK) {
-        win_ptr->epoch_count--;
-        if (win_ptr->epoch_count == 0)
-            win_ptr->epoch_state = MPIDI_EPOCH_NONE;
-    }
-
     /* Ensure ordering of load/store operations. */
-    if (win_ptr->shm_allocated == TRUE) {
+    if (win_ptr->shm_allocated) {
         OPA_read_write_barrier();
     }
 
-    if (dest == win_ptr->comm_ptr->rank) {
-        /* local lock. release the lock on the window, grant the next one
-         * in the queue, and return. */
-        MPIU_Assert(MPIDI_CH3I_RMA_Ops_isempty(&win_ptr->targets[dest].rma_ops_list));
-
-        /* NOTE: We don't need to signal completion here becase a thread in the
-         * same processes cannot lock the window again while it is already
-         * locked. */
-        mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
-        if (mpi_errno != MPI_SUCCESS) {
-            MPIU_ERR_POP(mpi_errno);
-        }
-        win_ptr->targets[dest].remote_lock_state = MPIDI_CH3_WIN_LOCK_NONE;
+    if (dest == MPI_PROC_NULL)
+        goto finish_unlock;
+
+    /* When the process tries to acquire the lock on itself, it does not
+       go through the progress engine. Therefore, it is possible that
+       one process always grants the lock to itself but never processes
+       events coming from other processes. This may cause deadlock in
+       applications where program execution on the target process depends
+       on events arriving from other processes. Here we poke the progress
+       engine once to avoid this issue. */
+    mpi_errno = poke_progress_engine();
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
-        mpi_errno = MPID_Progress_poke();
-        if (mpi_errno != MPI_SUCCESS) {
+    /* Find or recreate target. */
+    mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, dest, &target);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+    if (target == NULL) {
+        mpi_errno = MPIDI_CH3I_Win_create_target(win_ptr, dest, &target);
+        if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
-        }
-        goto fn_exit;
+        target->access_state = MPIDI_RMA_LOCK_GRANTED;
     }
 
-    rma_op = MPIDI_CH3I_RMA_Ops_head(&win_ptr->targets[dest].rma_ops_list);
-
-    /* Lock was called, but the lock was not requested and there are no ops to
-     * perform.  Do nothing and return. */
-    if (rma_op == NULL && win_ptr->targets[dest].remote_lock_state == MPIDI_CH3_WIN_LOCK_CALLED) {
-        win_ptr->targets[dest].remote_lock_state = MPIDI_CH3_WIN_LOCK_NONE;
-        goto fn_exit;
-    }
-
-    /* TODO: MPI-3: Add lock->cas/fop/gacc->unlock optimization.  */
-    /* TODO: MPI-3: Add lock_all->op optimization. */
-    /* LOCK-OP-UNLOCK Optimization -- This optimization can't be used if we
-     * have already requested the lock. */
-    if (MPIR_CVAR_CH3_RMA_MERGE_LOCK_OP_UNLOCK &&
-        win_ptr->targets[dest].remote_lock_state == MPIDI_CH3_WIN_LOCK_CALLED &&
-        rma_op && rma_op->next == NULL /* There is only one op */  &&
-        rma_op->pkt.type != MPIDI_CH3_PKT_CAS &&
-        rma_op->pkt.type != MPIDI_CH3_PKT_FOP && rma_op->pkt.type != MPIDI_CH3_PKT_GET_ACCUM) {
-        /* Single put, get, or accumulate between the lock and unlock. If it
-         * is of small size and predefined datatype at the target, we
-         * do an optimization where the lock and the RMA operation are
-         * sent in a single packet. Otherwise, we send a separate lock
-         * request first. */
-        MPI_Aint type_size;
-        MPIDI_VC_t *vc;
-        MPIDI_RMA_Op_t *curr_op = rma_op;
-        MPI_Datatype target_datatype;
-
-        MPIDI_Comm_get_vc_set_active(win_ptr->comm_ptr, dest, &vc);
-
-        MPID_Datatype_get_size_macro(curr_op->origin_datatype, type_size);
-
-        /* msg_sz typically = 65480 */
-        MPIDI_CH3_PKT_RMA_GET_TARGET_DATATYPE(curr_op->pkt, target_datatype, mpi_errno);
-        if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
-            (type_size * curr_op->origin_count <= vc->eager_max_msg_sz)) {
-            single_op_opt = 1;
-            /* Set the lock granted flag to 1 */
-            win_ptr->targets[dest].remote_lock_state = MPIDI_CH3_WIN_LOCK_GRANTED;
-            if (curr_op->pkt.type == MPIDI_CH3_PKT_GET) {
-                mpi_errno = send_lock_get(win_ptr, dest);
-                wait_for_rma_done_pkt = 0;
-            }
-            else {
-                mpi_errno = send_lock_put_or_acc(win_ptr, dest);
-                wait_for_rma_done_pkt = 1;
-            }
-            if (mpi_errno) {
-                MPIU_ERR_POP(mpi_errno);
-            }
-        }
+    /* Set sync_flag in sync struct. */
+    if (target->lock_mode & MPI_MODE_NOCHECK)
+        sync_flag = MPIDI_RMA_SYNC_FLUSH;
+    else
+        sync_flag = MPIDI_RMA_SYNC_UNLOCK;
+    if (target->sync.sync_flag < sync_flag) {
+        target->sync.sync_flag = sync_flag;
+        target->sync.have_remote_incomplete_ops = 0;
+        target->sync.outstanding_acks++;
     }
 
-    if (single_op_opt == 0) {
-
-        /* Send a lock packet over to the target and wait for the lock_granted
-         * reply. If the user gave MODE_NOCHECK, we will piggyback the lock
-         * request on the first RMA op.  Then do all the RMA ops. */
-
-        if ((win_ptr->targets[dest].remote_lock_assert & MPI_MODE_NOCHECK) == 0) {
-            if (win_ptr->targets[dest].remote_lock_state == MPIDI_CH3_WIN_LOCK_CALLED) {
-                mpi_errno = send_lock_msg(dest, win_ptr->targets[dest].remote_lock_mode, win_ptr);
-                if (mpi_errno) {
-                    MPIU_ERR_POP(mpi_errno);
-                }
-            }
-        }
+    /* Issue out all operations. */
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, dest,
+                                                    &made_progress);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
-        if (win_ptr->targets[dest].remote_lock_state == MPIDI_CH3_WIN_LOCK_REQUESTED) {
-            mpi_errno = wait_for_lock_granted(win_ptr, dest);
-            if (mpi_errno) {
+    /* Wait for remote completion. */
+    do {
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, target,
+                                                      &local_completed,
+                                                      &remote_completed);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+        if (!remote_completed) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
-            }
         }
+    } while (!remote_completed);
 
-        /* Now do all the RMA operations */
-        mpi_errno = do_passive_target_rma(win_ptr, dest, &wait_for_rma_done_pkt,
-                                          MPIDI_CH3_PKT_FLAG_RMA_UNLOCK);
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
-        }
-    }
+    /* Cleanup the target. */
+    mpi_errno = MPIDI_CH3I_RMA_Cleanup_single_target(win_ptr, target);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
-    /* If the lock is a shared lock or we have done the single op
-     * optimization, we need to wait until the target informs us that
-     * all operations are done on the target.  This ensures that third-
-     * party communication can be done safely.  */
-    if (wait_for_rma_done_pkt == 1) {
-        /* wait until the "pt rma done" packet is received from the
-         * target. This packet resets the remote_lock_state flag back to
-         * NONE. */
-
-        /* poke the progress engine until remote_lock_state flag is reset to NONE */
-        if (win_ptr->targets[dest].remote_lock_state != MPIDI_CH3_WIN_LOCK_NONE) {
-            MPID_Progress_state progress_state;
+ finish_unlock:
+    win_ptr->posted_ops_cnt = 0;
+    MPIU_Assert(win_ptr->active_req_cnt == 0);
 
-            MPID_Progress_start(&progress_state);
-            while (win_ptr->targets[dest].remote_lock_state != MPIDI_CH3_WIN_LOCK_NONE) {
-                mpi_errno = MPID_Progress_wait(&progress_state);
-                /* --BEGIN ERROR HANDLING-- */
-                if (mpi_errno != MPI_SUCCESS) {
-                    MPID_Progress_end(&progress_state);
-                    MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**winnoprogress");
-                }
-                /* --END ERROR HANDLING-- */
-            }
-            MPID_Progress_end(&progress_state);
-        }
+    win_ptr->lock_epoch_count--;
+    if (win_ptr->lock_epoch_count == 0) {
+        win_ptr->states.access_state = MPIDI_RMA_NONE;
+        num_passive_win--;
+        MPIU_Assert(num_passive_win >= 0);
     }
-    else {
-        win_ptr->targets[dest].remote_lock_state = MPIDI_CH3_WIN_LOCK_NONE;
-    }
-
-    MPIU_Assert(MPIDI_CH3I_RMA_Ops_isempty(&win_ptr->targets[dest].rma_ops_list));
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_UNLOCK);
@@ -1521,40 +1147,65 @@ int MPIDI_Win_unlock(int dest, MPID_Win * win_ptr)
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_Win_flush_all(MPID_Win * win_ptr)
 {
+    int i, made_progress = 0;
+    int local_completed = 0, remote_completed = 0;
+    MPIDI_RMA_Target_t *curr_target = NULL;
     int mpi_errno = MPI_SUCCESS;
-    int i;
     MPIDI_STATE_DECL(MPIDI_STATE_MPIDI_WIN_FLUSH_ALL);
 
     MPIDI_RMA_FUNC_ENTER(MPIDI_STATE_MPIDI_WIN_FLUSH_ALL);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_LOCK &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_LOCK_ALL,
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_PER_TARGET &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_CALLED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_ISSUED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_GRANTED,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    /* FIXME: Performance -- we should not process the ops separately.
-     * Ideally, we should be able to use the same infrastructure that's used by
-     * active target to complete all operations. */
+    /* Ensure ordering of load/store operations. */
+    if (win_ptr->shm_allocated == TRUE) {
+        OPA_read_write_barrier();
+    }
 
-    /* Note: Local RMA calls don't poke the progress engine.  This routine
-     * should poke the progress engine when the local target is flushed to help
-     * make asynchronous progress.  Currently this is handled by Win_flush().
-     */
-    for (i = 0; i < MPIR_Comm_size(win_ptr->comm_ptr); i++) {
-        if (MPIDI_CH3I_RMA_Ops_head(&win_ptr->targets[i].rma_ops_list) == NULL)
-            continue;
-        if (win_ptr->targets[i].remote_lock_state != MPIDI_CH3_WIN_LOCK_NONE) {
-            mpi_errno = win_ptr->RMAFns.Win_flush(i, win_ptr);
-            if (mpi_errno != MPI_SUCCESS) {
-                MPIU_ERR_POP(mpi_errno);
+    /* When the process tries to acquire the lock on itself, it does not
+       go through the progress engine. Therefore, it is possible that
+       one process always grants the lock to itself but never processes
+       events coming from other processes. This may cause deadlock in
+       applications where program execution on the target process depends
+       on events arriving from other processes. Here we poke the progress
+       engine once to avoid such an issue.  */
+    mpi_errno = poke_progress_engine();
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+
+    /* Set sync_flag in sync struct. */
+    for (i = 0; i < win_ptr->num_slots; i++) {
+        curr_target = win_ptr->slots[i].target_list;
+        while (curr_target != NULL) {
+            if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
+                curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
+                curr_target->sync.have_remote_incomplete_ops = 0;
+                curr_target->sync.outstanding_acks++;
             }
+            curr_target = curr_target->next;
         }
     }
 
-    /* Ensure that all shared memory operations are flushed out.  The memory
-     * barriers in the flush are not sufficient since we skip calling flush
-     * when all operations are already completed. */
-    if (win_ptr->shm_allocated == TRUE)
-        OPA_read_write_barrier();
+    /* Issue out all operations. */
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+
+    /* Wait for remote completion. */
+    do {
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
+                                                   &remote_completed);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (!remote_completed) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
+    } while (!remote_completed);
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPIDI_STATE_MPIDI_WIN_FLUSH_ALL);
@@ -1570,122 +1221,82 @@ int MPIDI_Win_flush_all(MPID_Win * win_ptr)
 #define FUNCNAME MPIDI_Win_flush
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_Win_flush(int rank, MPID_Win * win_ptr)
+int MPIDI_Win_flush(int dest, MPID_Win *win_ptr)
 {
+    int made_progress = 0;
+    int local_completed = 0, remote_completed = 0;
+    int rank = win_ptr->comm_ptr->rank;
+    MPIDI_RMA_Target_t *target = NULL;
     int mpi_errno = MPI_SUCCESS;
-    int wait_for_rma_done_pkt = 0;
-    MPIDI_RMA_Op_t *rma_op;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FLUSH);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FLUSH);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_LOCK &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_LOCK_ALL,
-                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
-
-    /* Check if win_lock was called */
-    MPIU_ERR_CHKANDJUMP(win_ptr->targets[rank].remote_lock_state == MPIDI_CH3_WIN_LOCK_NONE,
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_PER_TARGET &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_CALLED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_ISSUED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_GRANTED,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    /* Ensure ordering of read/write operations */
-    if (win_ptr->shm_allocated == TRUE) {
+    /* Ensure ordering of load/store operations. */
+    if (win_ptr->shm_allocated) {
         OPA_read_write_barrier();
     }
 
-    /* Local flush: ops are performed immediately on the local process */
-    if (rank == win_ptr->comm_ptr->rank) {
-        MPIU_Assert(win_ptr->targets[rank].remote_lock_state == MPIDI_CH3_WIN_LOCK_GRANTED);
-        MPIU_Assert(MPIDI_CH3I_RMA_Ops_isempty(&win_ptr->targets[rank].rma_ops_list));
+    /* When the process tries to acquire the lock on itself, it does not
+       go through the progress engine. Therefore, it is possible that
+       one process always grants the lock to itself but never processes
+       events coming from other processes. This may cause deadlock in
+       applications where program execution on the target process depends
+       on events arriving from other processes. Here we poke the progress
+       engine once to avoid such an issue.  */
+    mpi_errno = poke_progress_engine();
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
-        /* If flush is used as a part of polling for incoming data, we can
-         * deadlock, since local RMA calls never poke the progress engine.  So,
-         * make extra progress here to avoid this problem. */
-        mpi_errno = MPIDI_CH3_Progress_poke();
-        if (mpi_errno)
-            MPIU_ERR_POP(mpi_errno);
+    if (rank == dest)
         goto fn_exit;
-    }
-
-    /* NOTE: All flush and req-based operations are currently implemented in
-     * terms of MPIDI_Win_flush.  When this changes, those operations will also
-     * need to insert this read/write memory fence for shared memory windows. */
-
-    rma_op = MPIDI_CH3I_RMA_Ops_head(&win_ptr->targets[rank].rma_ops_list);
 
-    /* If there is no activity at this target (e.g. lock-all was called, but we
-     * haven't communicated with this target), don't do anything. */
-    if (win_ptr->targets[rank].remote_lock_state == MPIDI_CH3_WIN_LOCK_CALLED && rma_op == NULL) {
-        goto fn_exit;
+    if (win_ptr->shm_allocated) {
+        MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
+        MPIDI_Comm_get_vc(win_ptr->comm_ptr, dest, &target_vc);
+        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
+        if (orig_vc->node_id == target_vc->node_id)
+            goto fn_exit;
     }
 
-    /* MT: If another thread is performing a flush, wait for them to finish. */
-    if (win_ptr->targets[rank].remote_lock_state == MPIDI_CH3_WIN_LOCK_FLUSH) {
-        MPID_Progress_state progress_state;
-
-        MPID_Progress_start(&progress_state);
-        while (win_ptr->targets[rank].remote_lock_state != MPIDI_CH3_WIN_LOCK_GRANTED) {
-            mpi_errno = MPID_Progress_wait(&progress_state);
-            /* --BEGIN ERROR HANDLING-- */
-            if (mpi_errno != MPI_SUCCESS) {
-                MPID_Progress_end(&progress_state);
-                MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**winnoprogress");
-            }
-            /* --END ERROR HANDLING-- */
-        }
-        MPID_Progress_end(&progress_state);
-    }
-
-    /* Send a lock packet over to the target, wait for the lock_granted
-     * reply, and perform the RMA ops. */
-
-    if (win_ptr->targets[rank].remote_lock_state == MPIDI_CH3_WIN_LOCK_CALLED) {
-        mpi_errno = send_lock_msg(rank, win_ptr->targets[rank].remote_lock_mode, win_ptr);
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
-        }
-    }
+    mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, dest, &target);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+    if (target == NULL)
+        goto fn_exit;
 
-    if (win_ptr->targets[rank].remote_lock_state != MPIDI_CH3_WIN_LOCK_GRANTED) {
-        mpi_errno = wait_for_lock_granted(win_ptr, rank);
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
-        }
+    /* Set sync_flag in sync struct. */
+    if (target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
+        target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
+        target->sync.have_remote_incomplete_ops = 0;
+        target->sync.outstanding_acks++;
     }
 
-    win_ptr->targets[rank].remote_lock_state = MPIDI_CH3_WIN_LOCK_FLUSH;
-    mpi_errno = do_passive_target_rma(win_ptr, rank, &wait_for_rma_done_pkt,
-                                      MPIDI_CH3_PKT_FLAG_RMA_FLUSH);
-    if (mpi_errno) {
+    /* Issue out all operations. */
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, dest,
+                                                    &made_progress);
+    if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
-    }
-
-    /* If the lock is a shared lock or we have done the single op optimization,
-     * we need to wait until the target informs us that all operations are done
-     * on the target.  This ensures that third-party communication can be done
-     * safely.  */
-    if (wait_for_rma_done_pkt == 1) {
-        /* wait until the "pt rma done" packet is received from the target.
-         * This packet resets the remote_lock_state flag. */
-
-        if (win_ptr->targets[rank].remote_lock_state != MPIDI_CH3_WIN_LOCK_GRANTED) {
-            MPID_Progress_state progress_state;
 
-            MPID_Progress_start(&progress_state);
-            while (win_ptr->targets[rank].remote_lock_state != MPIDI_CH3_WIN_LOCK_GRANTED) {
-                mpi_errno = MPID_Progress_wait(&progress_state);
-                /* --BEGIN ERROR HANDLING-- */
-                if (mpi_errno != MPI_SUCCESS) {
-                    MPID_Progress_end(&progress_state);
-                    MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**winnoprogress");
-                }
-                /* --END ERROR HANDLING-- */
-            }
-            MPID_Progress_end(&progress_state);
+    /* Wait for remote completion. */
+    do {
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, target,
+                                                      &local_completed,
+                                                      &remote_completed);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+        if (!remote_completed) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
         }
-    }
-    else {
-        win_ptr->targets[rank].remote_lock_state = MPIDI_CH3_WIN_LOCK_GRANTED;
-    }
+    } while (!remote_completed);
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FLUSH);
@@ -1701,27 +1312,75 @@ int MPIDI_Win_flush(int rank, MPID_Win * win_ptr)
 #define FUNCNAME MPIDI_Win_flush_local
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_Win_flush_local(int rank, MPID_Win * win_ptr)
+int MPIDI_Win_flush_local(int dest, MPID_Win * win_ptr)
 {
+    int made_progress = 0;
+    int local_completed = 0, remote_completed = 0;
+    int rank = win_ptr->comm_ptr->rank;
+    MPIDI_RMA_Target_t *target = NULL;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FLUSH_LOCAL);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FLUSH_LOCAL);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_LOCK &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_LOCK_ALL,
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_PER_TARGET &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_CALLED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_ISSUED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_GRANTED,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    /* Note: Local RMA calls don't poke the progress engine.  This routine
-     * should poke the progress engine when the local target is flushed to help
-     * make asynchronous progress.  Currently this is handled by Win_flush().
-     */
-
-    mpi_errno = win_ptr->RMAFns.Win_flush(rank, win_ptr);
-    if (mpi_errno != MPI_SUCCESS) {
+    /* When the process tries to acquire the lock on itself, it does not
+       go through the progress engine. Therefore, it is possible that
+       one process always grants the lock to itself but never processes
+       events coming from other processes. This may cause deadlock in
+       applications where program execution on the target process depends
+       on events arriving from other processes. Here we poke the progress
+       engine once to avoid such an issue.  */
+    mpi_errno = poke_progress_engine();
+    if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
+
+    if (rank == dest)
+        goto fn_exit;
+
+    if (win_ptr->shm_allocated) {
+        MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
+        MPIDI_Comm_get_vc(win_ptr->comm_ptr, dest, &target_vc);
+        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
+        if (orig_vc->node_id == target_vc->node_id)
+            goto fn_exit;
     }
 
+    mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, dest, &target);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+    if (target == NULL)
+        goto fn_exit;
+
+    /* Set sync_flag in sync struct. */
+    if (target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL)
+        target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;
+
+    /* Issue out all operations. */
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, dest,
+                                                    &made_progress);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+
+    /* Wait for local completion. */
+    do {
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, target,
+                                                      &local_completed,
+                                                      &remote_completed);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+        if (!local_completed) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
+    } while (!local_completed);
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FLUSH_LOCAL);
     return mpi_errno;
@@ -1738,25 +1397,59 @@ int MPIDI_Win_flush_local(int rank, MPID_Win * win_ptr)
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_Win_flush_local_all(MPID_Win * win_ptr)
 {
+    int i, made_progress = 0;
+    int local_completed = 0, remote_completed = 0;
+    MPIDI_RMA_Target_t *curr_target = NULL;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FLUSH_LOCAL_ALL);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FLUSH_LOCAL_ALL);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_LOCK &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_LOCK_ALL,
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_PER_TARGET &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_CALLED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_ISSUED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_GRANTED,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    /* Note: Local RMA calls don't poke the progress engine.  This routine
-     * should poke the progress engine when the local target is flushed to help
-     * make asynchronous progress.  Currently this is handled by Win_flush().
-     */
-
-    mpi_errno = win_ptr->RMAFns.Win_flush_all(win_ptr);
-    if (mpi_errno != MPI_SUCCESS) {
+    /* When the process tries to acquire the lock on itself, it does not
+       go through the progress engine. Therefore, it is possible that
+       one process always grants the lock to itself but never processes
+       events coming from other processes. This may cause deadlock in
+       applications where program execution on the target process depends
+       on events arriving from other processes. Here we poke the progress
+       engine once to avoid such an issue.  */
+    mpi_errno = poke_progress_engine();
+    if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
+
+    /* Set sync_flag in sync struct. */
+    for (i = 0; i < win_ptr->num_slots; i++) {
+        curr_target = win_ptr->slots[i].target_list;
+        while (curr_target != NULL) {
+            if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL) {
+                curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;
+            }
+            curr_target = curr_target->next;
+        }
     }
 
+    /* Issue out all operations. */
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    /* Wait for local completion. */
+    do {
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
+                                                   &remote_completed);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+        if (!local_completed) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
+    } while (!local_completed);
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FLUSH_LOCAL_ALL);
     return mpi_errno;
@@ -1773,86 +1466,69 @@ int MPIDI_Win_flush_local_all(MPID_Win * win_ptr)
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_Win_lock_all(int assert, MPID_Win * win_ptr)
 {
+    int i, rank = win_ptr->comm_ptr->rank;
     int mpi_errno = MPI_SUCCESS;
-    MPIDI_VC_t *orig_vc, *target_vc;
-    int i;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_LOCK_ALL);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_LOCK_ALL);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_NONE,
+    /* Note that here we cannot distinguish whether this access epoch overlaps
+       with a FENCE access epoch (which is not allowed), since a FENCE epoch
+       may end without resetting the window state. We can only detect whether
+       this access epoch overlaps with another PSCW or Passive Target access
+       epoch. */
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_NONE &&
+                        win_ptr->states.access_state != MPIDI_RMA_FENCE_ISSUED &&
+                        win_ptr->states.access_state != MPIDI_RMA_FENCE_GRANTED,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    /* Track access epoch state */
-    win_ptr->epoch_state = MPIDI_EPOCH_LOCK_ALL;
+    if (assert & MPI_MODE_NOCHECK)
+        win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_GRANTED;
+    else
+        win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_CALLED;
+    num_passive_win++;
 
-    /* Set the target's lock state to "called" for all targets */
-    /* FIXME: Don't use this O(p) approach */
-    for (i = 0; i < MPIR_Comm_size(win_ptr->comm_ptr); i++) {
-        MPIU_Assert(win_ptr->targets[i].remote_lock_state == MPIDI_CH3_WIN_LOCK_NONE);
+    win_ptr->lock_all_assert = assert;
 
-        win_ptr->targets[i].remote_lock_state = MPIDI_CH3_WIN_LOCK_CALLED;
-        win_ptr->targets[i].remote_lock_mode = MPI_LOCK_SHARED;
-        win_ptr->targets[i].remote_lock_assert = assert;
-    }
+    MPIU_Assert(win_ptr->outstanding_locks == 0);
 
-    /* Immediately lock the local process for load/store access */
-    mpi_errno = acquire_local_lock(win_ptr, MPI_LOCK_SHARED);
-    if (mpi_errno != MPI_SUCCESS) {
-        MPIU_ERR_POP(mpi_errno);
-    }
+    /* Acquire the lock on myself and the lock on processes on SHM.
+       No need to create a target for them. */
+    if (!(win_ptr->lock_all_assert & MPI_MODE_NOCHECK)) {
+        win_ptr->outstanding_locks++;
+        mpi_errno = acquire_local_lock(win_ptr, MPI_LOCK_SHARED);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
-    if (win_ptr->shm_allocated == TRUE) {
-        /* Immediately lock all targets for load/store access */
-
-        for (i = 0; i < MPIR_Comm_size(win_ptr->comm_ptr); i++) {
-            /* Local process is already locked */
-            if (i == win_ptr->comm_ptr->rank)
-                continue;
-
-            if (win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
-                /* check if target is local and shared memory is allocated on window,
-                 * if so, we directly send lock request and wait for lock reply. */
-
-                /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
-                 * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
-                 * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
-                 * which is only set to TRUE when SHM region is allocated in nemesis.
-                 * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
-                 */
-                MPIDI_Comm_get_vc(win_ptr->comm_ptr, win_ptr->comm_ptr->rank, &orig_vc);
+        if (win_ptr->shm_allocated == TRUE) {
+            MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
+            MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
+            for (i = 0; i < win_ptr->comm_ptr->local_size; i++) {
+                if (i == rank)
+                    continue;
                 MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
-            }
-
-            if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
-                orig_vc->node_id == target_vc->node_id) {
-                mpi_errno = send_lock_msg(i, MPI_LOCK_SHARED, win_ptr);
-                if (mpi_errno) {
-                    MPIU_ERR_POP(mpi_errno);
+                if (orig_vc->node_id == target_vc->node_id) {
+                    win_ptr->outstanding_locks++;
+                    mpi_errno = send_lock_msg(i, MPI_LOCK_SHARED, win_ptr);
+                    if (mpi_errno != MPI_SUCCESS)
+                        MPIU_ERR_POP(mpi_errno);
                 }
             }
         }
 
-        for (i = 0; i < MPIR_Comm_size(win_ptr->comm_ptr); i++) {
-            /* Local process is already locked */
-            if (i == win_ptr->comm_ptr->rank)
-                continue;
-
-            if (win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
-                MPIDI_Comm_get_vc(win_ptr->comm_ptr, win_ptr->comm_ptr->rank, &orig_vc);
-                MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
-            }
-
-            if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
-                orig_vc->node_id == target_vc->node_id) {
-                mpi_errno = wait_for_lock_granted(win_ptr, i);
-                if (mpi_errno) {
-                    MPIU_ERR_POP(mpi_errno);
-                }
-            }
+        /* wait for lock to be granted */
+        while (win_ptr->outstanding_locks > 0) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
         }
     }
 
+    /* Ensure ordering of load/store operations. */
+    if (win_ptr->shm_allocated == TRUE) {
+        OPA_read_write_barrier();
+    }
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_LOCK_ALL);
     return mpi_errno;
@@ -1869,27 +1545,137 @@ int MPIDI_Win_lock_all(int assert, MPID_Win * win_ptr)
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
 {
+    int i, made_progress = 0;
+    int local_completed = 0,remote_completed = 0;
+    int rank = win_ptr->comm_ptr->rank;
+    MPIDI_RMA_Target_t *curr_target = NULL;
+    enum MPIDI_RMA_sync_types sync_flag;
     int mpi_errno = MPI_SUCCESS;
-    int i;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_UNLOCK_ALL);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_UNLOCK_ALL);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_LOCK_ALL,
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_CALLED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_ISSUED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_GRANTED,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    /* Note: Win_unlock currently provides a fence for shared memory windows.
-     * If the implementation changes, a fence is needed here. */
+    /* Ensure ordering of load/store operations. */
+    if (win_ptr->shm_allocated) {
+        OPA_read_write_barrier();
+    }
 
-    for (i = 0; i < MPIR_Comm_size(win_ptr->comm_ptr); i++) {
-        mpi_errno = win_ptr->RMAFns.Win_unlock(i, win_ptr);
-        if (mpi_errno != MPI_SUCCESS) {
+    MPIU_Assert(win_ptr->outstanding_unlocks == 0);
+
+    /* Unlock MYSELF and processes on SHM. */
+    if (!(win_ptr->lock_all_assert & MPI_MODE_NOCHECK)) {
+        mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
+        if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
+
+        if (win_ptr->shm_allocated == TRUE) {
+            MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
+            MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
+            for (i = 0; i < win_ptr->comm_ptr->local_size; i++) {
+                if (i == rank) continue;
+                MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
+                if (orig_vc->node_id == target_vc->node_id) {
+                    win_ptr->outstanding_unlocks++;
+                    mpi_errno = send_unlock_msg(i, win_ptr);
+                    if (mpi_errno != MPI_SUCCESS)
+                        MPIU_ERR_POP(mpi_errno);
+                }
+            }
+        }
+    }
+
+    /* Set sync_flag in sync struct. */
+    if (win_ptr->lock_all_assert & MPI_MODE_NOCHECK)
+        sync_flag = MPIDI_RMA_SYNC_FLUSH;
+    else
+        sync_flag = MPIDI_RMA_SYNC_UNLOCK;
+
+    if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
+        for (i = 0; i < win_ptr->num_slots; i++) {
+            curr_target = win_ptr->slots[i].target_list;
+            while (curr_target != NULL) {
+                if (curr_target->sync.sync_flag < sync_flag) {
+                    curr_target->sync.sync_flag = sync_flag;
+                    curr_target->sync.have_remote_incomplete_ops = 0;
+                    curr_target->sync.outstanding_acks++;
+                }
+                curr_target = curr_target->next;
+            }
+        }
+    }
+    else {
+        for (i = 0; i < win_ptr->comm_ptr->local_size; i++) {
+            if (win_ptr->comm_ptr->local_size <= win_ptr->num_slots)
+                curr_target = win_ptr->slots[i].target_list;
+            else {
+                curr_target = win_ptr->slots[i % win_ptr->num_slots].target_list;
+                while (curr_target != NULL && curr_target->target_rank != i)
+                    curr_target = curr_target->next;
+            }
+
+            if (curr_target != NULL) {
+                if (curr_target->sync.sync_flag < sync_flag) {
+                    curr_target->sync.sync_flag = sync_flag;
+                    curr_target->sync.have_remote_incomplete_ops = 0;
+                    curr_target->sync.outstanding_acks++;
+                }
+            }
+            else {
+                if (win_ptr->lock_all_assert & MPI_MODE_NOCHECK)
+                    continue;
+                if (i == rank)
+                    continue;
+                if (win_ptr->shm_allocated == TRUE) {
+                    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
+                    MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
+                    MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
+                    if (orig_vc->node_id == target_vc->node_id)
+                        continue;
+                }
+
+                win_ptr->outstanding_unlocks++;
+                mpi_errno = send_unlock_msg(i, win_ptr);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+            }
         }
     }
 
-    /* Track access epoch state */
-    win_ptr->epoch_state = MPIDI_EPOCH_NONE;
+    /* Issue out all operations. */
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+
+    /* Wait for remote completion. */
+    do {
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
+                                                   &remote_completed);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (!remote_completed || win_ptr->outstanding_unlocks) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
+    } while (!remote_completed || win_ptr->outstanding_unlocks);
+
+    /* Cleanup all targets on this window. */
+    mpi_errno = MPIDI_CH3I_RMA_Cleanup_targets_win(win_ptr);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    MPIU_Assert(win_ptr->non_empty_slots == 0);
+
+    win_ptr->lock_all_assert = 0;
+    win_ptr->posted_ops_cnt = 0;
+    MPIU_Assert(win_ptr->active_req_cnt == 0);
+
+    win_ptr->states.access_state = MPIDI_EPOCH_NONE;
+    num_passive_win--;
+    MPIU_Assert(num_passive_win >= 0);
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_UNLOCK_ALL);
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index 6226d73..e9c1ee4 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -30,6 +30,8 @@ cvars:
 MPIU_THREADSAFE_INIT_DECL(initRMAoptions);
 
 MPIDI_RMA_Win_list_t *MPIDI_RMA_Win_list = NULL, *MPIDI_RMA_Win_list_tail = NULL;
+int num_active_issued_win = 0;
+int num_passive_win = 0;
 
 static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
                     MPID_Comm * comm_ptr, MPID_Win ** win_ptr);
@@ -304,7 +306,6 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
 
     MPIU_Object_set_ref(*win_ptr, 1);
 
-    (*win_ptr)->fence_issued = 0;
     /* (*win_ptr)->errhandler is set by upper level; */
     /* (*win_ptr)->base is set by caller; */
     (*win_ptr)->size = size;
@@ -312,8 +313,6 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->create_flavor = create_flavor;
     (*win_ptr)->model = model;
     (*win_ptr)->attributes = NULL;
-    (*win_ptr)->start_group_ptr = NULL;
-    (*win_ptr)->start_assert = 0;
     (*win_ptr)->comm_ptr = win_comm_ptr;
 
     (*win_ptr)->at_completion_counter = 0;
@@ -334,6 +333,14 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->non_empty_slots = 0;
     (*win_ptr)->posted_ops_cnt = 0;
     (*win_ptr)->active_req_cnt = 0;
+    (*win_ptr)->fence_sync_req = MPI_REQUEST_NULL;
+    (*win_ptr)->start_req = NULL;
+    (*win_ptr)->start_ranks_in_win_grp = NULL;
+    (*win_ptr)->start_grp_size = 0;
+    (*win_ptr)->lock_all_assert = 0;
+    (*win_ptr)->lock_epoch_count = 0;
+    (*win_ptr)->outstanding_locks = 0;
+    (*win_ptr)->outstanding_unlocks = 0;
 
     /* Initialize the passive target lock state */
     MPIU_CHKPMEM_MALLOC((*win_ptr)->targets, struct MPIDI_Win_target_state *,
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index 4648845..6e63c0f 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -147,6 +147,20 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FREE);
 
+    /* It is possible that an IBARRIER issued by MPI_WIN_FENCE with
+       MODE_NOPRECEDE has not completed yet; we let the progress engine
+       delete its request when it completes. */
+    if ((*win_ptr)->fence_sync_req != MPI_REQUEST_NULL) {
+        MPID_Request *req_ptr;
+        MPID_Request_get_ptr((*win_ptr)->fence_sync_req, req_ptr);
+        MPID_Request_release(req_ptr);
+        (*win_ptr)->fence_sync_req = MPI_REQUEST_NULL;
+        (*win_ptr)->states.access_state = MPIDI_RMA_NONE;
+    }
+
+    if ((*win_ptr)->states.access_state == MPIDI_RMA_FENCE_GRANTED)
+        (*win_ptr)->states.access_state = MPIDI_RMA_NONE;
+
     MPIU_ERR_CHKANDJUMP((*win_ptr)->states.access_state != MPIDI_RMA_NONE ||
                         (*win_ptr)->states.exposure_state != MPIDI_RMA_NONE,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

http://git.mpich.org/mpich.git/commitdiff/257faca27b708b23716da17b4b799a268733f156

commit 257faca27b708b23716da17b4b799a268733f156
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Oct 27 06:14:44 2014 -0500

    Control no. of active RMA requests in the runtime.
    
    When there are too many active requests in the runtime,
    the internal memory might be used up. This patch
    prevents such situation by triggering blocking
    wait loop in operation routines when no. of active
    requests reaches certain threshold value.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index a9af2d4..1bb0a54 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -375,6 +375,8 @@ static inline int MPIDI_CH3I_RMA_Cleanup_ops_target(MPID_Win * win_ptr, MPIDI_RM
             /* dequeue the operation and free it */
             MPL_LL_DELETE(*op_list, *op_list_tail, curr_op);
             MPIDI_CH3I_Win_op_free(win_ptr, curr_op);
+            win_ptr->active_req_cnt--;
+
             if (*op_list == NULL) {
                 if (read_flag == 1) {
                     read_flag = 0;
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index c6e45a3..aba695e 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -380,6 +380,9 @@ struct MPIDI_Win_target_state {
                            in current epoch (accumulated value, not      \
                            current value) to control when to poke        \
                            progress engine in RMA operation routines. */ \
+    int active_req_cnt; /* keep track of number of active requests in    \
+                           current epoch, i.e., number of issued but     \
+                           incomplete RMA operations. */                 \
 
 #ifdef MPIDI_CH3_WIN_DECL
 #define MPID_DEV_WIN_DECL \
diff --git a/src/mpid/ch3/src/ch3u_rma_oplist.c b/src/mpid/ch3/src/ch3u_rma_oplist.c
index 336100c..01afed9 100644
--- a/src/mpid/ch3/src/ch3u_rma_oplist.c
+++ b/src/mpid/ch3/src/ch3u_rma_oplist.c
@@ -21,6 +21,24 @@ cvars:
       description : >-
         Use the immediate accumulate optimization
 
+    - name        : MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD
+      category    : CH3
+      type        : int
+      default     : 2097152
+      class       : none
+      verbosity   : MPI_T_VERBOSITY_USER_BASIC
+      scope       : MPI_T_SCOPE_ALL_EQ
+      description : >-
+         Threshold of number of active requests to trigger
+         blocking waiting in operation routines. When the
+         value is negative, we never blockingly wait in
+         operation routines. When the value is zero, we always
+         trigger blocking waiting in operation routines to
+         wait until no. of active requests becomes zero. When the
+         value is positive, we do blocking waiting in operation
+         routines to wait until no. of active requests being
+         reduced to this value.
+
 === END_MPI_T_CVAR_INFO_BLOCK ===
 */
 
@@ -356,6 +374,7 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *targe
                 MPIDI_CH3I_RMA_Ops_append(&(target->read_op_list),
                                           &(target->read_op_list_tail), curr_op);
             }
+            win_ptr->active_req_cnt++;
         }
 
         curr_op = target->next_op_to_issue;
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 61563b4..af24f11 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -143,6 +143,20 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
         }
+
+        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
+            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
+            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
+                int local_completed = 0, remote_completed = 0;
+                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
+                                                           &remote_completed);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+                mpi_errno = poke_progress_engine();
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+            }
+        }
     }
 
   fn_exit:
@@ -265,6 +279,20 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
         }
+
+        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
+            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
+            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
+                int local_completed = 0, remote_completed = 0;
+                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
+                                                           &remote_completed);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+                mpi_errno = poke_progress_engine();
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+            }
+        }
     }
 
   fn_exit:
@@ -424,6 +452,20 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
         }
+
+        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
+            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
+            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
+                int local_completed = 0, remote_completed = 0;
+                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
+                                                           &remote_completed);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+                mpi_errno = poke_progress_engine();
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+            }
+        }
     }
 
   fn_exit:
@@ -579,6 +621,20 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
         }
+
+        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
+            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
+            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
+                int local_completed = 0, remote_completed = 0;
+                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
+                                                           &remote_completed);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+                mpi_errno = poke_progress_engine();
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+            }
+        }
     }
 
   fn_exit:
@@ -685,6 +741,20 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
         }
+
+        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
+            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
+            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
+                int local_completed = 0, remote_completed = 0;
+                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
+                                                           &remote_completed);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+                mpi_errno = poke_progress_engine();
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+            }
+        }
     }
 
   fn_exit:
@@ -788,6 +858,20 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
         }
+
+        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
+            win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
+            while (win_ptr->active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
+                int local_completed = 0, remote_completed = 0;
+                mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
+                                                           &remote_completed);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+                mpi_errno = poke_progress_engine();
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+            }
+        }
     }
 
   fn_exit:
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index 7d3cc03..6226d73 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -333,6 +333,7 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->states.exposure_state = MPIDI_RMA_NONE;
     (*win_ptr)->non_empty_slots = 0;
     (*win_ptr)->posted_ops_cnt = 0;
+    (*win_ptr)->active_req_cnt = 0;
 
     /* Initialize the passive target lock state */
     MPIU_CHKPMEM_MALLOC((*win_ptr)->targets, struct MPIDI_Win_target_state *,

http://git.mpich.org/mpich.git/commitdiff/33d96690f53ce6bfa091ad9a36e299f1393f32aa

commit 33d96690f53ce6bfa091ad9a36e299f1393f32aa
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Oct 27 06:07:28 2014 -0500

    Enable making progress in operation routines.
    
    We no longer use the lazy-issuing model, which delays
    all operations to the end to issue, but issues them
    as early as possible. To achieve this, we enable
    making progress in RMA routines, so that RMA operations
    can be issued out as long as synchronization is finished.
    
    Sometimes we also need to poke the progress engine in
    operation routines to make sure that the target side
    makes enough progress in receiving packets. Here
    we trigger it when no. of posted operations reaches
    certain threshold value.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index 027910a..c6e45a3 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -376,6 +376,10 @@ struct MPIDI_Win_target_state {
         enum MPIDI_RMA_states exposure_state;                            \
     } states;                                                            \
     int non_empty_slots;                                                 \
+    int posted_ops_cnt; /* keep track of number of posted RMA operations \
+                           in current epoch (accumulated value, not      \
+                           current value) to control when to poke        \
+                           progress engine in RMA operation routines. */ \
 
 #ifdef MPIDI_CH3_WIN_DECL
 #define MPID_DEV_WIN_DECL \
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 911f457..61563b4 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -11,6 +11,30 @@ static int enableShortACC = 1;
 #define MPIDI_PASSIVE_TARGET_DONE_TAG  348297
 #define MPIDI_PASSIVE_TARGET_RMA_TAG 563924
 
+/*
+=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
+
+cvars:
+    - name        : MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS
+      category    : CH3
+      type        : int
+      default     : 100
+      class       : none
+      verbosity   : MPI_T_VERBOSITY_USER_BASIC
+      scope       : MPI_T_SCOPE_ALL_EQ
+      description : >-
+          Specify the threshold of number of posted operations
+          when starting poking progress in operation routines.
+          When the value is negative, runtime never pokes progress
+          engine in operation routines; when the value is zero,
+          runtime always pokes progress engine in operation
+          routines; when the value is larger than zero, runtime
+          starts to poke progress engine when number of posted
+          operations reaches that value.
+
+=== END_MPI_T_CVAR_INFO_BLOCK ===
+*/
+
 #undef FUNCNAME
 #define FUNCNAME MPIDI_Put
 #undef FCNAME
@@ -25,6 +49,7 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
     MPI_Aint dt_true_lb ATTRIBUTE((unused));
     MPIDI_msg_sz_t data_sz;
     MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
+    int made_progress = 0;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_PUT);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_PUT);
@@ -107,6 +132,17 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
             MPID_Datatype_add_ref(dtp);
             new_ptr->is_dt = 1;
         }
+
+        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        win_ptr->posted_ops_cnt++;
+        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
+            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
+            mpi_errno = poke_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
     }
 
   fn_exit:
@@ -135,6 +171,7 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
     MPI_Aint dt_true_lb ATTRIBUTE((unused));
     MPID_Datatype *dtp;
     MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
+    int made_progress = 0;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET);
@@ -217,6 +254,17 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
             MPID_Datatype_add_ref(dtp);
             new_ptr->is_dt = 1;
         }
+
+        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        win_ptr->posted_ops_cnt++;
+        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
+            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
+            mpi_errno = poke_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
     }
 
   fn_exit:
@@ -245,6 +293,7 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
     MPI_Aint dt_true_lb ATTRIBUTE((unused));
     MPID_Datatype *dtp;
     MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
+    int made_progress = 0;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE);
@@ -326,7 +375,7 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                 if (mpi_errno)
                     MPIU_ERR_POP(mpi_errno);
 
-                goto fn_exit;
+                goto issue_ops;
             }
         }
 
@@ -363,6 +412,18 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
             MPID_Datatype_add_ref(dtp);
             new_ptr->is_dt = 1;
         }
+
+ issue_ops:
+        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        win_ptr->posted_ops_cnt++;
+        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
+            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
+            mpi_errno = poke_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
     }
 
   fn_exit:
@@ -393,6 +454,7 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
     MPI_Aint dt_true_lb ATTRIBUTE((unused));
     MPID_Datatype *dtp;
     MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
+    int made_progress = 0;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET_ACCUMULATE);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET_ACCUMULATE);
@@ -506,6 +568,17 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
             MPID_Datatype_add_ref(dtp);
             new_ptr->is_dt = 1;
         }
+
+        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        win_ptr->posted_ops_cnt++;
+        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
+            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
+            mpi_errno = poke_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
     }
 
   fn_exit:
@@ -530,6 +603,7 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
     int mpi_errno = MPI_SUCCESS;
     int rank;
     MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
+    int made_progress = 0;
 
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_COMPARE_AND_SWAP);
 
@@ -600,6 +674,17 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
         mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
+
+        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        win_ptr->posted_ops_cnt++;
+        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
+            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
+            mpi_errno = poke_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
     }
 
   fn_exit:
@@ -623,6 +708,7 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
     int mpi_errno = MPI_SUCCESS;
     int rank;
     MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
+    int made_progress = 0;
 
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_FETCH_AND_OP);
 
@@ -691,6 +777,17 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
         mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
+
+        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        win_ptr->posted_ops_cnt++;
+        if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
+            win_ptr->posted_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
+            mpi_errno = poke_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
     }
 
   fn_exit:
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index c950116..7d3cc03 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -332,6 +332,7 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->states.access_state = MPIDI_RMA_NONE;
     (*win_ptr)->states.exposure_state = MPIDI_RMA_NONE;
     (*win_ptr)->non_empty_slots = 0;
+    (*win_ptr)->posted_ops_cnt = 0;
 
     /* Initialize the passive target lock state */
     MPIU_CHKPMEM_MALLOC((*win_ptr)->targets, struct MPIDI_Win_target_state *,

http://git.mpich.org/mpich.git/commitdiff/5dd5515429c5f5fd9b7426f9f0956f16143e8aad

commit 5dd5515429c5f5fd9b7426f9f0956f16143e8aad
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Nov 2 22:25:07 2014 -0600

    Implement GET_OP routine which guarantees to return an OP.
    
    GET_OP function may be a blocking function which guarantees
    to return an RMA operation.
    
    Inside GET_OP we first call the normal OP_ALLOC function
    which will try to get a new OP from OP pools; if failed,
    we call nonblocking GC function to cleanup completed ops
    and then call OP_ALLOC again; if we still cannot get a
    new OP, we call nonblocking FREE_OP_BEFORE_COMPLETION
    function if hardware ordering is provided and then call
    OP_ALLOC again; if still failed, finally we call blocking
    aggressive cleanup function, which will guarantee to
    return a new OP element.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index e0d565a..a9af2d4 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -517,6 +517,49 @@ static inline int MPIDI_CH3I_RMA_Cleanup_targets_win(MPID_Win *win_ptr)
     goto fn_exit;
 }
 
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Win_get_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_Win_get_op(MPID_Win * win_ptr, MPIDI_RMA_Op_t **e)
+{
+    MPIDI_RMA_Op_t *new_ptr = NULL;
+    int local_completed = 0, remote_completed = 0;
+    int mpi_errno = MPI_SUCCESS;
+    /* Obtain an RMA op element for win_ptr and store it in *e; returns an MPI error code. */
+    while (1) {     /* retry allocation, escalating the cleanup step after each failure */
+        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);     /* attempt 1: plain allocation */
+        if (new_ptr != NULL) break;
+        /* attempt 2: clean up ops on this window, then retry the allocation */
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr,
+                                                   &local_completed,
+                                                   &remote_completed);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
+        if (new_ptr != NULL) break;
+        /* attempt 3: only when the flush_remote ordering flag is set, free ops before completion */
+        if (MPIDI_RMA_Pkt_orderings->flush_remote) {
+            mpi_errno = MPIDI_CH3I_RMA_Free_ops_before_completion(win_ptr);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        }
+
+        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
+        if (new_ptr != NULL) break;
+        /* last resort: aggressive cleanup, then loop back to the first allocation attempt */
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
+
+    (*e) = new_ptr;     /* the loop above is left only through a break, i.e. with new_ptr != NULL */
+
+ fn_exit:
+    return mpi_errno;
+ fn_fail:     /* NOTE(review): presumably the MPIU_ERR_POP target; *e is not written on that path - confirm */
+    goto fn_exit;
+}
+
+
 /* Return nonzero if the RMA operations list is empty.
  */
 #undef FUNCNAME
@@ -586,11 +629,8 @@ static inline int MPIDI_CH3I_RMA_Ops_alloc_tail(MPID_Win * win_ptr, MPIDI_RMA_Op
     int mpi_errno = MPI_SUCCESS;
     MPIDI_RMA_Op_t *tmp_ptr;
 
-    tmp_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
-    if (tmp_ptr == NULL) {
-        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr, &tmp_ptr);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-    }
+    mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &tmp_ptr);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
     MPL_LL_APPEND(*list, *list_tail, tmp_ptr);
 
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index c2ab3a0..027910a 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -303,6 +303,14 @@ struct MPIDI_Win_info_args {
 
 struct MPIDI_RMA_op;            /* forward decl from mpidrma.h */
 
+typedef struct MPIDI_RMA_Pkt_orderings {
+    int flush_remote; /* ordered FLUSH, for remote completion */
+    /* FIXME: in future we should also add local completion
+       ordering: WAW, WAR, RAW, RAR. */
+} MPIDI_RMA_Pkt_orderings_t;
+
+extern MPIDI_RMA_Pkt_orderings_t *MPIDI_RMA_Pkt_orderings;
+
 struct MPIDI_Win_target_state {
     struct MPIDI_RMA_Op *rma_ops_list;
                                 /* List of outstanding RMA operations */
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 36d0d1f..911f457 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -72,11 +72,8 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
         MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
 
         /* queue it up */
-        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
-        if (new_ptr == NULL) {
-            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr, &new_ptr);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-        }
+        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
         put_pkt = &(new_ptr->pkt.put);
         MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
@@ -185,11 +182,8 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
         MPIDI_CH3_Pkt_get_t *get_pkt = NULL;
 
         /* queue it up */
-        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
-        if (new_ptr == NULL) {
-            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr, &new_ptr);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-        }
+        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
         get_pkt = &(new_ptr->pkt.get);
         MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
@@ -299,11 +293,8 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
         MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
 
         /* queue it up */
-        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
-        if (new_ptr == NULL) {
-            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr, &new_ptr);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-        }
+        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
         /* If predefined and contiguous, use a simplified element */
         if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
@@ -450,11 +441,8 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
         MPIDI_RMA_Op_t *new_ptr = NULL;
 
         /* Append the operation to the window's RMA ops queue */
-        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
-        if (new_ptr == NULL) {
-            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr, &new_ptr);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-        }
+        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
         /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */
 
@@ -589,11 +577,8 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
         MPIDI_CH3_Pkt_cas_t *cas_pkt = NULL;
 
         /* Append this operation to the RMA ops queue */
-        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
-        if (new_ptr == NULL) {
-            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr, &new_ptr);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-        }
+        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
         cas_pkt = &(new_ptr->pkt.cas);
         MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS);
@@ -684,11 +669,8 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
         MPIDI_CH3_Pkt_fop_t *fop_pkt = NULL;
 
         /* Append this operation to the RMA ops queue */
-        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
-        if (new_ptr == NULL) {
-            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr, &new_ptr);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-        }
+        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
         fop_pkt = &(new_ptr->pkt.fop);
         MPIDI_Pkt_init(fop_pkt, MPIDI_CH3_PKT_FOP);
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index b2d851d..4648845 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -65,6 +65,7 @@ cvars:
 
 struct MPIDI_RMA_Op *global_rma_op_pool = NULL, *global_rma_op_pool_tail = NULL, *global_rma_op_pool_start = NULL;
 struct MPIDI_RMA_Target *global_rma_target_pool = NULL, *global_rma_target_pool_tail = NULL, *global_rma_target_pool_start = NULL;
+struct MPIDI_RMA_Pkt_orderings *MPIDI_RMA_Pkt_orderings = NULL;
 
 #undef FUNCNAME
 #define FUNCNAME MPIDI_RMA_init
@@ -74,7 +75,7 @@ int MPIDI_RMA_init(void)
 {
     int mpi_errno = MPI_SUCCESS;
     int i;
-    MPIU_CHKPMEM_DECL(2);
+    MPIU_CHKPMEM_DECL(3);
 
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_RMA_INIT);
 
@@ -96,6 +97,13 @@ int MPIDI_RMA_init(void)
         MPL_LL_APPEND(global_rma_target_pool, global_rma_target_pool_tail, &(global_rma_target_pool_start[i]));
     }
 
+    MPIU_CHKPMEM_MALLOC(MPIDI_RMA_Pkt_orderings, struct MPIDI_RMA_Pkt_orderings *,
+                        sizeof(struct MPIDI_RMA_Pkt_orderings),
+                        mpi_errno, "RMA packet orderings");
+    /* FIXME: here we should let channel to set ordering flags. For now we just set them
+       in CH3 layer. */
+    MPIDI_RMA_Pkt_orderings->flush_remote = 1;
+
   fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_RMA_INIT);
     return mpi_errno;
@@ -118,6 +126,7 @@ void MPIDI_RMA_finalize(void)
 
     MPIU_Free(global_rma_op_pool_start);
     MPIU_Free(global_rma_target_pool_start);
+    MPIU_Free(MPIDI_RMA_Pkt_orderings);
 
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_RMA_FINALIZE);
 }

http://git.mpich.org/mpich.git/commitdiff/7c1e12f0cfb0d4370375becf6c3cf4db9607323e

commit 7c1e12f0cfb0d4370375becf6c3cf4db9607323e
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Nov 2 22:24:24 2014 -0600

    Free incomplete ops when FLUSH ordering is provided.
    
    When FLUSH sync is issued and remote completion
    ordering between the last FLUSH message and all
    previous ops is provided by current hardware, we
    no longer need to maintain incomplete operations
    but only need to wait for the ACK of current
    FLUSH. Therefore we can free those operation
    resources without blocking waiting.
    
    Note that if we do this, we temporarily lose the
    opportunity to do a real FLUSH_LOCAL until the
    current FLUSH ACK is received.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 3428db5..e0d565a 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -10,6 +10,7 @@
 #include "mpl_utlist.h"
 #include "mpid_rma_types.h"
 
+int MPIDI_CH3I_RMA_Free_ops_before_completion(MPID_Win * win_ptr);
 int MPIDI_CH3I_RMA_Cleanup_ops_aggressive(MPID_Win * win_ptr);
 int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPID_Win * win_ptr, MPIDI_RMA_Target_t ** target);
 int MPIDI_CH3I_RMA_Make_progress_target(MPID_Win * win_ptr, int target_rank, int *made_progress);
@@ -112,6 +113,7 @@ static inline MPIDI_RMA_Target_t *MPIDI_CH3I_Win_target_alloc(MPID_Win * win_ptr
     e->lock_type = MPIDI_RMA_LOCK_TYPE_NONE;
     e->lock_mode = 0;
     e->outstanding_lock = 0;
+    e->disable_flush_local = 0;
 
     e->sync.sync_flag = MPIDI_RMA_NONE;
     e->sync.outstanding_acks = 0;
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index 2158320..27d137a 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -95,6 +95,7 @@ typedef struct MPIDI_RMA_Target {
     enum MPIDI_RMA_Lock_type lock_type; /* SHARED, EXCLUSIVE */
     int lock_mode;              /* e.g., MODE_NO_CHECK */
     int outstanding_lock;
+    int disable_flush_local;
 
     /* The target structure is free to be cleaned up when all of the
      * following conditions hold true:
diff --git a/src/mpid/ch3/src/ch3u_rma_oplist.c b/src/mpid/ch3/src/ch3u_rma_oplist.c
index de33e15..336100c 100644
--- a/src/mpid/ch3/src/ch3u_rma_oplist.c
+++ b/src/mpid/ch3/src/ch3u_rma_oplist.c
@@ -423,6 +423,77 @@ static inline int issue_ops_win(MPID_Win *win_ptr, int *made_progress)
 
 
 #undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_RMA_Free_ops_before_completion
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/* Free the operation elements kept on one target's read/write op lists
+ * without waiting for those operations to complete.
+ *
+ * win_ptr: window whose targets are scanned.
+ *
+ * Returns MPI_SUCCESS, or an error code if the window has no non-empty
+ * slot ("**rmanoop") or if the single nonblocking progress attempt fails.
+ *
+ * Nothing is freed unless the window access state is
+ * MPIDI_RMA_FENCE_GRANTED after the progress attempt.  The first target
+ * found with a non-empty read or write op list gets disable_flush_local
+ * set and has both of its lists emptied (read list first, then write
+ * list). */
+int MPIDI_CH3I_RMA_Free_ops_before_completion(MPID_Win * win_ptr)
+{
+    MPIDI_RMA_Op_t *curr_op = NULL;
+    MPIDI_RMA_Target_t *curr_target = NULL;
+    struct MPIDI_RMA_Op **op_list = NULL, **op_list_tail = NULL;
+    int read_flag = 0;
+    int i, made_progress = 0;
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER,
+                        "**rmanoop");
+
+    /* make nonblocking progress once */
+    if (win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
+        win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED) {
+        mpi_errno = issue_ops_win(win_ptr, &made_progress);
+        if (mpi_errno != MPI_SUCCESS) {MPIU_ERR_POP(mpi_errno);}
+    }
+    if (win_ptr->states.access_state != MPIDI_RMA_FENCE_GRANTED)
+        goto fn_exit;
+
+    /* find targets that have operations */
+    for (i = 0; i < win_ptr->num_slots; i++) {
+        if (win_ptr->slots[i].target_list != NULL) {
+            curr_target = win_ptr->slots[i].target_list;
+            while (curr_target != NULL && curr_target->read_op_list == NULL
+                   && curr_target->write_op_list == NULL)
+                curr_target = curr_target->next;
+            if (curr_target != NULL) break;
+        }
+    }
+    if (curr_target == NULL) goto fn_exit;
+
+    /* The ops are dropped before completion, so record on the target that
+     * flush_local is disabled for it. */
+    curr_target->disable_flush_local = 1;
+
+    /* Start with the read list if it has entries, otherwise the write list. */
+    if (curr_target->read_op_list != NULL) {
+        op_list = &curr_target->read_op_list;
+        op_list_tail = &curr_target->read_op_list_tail;
+        read_flag = 1;
+    }
+    else {
+        op_list = &curr_target->write_op_list;
+        op_list_tail = &curr_target->write_op_list_tail;
+    }
+
+    /* free all ops in the list since we do not need to maintain them anymore */
+    for (curr_op = *op_list; curr_op != NULL; ) {
+        MPID_Request_release(curr_op->request);
+        MPL_LL_DELETE(*op_list, *op_list_tail, curr_op);
+        MPIDI_CH3I_Win_op_free(win_ptr, curr_op);
+        if (*op_list == NULL) {
+            if (read_flag == 1) {
+                /* Read list drained: continue with the write list.  Both
+                 * the head and the tail pointer must be switched so that
+                 * MPL_LL_DELETE updates the write list consistently. */
+                op_list = &curr_target->write_op_list;
+                op_list_tail = &curr_target->write_op_list_tail;
+                read_flag = 0;
+            }
+        }
+        curr_op = *op_list;
+    }
+
+ fn_exit:
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
 #define FUNCNAME MPIDI_CH3I_RMA_Cleanup_ops_aggressive
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)

http://git.mpich.org/mpich.git/commitdiff/41a365ec8806b5f53666d7693f5d50745e2458bf

commit 41a365ec8806b5f53666d7693f5d50745e2458bf
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Nov 2 22:17:07 2014 -0600

    Add blocking ops / targets aggressively cleanup functions.
    
    When we run out of resources for operations and targets,
    we need to make the runtime to complete some operations
    so that it can free some resources.
    
    For RMA operations, we implement by doing an internal
    FLUSH_LOCAL for one target and waiting for operation
    resources; for RMA targets, we implement by doing an
    internal FLUSH operation for one target and wait for
    target resources.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpi/errhan/errnames.txt b/src/mpi/errhan/errnames.txt
index 4007488..011d9fa 100644
--- a/src/mpi/errhan/errnames.txt
+++ b/src/mpi/errhan/errnames.txt
@@ -396,6 +396,8 @@ be in the range 0 to %d
 **rmaattach:Memory cannot be attached
 **rmashared:Memory cannot be shared
 **rmaflavor:Incorrect window flavor
+**rmanoop:No RMA operation resources can be freed from the window
+**rmanotarget:No RMA target resources can be freed from the window
 **assert:Invalid assert argument
 **lockassertval:Invalid assert argument passed to MPI_Win_lock
 **lockassertval %d: Invalid assert argument (%d) passed to MPI_Win_lock
diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 6e125c2..3428db5 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -10,6 +10,8 @@
 #include "mpl_utlist.h"
 #include "mpid_rma_types.h"
 
+int MPIDI_CH3I_RMA_Cleanup_ops_aggressive(MPID_Win * win_ptr);
+int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPID_Win * win_ptr, MPIDI_RMA_Target_t ** target);
 int MPIDI_CH3I_RMA_Make_progress_target(MPID_Win * win_ptr, int target_rank, int *made_progress);
 int MPIDI_CH3I_RMA_Make_progress_win(MPID_Win * win_ptr, int *made_progress);
 
@@ -166,7 +168,10 @@ static inline int MPIDI_CH3I_Win_create_target(MPID_Win * win_ptr, int target_ra
         slot = &(win_ptr->slots[target_rank]);
 
     t = MPIDI_CH3I_Win_target_alloc(win_ptr);
-    MPIU_ERR_CHKANDJUMP(t == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+    if (t == NULL) {
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_target_aggressive(win_ptr, &t);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
 
     t->target_rank = target_rank;
 
@@ -580,7 +585,10 @@ static inline int MPIDI_CH3I_RMA_Ops_alloc_tail(MPID_Win * win_ptr, MPIDI_RMA_Op
     MPIDI_RMA_Op_t *tmp_ptr;
 
     tmp_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
-    MPIU_ERR_CHKANDJUMP(tmp_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+    if (tmp_ptr == NULL) {
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr, &tmp_ptr);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
 
     MPL_LL_APPEND(*list, *list_tail, tmp_ptr);
 
diff --git a/src/mpid/ch3/src/ch3u_rma_oplist.c b/src/mpid/ch3/src/ch3u_rma_oplist.c
index d8e0bc4..de33e15 100644
--- a/src/mpid/ch3/src/ch3u_rma_oplist.c
+++ b/src/mpid/ch3/src/ch3u_rma_oplist.c
@@ -423,6 +423,156 @@ static inline int issue_ops_win(MPID_Win *win_ptr, int *made_progress)
 
 
 #undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_RMA_Cleanup_ops_aggressive
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/* Force local completion of the pending operations of one target.
+ *
+ * win_ptr: window whose slots are scanned for a target with a non-empty
+ *          pending_op_list.
+ *
+ * Returns MPI_SUCCESS, or an error code if the window has no non-empty
+ * slot ("**rmanoop") or if one of the called progress/cleanup routines
+ * fails.
+ *
+ * NOTE(review): call sites in this same change pass a second argument
+ * (e.g. "&new_ptr") while this definition takes only win_ptr -- confirm
+ * the intended signature. */
+int MPIDI_CH3I_RMA_Cleanup_ops_aggressive(MPID_Win * win_ptr)
+{
+    int i, local_completed = 0, remote_completed = 0;
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_RMA_Target_t *curr_target = NULL;
+    int made_progress = 0;
+
+    /* If we are in an aggressive cleanup, the window must be holding
+     * up resources.  If it isn't, we are in the wrong window and
+     * incorrectly entered this function. */
+    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER,
+                        "**rmanoop");
+
+    /* find the first target that has something to issue */
+    for (i = 0; i < win_ptr->num_slots; i++) {
+        if (win_ptr->slots[i].target_list != NULL) {
+            curr_target = win_ptr->slots[i].target_list;
+            while (curr_target != NULL && curr_target->pending_op_list == NULL)
+                curr_target = curr_target->next;
+            if (curr_target != NULL) break;
+        }
+    }
+
+    /* No target has pending operations: return with mpi_errno still
+     * MPI_SUCCESS and without having waited on anything.
+     * NOTE(review): callers appear to expect a usable op afterwards --
+     * confirm this path is acceptable. */
+    if (curr_target == NULL) goto fn_exit;
+
+    /* Raise the target's sync flag to FLUSH_LOCAL unless it already holds
+     * a flag that compares greater or equal. */
+    if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL)
+        curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;
+
+    /* Issue out all operations. */
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, curr_target->target_rank,
+                                                    &made_progress);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+
+    /* Wait for local completion. */
+    /* Each pass runs the per-target cleanup and, while it does not report
+     * local completion, calls wait_progress_engine() before retrying. */
+    do {
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target,
+                                                      &local_completed,
+                                                      &remote_completed);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+        if (!local_completed) {
+            mpi_errno = wait_progress_engine();
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
+    } while (!local_completed);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_RMA_Cleanup_target_aggressive
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/* Obtain a target element by flushing and cleaning up existing targets of
+ * the window until MPIDI_CH3I_Win_target_alloc() succeeds.
+ *
+ * win_ptr: window whose targets are flushed and cleaned up.
+ * target:  output; set to NULL on entry and to the allocated target
+ *          element once the loop terminates.
+ *
+ * Returns MPI_SUCCESS, or an error code if the window has no non-empty
+ * slot ("**rmanotarget") or if one of the called routines fails. */
+int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPID_Win * win_ptr, MPIDI_RMA_Target_t ** target)
+{
+    int i, local_completed = 0, remote_completed = 0;
+    int made_progress = 0;
+    MPIDI_RMA_Target_t *curr_target = NULL;
+    int mpi_errno = MPI_SUCCESS;
+
+    (*target) = NULL;
+
+    /* If we are in an aggressive cleanup, the window must be holding
+     * up resources.  If it isn't, we are in the wrong window and
+     * incorrectly entered this function. */
+    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER,
+                        "**rmanotarget");
+
+    if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
+        /* switch to window-wide protocol */
+        /* For every other rank whose VC has a different node_id than ours
+         * and that has no target element yet, count an outstanding lock
+         * and send a shared lock message; then mark the window as
+         * LOCK_ALL_ISSUED. */
+        MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
+        MPIDI_Comm_get_vc(win_ptr->comm_ptr, win_ptr->comm_ptr->rank, &orig_vc);
+        for (i = 0; i < win_ptr->comm_ptr->local_size; i++) {
+            if (i == win_ptr->comm_ptr->rank)
+                continue;
+            MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
+            if (orig_vc->node_id != target_vc->node_id) {
+                mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, i, &curr_target);
+                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (curr_target == NULL) {
+                    win_ptr->outstanding_locks++;
+                    mpi_errno = send_lock_msg(i, MPI_LOCK_SHARED, win_ptr);
+                    if (mpi_errno != MPI_SUCCESS)
+                        MPIU_ERR_POP(mpi_errno);
+                }
+            }
+        }
+        win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_ISSUED;
+    }
+
+    do {
+        /* find a non-empty slot and set the FLUSH flag on the first
+         * target */
+        /* TODO: we should think about better strategies on selecting the target */
+        /* NOTE(review): if no slot is non-empty the loop ends with
+         * i == win_ptr->num_slots and slots[i] is read below; presumably
+         * the non_empty_slots check above rules this out on the first
+         * pass -- confirm for later passes. */
+        for (i = 0; i < win_ptr->num_slots; i++)
+            if (win_ptr->slots[i].target_list != NULL)
+                break;
+        curr_target = win_ptr->slots[i].target_list;
+        if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
+            /* Upgrade to FLUSH and account for one more expected ack. */
+            curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
+            curr_target->sync.have_remote_incomplete_ops = 0;
+            curr_target->sync.outstanding_acks++;
+        }
+
+        /* Issue out all operations. */
+        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, curr_target->target_rank,
+                                                        &made_progress);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+
+        /* Wait for remote completion. */
+        do {
+            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target,
+                                                          &local_completed,
+                                                          &remote_completed);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+            if (!remote_completed) {
+                mpi_errno = wait_progress_engine();
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+            }
+        } while (!remote_completed);
+
+        /* Cleanup the target. */
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_single_target(win_ptr, curr_target);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        /* check if we got a target */
+        (*target) = MPIDI_CH3I_Win_target_alloc(win_ptr);
+
+    } while ((*target) == NULL);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
 #define FUNCNAME MPIDI_CH3I_RMA_Make_progress_target
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index afcb39a..36d0d1f 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -73,7 +73,10 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
 
         /* queue it up */
         new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
-        MPIU_ERR_CHKANDJUMP(new_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+        if (new_ptr == NULL) {
+            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr, &new_ptr);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        }
 
         put_pkt = &(new_ptr->pkt.put);
         MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
@@ -183,7 +186,10 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
 
         /* queue it up */
         new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
-        MPIU_ERR_CHKANDJUMP(new_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+        if (new_ptr == NULL) {
+            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr, &new_ptr);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        }
 
         get_pkt = &(new_ptr->pkt.get);
         MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
@@ -294,7 +300,10 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
 
         /* queue it up */
         new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
-        MPIU_ERR_CHKANDJUMP(new_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+        if (new_ptr == NULL) {
+            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr, &new_ptr);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        }
 
         /* If predefined and contiguous, use a simplified element */
         if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
@@ -442,7 +451,10 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
 
         /* Append the operation to the window's RMA ops queue */
         new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
-        MPIU_ERR_CHKANDJUMP(new_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+        if (new_ptr == NULL) {
+            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr, &new_ptr);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        }
 
         /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */
 
@@ -578,7 +590,10 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
 
         /* Append this operation to the RMA ops queue */
         new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
-        MPIU_ERR_CHKANDJUMP(new_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+        if (new_ptr == NULL) {
+            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr, &new_ptr);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        }
 
         cas_pkt = &(new_ptr->pkt.cas);
         MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS);
@@ -670,7 +685,10 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
 
         /* Append this operation to the RMA ops queue */
         new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
-        MPIU_ERR_CHKANDJUMP(new_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+        if (new_ptr == NULL) {
+            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr, &new_ptr);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        }
 
         fop_pkt = &(new_ptr->pkt.fop);
         MPIDI_Pkt_init(fop_pkt, MPIDI_CH3_PKT_FOP);

http://git.mpich.org/mpich.git/commitdiff/ab058906941ac3342deb1c7407ab9db88f6b7dfe

commit ab058906941ac3342deb1c7407ab9db88f6b7dfe
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Nov 3 22:17:55 2014 -0600

    Add nonblocking progress making functions.
    
    Progress making functions check if current
    synchronization is finished, change synchronization
    state if possible, and issue pending operations
    on window as many as possible.
    
    There are three granularities of progress making functions:
    per-target, per-window and per-process. Per-target
    routine is used in RMA routine functions (PUT/GET/ACC...)
    and single passive lock (Win_unlock, Win_flush, Win_flush_local);
    per-window routine is used in window-wide synchronization
    calls (Win_fence, Win_complete, Win_unlock_all,
    Win_flush_all, Win_flush_local_all), and per-process
    routine is used in progress engine.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_progress.c b/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
index 13c2fa5..3df4f8e 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_progress.c
@@ -471,6 +471,13 @@ int MPIDI_CH3I_Progress (MPID_Progress_state *progress_state, int is_blocking)
         }
 #endif /* HAVE_LIBHCOLL */
 
+        /* make progress on RMA */
+        mpi_errno = MPIDI_CH3I_RMA_Make_progress_global(&made_progress);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+        if (made_progress)
+            MPIDI_CH3_Progress_signal_completion();
+
         /* in the case of progress_wait, bail out if anything completed (CC-1) */
         if (is_blocking) {
             int completion_count = OPA_load_int(&MPIDI_CH3I_progress_completion_count);
diff --git a/src/mpid/ch3/channels/sock/src/ch3_progress.c b/src/mpid/ch3/channels/sock/src/ch3_progress.c
index bfe2c21..c0cab1a 100644
--- a/src/mpid/ch3/channels/sock/src/ch3_progress.c
+++ b/src/mpid/ch3/channels/sock/src/ch3_progress.c
@@ -95,6 +95,11 @@ static int MPIDI_CH3i_Progress_test(void)
     }
 #endif /* HAVE_LIBHCOLL */
 
+    /* make progress on RMA */
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_global(&made_progress);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
+
     mpi_errno = MPIDU_Sock_wait(MPIDI_CH3I_sock_set, 0, &event);
 
     if (mpi_errno == MPI_SUCCESS)
@@ -203,6 +208,15 @@ static int MPIDI_CH3i_Progress_wait(MPID_Progress_state * progress_state)
         }
 #endif /* HAVE_LIBHCOLL */
 
+        /* make progress on RMA */
+        mpi_errno = MPIDI_CH3I_RMA_Make_progress_global(&made_progress);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+        if (made_progress) {
+            MPIDI_CH3_Progress_signal_completion();
+            break;
+        }
+
 #       ifdef MPICH_IS_THREADED
 
 	/* The logic for this case is just complicated enough that
diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index f723750..6e125c2 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -10,6 +10,9 @@
 #include "mpl_utlist.h"
 #include "mpid_rma_types.h"
 
+int MPIDI_CH3I_RMA_Make_progress_target(MPID_Win * win_ptr, int target_rank, int *made_progress);
+int MPIDI_CH3I_RMA_Make_progress_win(MPID_Win * win_ptr, int *made_progress);
+
 extern struct MPIDI_RMA_Op *global_rma_op_pool, *global_rma_op_pool_tail, *global_rma_op_pool_start;
 extern struct MPIDI_RMA_Target *global_rma_target_pool, *global_rma_target_pool_tail, *global_rma_target_pool_start;
 
diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index 8f18aa3..875e97b 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -1827,6 +1827,8 @@ int MPIDI_CH3_PktHandler_Revoke(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
                                 MPIDI_msg_sz_t *buflen, MPID_Request **rreqp);
 int MPIDI_CH3_PktHandler_Init( MPIDI_CH3_PktHandler_Fcn *[], int );
 
+int MPIDI_CH3I_RMA_Make_progress_global(int *made_progress);
+
 #ifdef MPICH_DBG_OUTPUT
 int MPIDI_CH3_PktPrint_CancelSendReq( FILE *, MPIDI_CH3_Pkt_t * );
 int MPIDI_CH3_PktPrint_CancelSendResp( FILE *, MPIDI_CH3_Pkt_t * );
diff --git a/src/mpid/ch3/src/ch3u_rma_oplist.c b/src/mpid/ch3/src/ch3u_rma_oplist.c
index ad9307d..d8e0bc4 100644
--- a/src/mpid/ch3/src/ch3u_rma_oplist.c
+++ b/src/mpid/ch3/src/ch3u_rma_oplist.c
@@ -24,6 +24,10 @@ cvars:
 === END_MPI_T_CVAR_INFO_BLOCK ===
 */
 
+static inline int issue_rma_op(MPIDI_RMA_Op_t * op_ptr, MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags);
+static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *target, int *made_progress);
+static inline int issue_ops_win(MPID_Win * win_ptr, int *made_progress);
+
 static int send_flush_msg(int dest, MPID_Win *win_ptr);
 static int send_rma_msg(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags);
 static int send_contig_acc_msg(MPIDI_RMA_Op_t * rma_op,
@@ -32,6 +36,522 @@ static int recv_rma_msg(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr, MPIDI_CH3_P
 static int send_immed_rmw_msg(MPIDI_RMA_Op_t * rma_op,
                               MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags);
 
+#undef FUNCNAME
+#define FUNCNAME issue_rma_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/* Issue one RMA operation by forwarding it, together with "flags", to the
+ * helper that handles its packet type:
+ *   PUT / ACCUMULATE / GET_ACCUM -> send_rma_msg
+ *   ACCUM_IMMED                  -> send_contig_acc_msg
+ *   GET                          -> recv_rma_msg
+ *   CAS / FOP                    -> send_immed_rmw_msg
+ * Any other packet type raises an error of class MPI_ERR_OTHER with the
+ * message key "**winInvalidOp".  Returns the resulting MPI error code. */
+static inline int issue_rma_op(MPIDI_RMA_Op_t * op_ptr, MPID_Win * win_ptr,
+                               MPIDI_CH3_Pkt_flags_t flags)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_ISSUE_RMA_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_RMA_OP);
+
+    if (op_ptr->pkt.type == MPIDI_CH3_PKT_PUT ||
+        op_ptr->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
+        op_ptr->pkt.type == MPIDI_CH3_PKT_GET_ACCUM) {
+        mpi_errno = send_rma_msg(op_ptr, win_ptr, flags);
+    }
+    else if (op_ptr->pkt.type == MPIDI_CH3_PKT_ACCUM_IMMED) {
+        mpi_errno = send_contig_acc_msg(op_ptr, win_ptr, flags);
+    }
+    else if (op_ptr->pkt.type == MPIDI_CH3_PKT_GET) {
+        mpi_errno = recv_rma_msg(op_ptr, win_ptr, flags);
+    }
+    else if (op_ptr->pkt.type == MPIDI_CH3_PKT_CAS ||
+             op_ptr->pkt.type == MPIDI_CH3_PKT_FOP) {
+        mpi_errno = send_immed_rmw_msg(op_ptr, win_ptr, flags);
+    }
+    else {
+        /* Unknown packet type: nothing can be issued for it. */
+        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**winInvalidOp");
+    }
+
+    /* Propagate a failure reported by whichever helper was called. */
+    if (mpi_errno) {
+        MPIU_ERR_POP(mpi_errno);
+    }
+
+  fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_RMA_OP);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+  fn_fail:
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+#undef FUNCNAME
+#define FUNCNAME check_window_state
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+/* Advance the window-level access state from an "ISSUED" state to the
+ * matching "GRANTED" state when the synchronization it waits for is done.
+ *
+ * win_ptr:       window whose states.access_state is examined.
+ * made_progress: output; set to 1 when the state was advanced, else 0.
+ * cannot_issue:  output; set to 1 when the state is MPIDI_RMA_NONE or the
+ *                awaited synchronization has not finished yet, else 0.
+ *
+ * Returns mpi_errno, which is never modified in this function and thus
+ * stays MPI_SUCCESS. */
+static inline int check_window_state(MPID_Win *win_ptr, int *made_progress, int *cannot_issue)
+{
+    int i, mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_CHECK_WINDOW_STATE);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_CHECK_WINDOW_STATE);
+
+    (*made_progress) = 0;
+    (*cannot_issue) = 0;
+
+    if (win_ptr->states.access_state == MPIDI_RMA_NONE) {
+        (*cannot_issue) = 1;
+        goto fn_exit;
+    }
+    else if (win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED) {
+        /* FENCE: granted once the request behind fence_sync_req is
+         * complete; that request is then released and the handle reset. */
+        MPID_Request *fence_req_ptr = NULL;
+        MPID_Request_get_ptr(win_ptr->fence_sync_req, fence_req_ptr);
+        if (MPID_Request_is_complete(fence_req_ptr)) {
+            win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
+            MPID_Request_release(fence_req_ptr);
+            win_ptr->fence_sync_req = MPI_REQUEST_NULL;
+
+            (*made_progress) = 1;
+        }
+        else {
+            (*cannot_issue) = 1;
+            goto fn_exit;
+        }
+    }
+    else if (win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED) {
+        if (win_ptr->start_req == NULL) {
+            /* for MPI_MODE_NOCHECK and all targets on SHM,
+               we do not create PSCW requests on window. */
+            win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;
+
+            (*made_progress) = 1;
+        }
+        else {
+            /* Walk the start requests: completed ones are released and
+             * set to MPI_REQUEST_NULL (and skipped on later calls); the
+             * first incomplete one stops the check with cannot_issue. */
+            for (i = 0; i < win_ptr->start_grp_size; i++) {
+                MPID_Request *start_req_ptr = NULL;
+                if (win_ptr->start_req[i] == MPI_REQUEST_NULL)
+                    continue;
+                MPID_Request_get_ptr(win_ptr->start_req[i], start_req_ptr);
+                if (MPID_Request_is_complete(start_req_ptr)) {
+                    MPID_Request_release(start_req_ptr);
+                    win_ptr->start_req[i] = MPI_REQUEST_NULL;
+                }
+                else {
+                    (*cannot_issue) = 1;
+                    goto fn_exit;
+                }
+            }
+            /* Every start request has completed: grant PSCW access and
+             * free the request array. */
+            MPIU_Assert(i == win_ptr->start_grp_size);
+            win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;
+
+            (*made_progress) = 1;
+
+            MPIU_Free(win_ptr->start_req);
+            win_ptr->start_req = NULL;
+        }
+    }
+    else if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED) {
+        /* LOCK_ALL: granted once no lock is outstanding on the window. */
+        if (win_ptr->outstanding_locks == 0) {
+            win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_GRANTED;
+            (*made_progress) = 1;
+        }
+        else {
+            (*cannot_issue) = 1;
+            goto fn_exit;
+        }
+    }
+
+  fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_CHECK_WINDOW_STATE);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+  fn_fail:
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+
+
+#undef FUNCNAME
+#define FUNCNAME issue_ops_target
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *target,
+                                   int *made_progress)
+{
+    int rank = win_ptr->comm_ptr->rank;
+    MPIDI_RMA_Op_t *curr_op = NULL;
+    int first_op;
+    int mpi_errno = MPI_SUCCESS;
+
+    (*made_progress) = 0;
+
+    if (win_ptr->non_empty_slots == 0 || target == NULL)
+        goto fn_exit;
+
+    /* check per-target state */
+    if (win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
+        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
+        if (target->access_state == MPIDI_RMA_LOCK_CALLED) {
+            if (target->sync.sync_flag == MPIDI_RMA_SYNC_NONE ||
+                target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH_LOCAL ||
+                target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
+                if (target->pending_op_list != NULL &&
+                    target->pending_op_list->piggyback_lock_candidate) {
+                    /* Capable of piggybacking LOCK message with first operation. */
+                }
+                else {
+                    target->access_state = MPIDI_RMA_LOCK_ISSUED;
+                    target->outstanding_lock++;
+                    MPIU_Assert(target->outstanding_lock == 1);
+                    if (target->target_rank == rank) {
+                        mpi_errno = acquire_local_lock(win_ptr, target->lock_type);
+                        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                    }
+                    else {
+                        mpi_errno = send_lock_msg(target->target_rank,
+                                                  target->lock_type, win_ptr);
+                        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                    }
+                    (*made_progress) = 1;
+                    goto fn_exit;
+                }
+            }
+            else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
+                if (target->pending_op_list != NULL) {
+                    /* Capable of piggybacking LOCK message with first operation. */
+                    MPIU_Assert(target->pending_op_list->piggyback_lock_candidate);
+                }
+                else {
+                    /* No RMA operation has ever been posted to this target,
+                       finish issuing, no need to acquire the lock. Cleanup
+                       function will clean it up. */
+                    target->sync.outstanding_acks--;
+                    MPIU_Assert(target->sync.outstanding_acks == 0);
+                    (*made_progress) = 1;
+
+                    /* Unset target's sync_flag. */
+                    target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;
+                    goto fn_exit;
+                }
+            }
+        }
+        else if (target->access_state == MPIDI_RMA_LOCK_ISSUED) {
+            if (target->outstanding_lock == 0) {
+                target->access_state = MPIDI_RMA_LOCK_GRANTED;
+                (*made_progress) = 1;
+            }
+            else
+                goto fn_exit;
+        }
+    }
+
+    MPIU_Assert(win_ptr->states.access_state == MPIDI_RMA_FENCE_GRANTED ||
+                win_ptr->states.access_state == MPIDI_RMA_PSCW_GRANTED ||
+                win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
+                win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED ||
+                win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_GRANTED);
+
+     if (win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
+        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
+        MPIU_Assert(target->access_state == MPIDI_RMA_LOCK_CALLED ||
+                    target->access_state == MPIDI_RMA_LOCK_GRANTED);
+    }
+
+    /* Deal with when there is no operation in the list. */
+    if (target->pending_op_list == NULL) {
+
+        /* At this point, per-target state must be LOCK_GRANTED. */
+        if (win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
+            win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
+            MPIU_Assert(target->access_state == MPIDI_RMA_LOCK_GRANTED);
+        }
+
+        if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
+            if (target->target_rank == rank) {
+                target->sync.outstanding_acks--;
+                MPIU_Assert(target->sync.outstanding_acks == 0);
+            }
+            else {
+                mpi_errno = send_flush_msg(target->target_rank, win_ptr);
+                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            }
+        }
+        else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
+            if (target->target_rank == rank) {
+                mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
+                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                target->sync.outstanding_acks--;
+                MPIU_Assert(target->sync.outstanding_acks == 0);
+            }
+            else {
+                mpi_errno = send_unlock_msg(target->target_rank, win_ptr);
+                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            }
+        }
+        (*made_progress) = 1;
+        goto finish_issue;
+    }
+
+    /* Issue out operations in the list. */
+    first_op = 1;
+    curr_op = target->next_op_to_issue;
+    while (curr_op != NULL) {
+        MPIDI_CH3_Pkt_flags_t flags = MPIDI_CH3_PKT_FLAG_NONE;
+
+        if (target->access_state == MPIDI_RMA_LOCK_ISSUED)
+            goto fn_exit;
+
+        if (curr_op->next == NULL &&
+            target->sync.sync_flag == MPIDI_RMA_SYNC_NONE) {
+            /* skip last OP. */
+            goto finish_issue;
+        }
+
+        if (first_op) {
+            /* piggyback on first OP. */
+            if (target->access_state == MPIDI_RMA_LOCK_CALLED) {
+                MPIU_Assert(curr_op->piggyback_lock_candidate);
+                flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK;
+                target->access_state = MPIDI_RMA_LOCK_ISSUED;
+                target->outstanding_lock++;
+                MPIU_Assert(target->outstanding_lock == 1);
+            }
+            first_op = 0;
+        }
+
+        if (curr_op->next == NULL) {
+            /* piggyback on last OP. */
+            if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
+                flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
+                if (target->win_complete_flag)
+                    flags |= MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER;
+            }
+            else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
+                flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK;
+            }
+        }
+
+        target->next_op_to_issue = curr_op->next;
+
+        mpi_errno = issue_rma_op(curr_op, win_ptr, target, flags);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+
+        if (!curr_op->request) {
+            /* Sending is completed immediately. */
+            MPIDI_CH3I_RMA_Ops_free_elem(win_ptr, &(target->pending_op_list),
+                                         &(target->pending_op_list_tail), curr_op);
+        }
+        else {
+            /* Sending is not completed immediately. */
+            MPIDI_CH3I_RMA_Ops_unlink(&(target->pending_op_list),
+                                      &(target->pending_op_list_tail), curr_op);
+            if (curr_op->is_dt) {
+                MPIDI_CH3I_RMA_Ops_append(&(target->dt_op_list),
+                                          &(target->dt_op_list_tail), curr_op);
+            }
+            else if (curr_op->pkt.type == MPIDI_CH3_PKT_PUT ||
+                     curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
+                     curr_op->pkt.type == MPIDI_CH3_PKT_ACCUM_IMMED) {
+                MPIDI_CH3I_RMA_Ops_append(&(target->write_op_list),
+                                          &(target->write_op_list_tail), curr_op);
+            }
+            else {
+                MPIDI_CH3I_RMA_Ops_append(&(target->read_op_list),
+                                          &(target->read_op_list_tail), curr_op);
+            }
+        }
+
+        curr_op = target->next_op_to_issue;
+
+        (*made_progress) = 1;
+    }
+
+ finish_issue:
+    /* Unset target's sync_flag. */
+    target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+#undef FUNCNAME
+#define FUNCNAME issue_ops_win
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int issue_ops_win(MPID_Win *win_ptr, int *made_progress)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int start_slot, end_slot, i;
+    MPIDI_RMA_Target_t *target = NULL;
+
+    (*made_progress) = 0;
+
+    if (win_ptr->non_empty_slots == 0)
+        goto fn_exit;
+
+    MPIU_Assert(win_ptr->states.access_state == MPIDI_RMA_FENCE_GRANTED ||
+                win_ptr->states.access_state == MPIDI_RMA_PSCW_GRANTED ||
+                win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
+                win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED ||
+                win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_GRANTED);
+
+    start_slot = win_ptr->comm_ptr->rank % win_ptr->num_slots;
+    end_slot = start_slot + win_ptr->num_slots;
+
+    for (i = start_slot; i < end_slot; i++) {
+        int idx;
+        if (i >= win_ptr->num_slots) idx = i - win_ptr->num_slots;
+        else idx = i;
+
+        target = win_ptr->slots[idx].target_list;
+        while (target != NULL) {
+            int temp = 0;
+            mpi_errno = issue_ops_target(win_ptr, target, &temp);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+            if (temp)
+                (*made_progress) = 1;
+
+            target = target->next;
+        }
+    }
+
+ fn_exit:
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_RMA_Make_progress_target
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPIDI_CH3I_RMA_Make_progress_target(MPID_Win * win_ptr, int target_rank, int *made_progress)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int cannot_issue = 0, temp_progress = 0;
+    MPIDI_RMA_Slot_t *slot;
+    MPIDI_RMA_Target_t *target;
+
+    (*made_progress) = 0;
+
+    if (win_ptr->num_slots < win_ptr->comm_ptr->local_size) {
+        slot = &(win_ptr->slots[target_rank % win_ptr->num_slots]);
+        for (target = slot->target_list;
+             target && target->target_rank != target_rank; target = target->next);
+    }
+    else {
+        slot = &(win_ptr->slots[target_rank]);
+        target = slot->target_list;
+    }
+
+    if (target != NULL) {
+
+        /* check window state */
+        mpi_errno = check_window_state(win_ptr, &temp_progress, &cannot_issue);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        if (temp_progress)
+            (*made_progress) = 1;
+
+        if (cannot_issue)
+            goto fn_exit;
+
+        mpi_errno = issue_ops_target(win_ptr, target, &temp_progress);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+
+        if (temp_progress)
+            (*made_progress) = 1;
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_RMA_Make_progress_win
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPIDI_CH3I_RMA_Make_progress_win(MPID_Win * win_ptr, int *made_progress)
+{
+    int temp_progress = 0, cannot_issue = 0;
+    int mpi_errno = MPI_SUCCESS;
+
+    (*made_progress) = 0;
+
+    /* check window state */
+    mpi_errno = check_window_state(win_ptr, &temp_progress, &cannot_issue);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+    if (temp_progress)
+        (*made_progress) = 1;
+
+    if (cannot_issue)
+        goto fn_exit;
+
+    mpi_errno = issue_ops_win(win_ptr, &temp_progress);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
+
+    if (temp_progress)
+        (*made_progress) = 1;
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_RMA_Make_progress_global
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPIDI_CH3I_RMA_Make_progress_global(int *made_progress)
+{
+    MPIDI_RMA_Win_list_t *win_elem = MPIDI_RMA_Win_list;
+    int tmp = 0, cannot_issue = 0;
+    int mpi_errno = MPI_SUCCESS;
+
+    (*made_progress) = 0;
+
+    for (win_elem = MPIDI_RMA_Win_list; win_elem; win_elem = win_elem->next) {
+        if (win_elem->win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
+            win_elem->win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED ||
+            win_elem->win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
+            win_elem->win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED ||
+            win_elem->win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED ||
+            win_elem->win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_GRANTED) {
+
+            /* check window state */
+            mpi_errno = check_window_state(win_elem->win_ptr, &tmp, &cannot_issue);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+            if (tmp)
+                (*made_progress) = 1;
+
+            if (cannot_issue)
+                continue;
+
+            mpi_errno = issue_ops_win(win_elem->win_ptr, &tmp);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+            if (tmp)
+                (*made_progress) = 1;
+        }
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
 /* create_datatype() creates a new struct datatype for the dtype_info
    and the dataloop of the target datatype together with the user data */
 #undef FUNCNAME

http://git.mpich.org/mpich.git/commitdiff/ebee0b71021afeef13504a2ce38f76e2a555344b

commit ebee0b71021afeef13504a2ce38f76e2a555344b
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Oct 27 05:38:17 2014 -0500

    Add nonblocking ops / targets GC functions.
    
    Here we implement garbage collection functions for
    both operations and targets. There are two levels of GC
    functions: per-target and per-window. Per-target functions
    are used in single passive lock ending calls: Win_unlock;
    per-window functions are used in window-wide ending
    calls: Win_fence, Win_complete, Win_unlock_all.
    
    Garbage collection functions for RMA ops go over all
    incomplete operation lists in the target element and free
    completed operations. They also return flags indicating
    local completion and remote completion.
    
    Garbage collection functions for RMA targets go over
    all targets and free those targets that have completed empty
    operation lists.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index ed904f3..f723750 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -299,6 +299,214 @@ static inline int MPIDI_CH3I_Win_target_dequeue_and_free(MPID_Win * win_ptr,
     goto fn_exit;
 }
 
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_RMA_Cleanup_ops_target
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_RMA_Cleanup_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *target,
+                                                    int *local_completed, int *remote_completed)
+{
+    MPIDI_RMA_Op_t *curr_op = NULL;
+    MPIDI_RMA_Op_t **op_list = NULL, **op_list_tail = NULL;
+    int read_flag = 0, write_flag = 0;
+    int mpi_errno = MPI_SUCCESS;
+
+    (*local_completed) = 0;
+    (*remote_completed) = 0;
+
+    if (win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
+        win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED ||
+        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED)
+        goto fn_exit;
+
+    if (target == NULL)
+        goto fn_exit;
+
+    if (target->access_state == MPIDI_RMA_LOCK_CALLED ||
+        target->access_state == MPIDI_RMA_LOCK_ISSUED)
+        goto fn_exit;
+
+    if (target->pending_op_list == NULL &&
+        target->read_op_list == NULL &&
+        target->write_op_list == NULL &&
+        target->dt_op_list == NULL)
+        goto cleanup_target;
+
+    if (target->read_op_list != NULL) {
+        op_list = &(target->read_op_list);
+        op_list_tail = &(target->read_op_list_tail);
+        read_flag = 1;
+    }
+    else if (target->write_op_list != NULL) {
+        op_list = &(target->write_op_list);
+        op_list_tail = &(target->write_op_list_tail);
+        write_flag = 1;
+    }
+    else if (target->dt_op_list != NULL) {
+        op_list = &(target->dt_op_list);
+        op_list_tail = &(target->dt_op_list_tail);
+    }
+    else {
+        /* only pending op list is not NULL, nothing we can do here. */
+        goto fn_exit;
+    }
+
+    curr_op = *op_list;
+    while (curr_op != NULL) {
+        if (MPID_Request_is_complete(curr_op->request)) {
+            /* If there's an error, return it */
+            mpi_errno = curr_op->request->status.MPI_ERROR;
+            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
+
+            /* No errors, free the request */
+            MPID_Request_release(curr_op->request);
+
+            /* dequeue the operation and free it */
+            MPL_LL_DELETE(*op_list, *op_list_tail, curr_op);
+            MPIDI_CH3I_Win_op_free(win_ptr, curr_op);
+            if (*op_list == NULL) {
+                if (read_flag == 1) {
+                    read_flag = 0;
+                    if (target->write_op_list != NULL) {
+                        op_list = &(target->write_op_list);
+                        op_list_tail = &(target->write_op_list_tail);
+                        write_flag = 1;
+                    }
+                    else if (target->dt_op_list != NULL) {
+                        op_list = &(target->dt_op_list);
+                        op_list_tail = &(target->dt_op_list_tail);
+                    }
+                    else
+                        break;
+                }
+                else if (write_flag == 1) {
+                    write_flag = 0;
+                    if (target->dt_op_list != NULL) {
+                        op_list = &(target->dt_op_list);
+                        op_list_tail = &(target->dt_op_list_tail);
+                    }
+                    else
+                        break;
+                }
+            }
+            /* next op */
+            curr_op = *op_list;
+        }
+        else
+            break;
+    }
+
+  cleanup_target:
+    if (target->pending_op_list == NULL &&
+        target->read_op_list == NULL && target->write_op_list == NULL &&
+        target->dt_op_list == NULL) {
+
+        (*local_completed) = 1;
+
+        /* for the conditions that need to be satisfied before we free the
+         * target, see the MPIDI_RMA_Target definition in
+         * mpid_rma_types.h */
+        if (target->sync.sync_flag == MPIDI_RMA_SYNC_NONE &&
+            target->sync.outstanding_acks == 0 &&
+            target->sync.have_remote_incomplete_ops == 0) {
+            (*remote_completed) = 1;
+        }
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_RMA_Cleanup_ops_win
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_RMA_Cleanup_ops_win(MPID_Win *win_ptr,
+                                                 int *local_completed, int *remote_completed)
+{
+    MPIDI_RMA_Target_t *target = NULL;
+    int num_targets = 0, local_completed_targets = 0, remote_completed_targets = 0;
+    int i, mpi_errno = MPI_SUCCESS;
+
+    (*local_completed) = 0;
+    (*remote_completed) = 0;
+
+    for (i = 0; i < win_ptr->num_slots; i++) {
+        for (target = win_ptr->slots[i].target_list; target; ) {
+            int local = 0, remote = 0;
+
+            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, target, &local, &remote);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+            num_targets++;
+            local_completed_targets += local;
+            remote_completed_targets += remote;
+
+            target = target->next;
+        }
+    }
+
+    if (num_targets == local_completed_targets)
+        (*local_completed) = 1;
+    if (num_targets == remote_completed_targets)
+        (*remote_completed) = 1;
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_RMA_Cleanup_single_target
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_RMA_Cleanup_single_target(MPID_Win *win_ptr, MPIDI_RMA_Target_t *target)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    /* dequeue the target and free it. */
+    mpi_errno = MPIDI_CH3I_Win_target_dequeue_and_free(win_ptr, target);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ fn_exit:
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_RMA_Cleanup_targets_win
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_RMA_Cleanup_targets_win(MPID_Win *win_ptr)
+{
+    MPIDI_RMA_Target_t *target = NULL, *next_target = NULL;
+    int i, mpi_errno = MPI_SUCCESS;
+
+    for (i = 0; i < win_ptr->num_slots; i++) {
+        for (target = win_ptr->slots[i].target_list; target; ) {
+            next_target = target->next;
+            mpi_errno = MPIDI_CH3I_RMA_Cleanup_single_target(win_ptr, target);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            target = next_target;
+        }
+    }
+
+    MPIU_Assert(win_ptr->non_empty_slots == 0);
+
+ fn_exit:
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
 /* Return nonzero if the RMA operations list is empty.
  */
 #undef FUNCNAME

http://git.mpich.org/mpich.git/commitdiff/f91d46333ed6a414535780aeaee2075cf83c7ab3

commit f91d46333ed6a414535780aeaee2075cf83c7ab3
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Oct 27 05:58:15 2014 -0500

    Keep track of no. of non-empty slots on window.
    
    Keep track of the number of non-empty slots on the window so
    that when the number is 0, there are no operations that need
    to be processed and we can ignore that window.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index bc74770..ed904f3 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -167,6 +167,9 @@ static inline int MPIDI_CH3I_Win_create_target(MPID_Win * win_ptr, int target_ra
 
     t->target_rank = target_rank;
 
+    if (slot->target_list == NULL)
+        win_ptr->non_empty_slots++;
+
     /* Enqueue target into target list. */
     MPL_LL_APPEND(slot->target_list, slot->target_list_tail, t);
 
@@ -287,6 +290,9 @@ static inline int MPIDI_CH3I_Win_target_dequeue_and_free(MPID_Win * win_ptr,
     mpi_errno = MPIDI_CH3I_Win_target_free(win_ptr, e);
     if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
 
+    if (slot->target_list == NULL)
+        win_ptr->non_empty_slots--;
+
  fn_exit:
     return mpi_errno;
  fn_fail:
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index 1da58fd..c2ab3a0 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -367,6 +367,7 @@ struct MPIDI_Win_target_state {
         enum MPIDI_RMA_states access_state;                              \
         enum MPIDI_RMA_states exposure_state;                            \
     } states;                                                            \
+    int non_empty_slots;                                                 \
 
 #ifdef MPIDI_CH3_WIN_DECL
 #define MPID_DEV_WIN_DECL \
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index d683d1a..c950116 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -331,6 +331,7 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->shm_allocated = FALSE;
     (*win_ptr)->states.access_state = MPIDI_RMA_NONE;
     (*win_ptr)->states.exposure_state = MPIDI_RMA_NONE;
+    (*win_ptr)->non_empty_slots = 0;
 
     /* Initialize the passive target lock state */
     MPIU_CHKPMEM_MALLOC((*win_ptr)->targets, struct MPIDI_Win_target_state *,

http://git.mpich.org/mpich.git/commitdiff/f076f3febb5890cda36d2f1f06264c14495f4c4b

commit f076f3febb5890cda36d2f1f06264c14495f4c4b
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sat Nov 1 14:52:29 2014 -0500

    Add new RMA states on window / target and modify state checking.
    
    We define new states to indicate the current situation of
    RMA synchronization. The states contain both ACCESS states
    and EXPOSURE states, and specify whether the synchronization
    is initialized (_CALLED), ongoing (_ISSUED), or completed
    (_GRANTED). For a single lock in passive target mode, we use
    a per-target state, whereas the window state is set to PER_TARGET.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 117d2ce..bc74770 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -103,6 +103,7 @@ static inline MPIDI_RMA_Target_t *MPIDI_CH3I_Win_target_alloc(MPID_Win * win_ptr
     e->next_op_to_issue = NULL;
 
     e->target_rank = -1;
+    e->access_state = MPIDI_RMA_NONE;
     e->lock_type = MPIDI_RMA_LOCK_TYPE_NONE;
     e->lock_mode = 0;
     e->outstanding_lock = 0;
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index 9425152..2158320 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -91,6 +91,7 @@ typedef struct MPIDI_RMA_Target {
     struct MPIDI_RMA_Op *next_op_to_issue;
     struct MPIDI_RMA_Target *next;
     int target_rank;
+    enum MPIDI_RMA_states access_state;
     enum MPIDI_RMA_Lock_type lock_type; /* SHARED, EXCLUSIVE */
     int lock_mode;              /* e.g., MODE_NO_CHECK */
     int outstanding_lock;
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index 3826592..1da58fd 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -237,6 +237,28 @@ enum MPIDI_RMA_sync_types {
 /* We start with an arbitrarily chosen number (42), to help with
  * debugging when a packet type is not initialized or wrongly
  * initialized. */
+enum MPIDI_RMA_states {
+    /* window-wide states */
+    MPIDI_RMA_NONE = 42,
+    MPIDI_RMA_FENCE_ISSUED,           /* access / exposure */
+    MPIDI_RMA_FENCE_GRANTED,          /* access / exposure */
+    MPIDI_RMA_PSCW_ISSUED,            /* access */
+    MPIDI_RMA_PSCW_GRANTED,           /* access */
+    MPIDI_RMA_PSCW_EXPO,              /* exposure */
+    MPIDI_RMA_PER_TARGET,             /* access */
+    MPIDI_RMA_LOCK_ALL_CALLED,        /* access */
+    MPIDI_RMA_LOCK_ALL_ISSUED,        /* access */
+    MPIDI_RMA_LOCK_ALL_GRANTED,       /* access */
+
+    /* target-specific states */
+    MPIDI_RMA_LOCK_CALLED,            /* access */
+    MPIDI_RMA_LOCK_ISSUED,            /* access */
+    MPIDI_RMA_LOCK_GRANTED,           /* access */
+};
+
+/* We start with an arbitrarily chosen number (42), to help with
+ * debugging when a packet type is not initialized or wrongly
+ * initialized. */
 enum MPIDI_CH3_Lock_states {
     MPIDI_CH3_WIN_LOCK_NONE = 42,
     MPIDI_CH3_WIN_LOCK_CALLED,
@@ -341,6 +363,10 @@ struct MPIDI_Win_target_state {
     struct MPIDI_RMA_Target *target_pool_tail; /* tail pointer to pool of targets */\
     struct MPIDI_RMA_Slot *slots;                                        \
     int num_slots;                                                       \
+    struct {                                                             \
+        enum MPIDI_RMA_states access_state;                              \
+        enum MPIDI_RMA_states exposure_state;                            \
+    } states;                                                            \
 
 #ifdef MPIDI_CH3_WIN_DECL
 #define MPID_DEV_WIN_DECL \
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index d4e54e6..afcb39a 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -29,17 +29,13 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_PUT);
 
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
+                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+
     if (target_rank == MPI_PROC_NULL) {
         goto fn_exit;
     }
 
-    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
-        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
-    }
-
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
-                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
-
     MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);
 
     if (data_sz == 0) {
@@ -143,17 +139,13 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET);
 
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
+                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+
     if (target_rank == MPI_PROC_NULL) {
         goto fn_exit;
     }
 
-    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
-        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
-    }
-
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
-                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
-
     MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);
 
     if (data_sz == 0) {
@@ -257,17 +249,13 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE);
 
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
+                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+
     if (target_rank == MPI_PROC_NULL) {
         goto fn_exit;
     }
 
-    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
-        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
-    }
-
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
-                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
-
     MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);
 
     if (data_sz == 0) {
@@ -409,17 +397,13 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET_ACCUMULATE);
 
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
+                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+
     if (target_rank == MPI_PROC_NULL) {
         goto fn_exit;
     }
 
-    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
-        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
-    }
-
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
-                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
-
     MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb);
 
     if (data_sz == 0) {
@@ -548,19 +532,16 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
     MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
 
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_COMPARE_AND_SWAP);
+
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_COMPARE_AND_SWAP);
 
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
+                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+
     if (target_rank == MPI_PROC_NULL) {
         goto fn_exit;
     }
 
-    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
-        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
-    }
-
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
-                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
-
     rank = win_ptr->comm_ptr->rank;
 
     if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
@@ -644,19 +625,16 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
     MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
 
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_FETCH_AND_OP);
+
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_FETCH_AND_OP);
 
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
+                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+
     if (target_rank == MPI_PROC_NULL) {
         goto fn_exit;
     }
 
-    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
-        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
-    }
-
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
-                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
-
     rank = win_ptr->comm_ptr->rank;
 
     if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
diff --git a/src/mpid/ch3/src/ch3u_rma_reqops.c b/src/mpid/ch3/src/ch3u_rma_reqops.c
index 13a51f0..b6942a6 100644
--- a/src/mpid/ch3/src/ch3u_rma_reqops.c
+++ b/src/mpid/ch3/src/ch3u_rma_reqops.c
@@ -34,8 +34,10 @@ static int MPIDI_CH3I_Rma_req_poll(void *state, MPI_Status * status)
      * is still active first; the user could complete the request after calling
      * unlock. */
     /* FIXME: We need per-operation completion to make this more efficient. */
-    if (req_state->win_ptr->targets[req_state->target_rank].remote_lock_state
-        != MPIDI_CH3_WIN_LOCK_NONE) {
+    if (req_state->win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
+        req_state->win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED ||
+        req_state->win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED ||
+        req_state->win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_GRANTED) {
         mpi_errno = req_state->win_ptr->RMAFns.Win_flush(req_state->target_rank,
                                                          req_state->win_ptr);
     }
@@ -156,9 +158,12 @@ int MPIDI_Rput(const void *origin_addr, int origin_count,
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_RPUT);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_LOCK &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_LOCK_ALL &&
-                        target_rank != MPI_PROC_NULL, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+    /* request-based RMA operations are only valid within a passive epoch */
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_PER_TARGET &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_CALLED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_ISSUED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_GRANTED,
+                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
     MPIU_CHKPMEM_MALLOC(req_state, MPIDI_CH3I_Rma_req_state_t *,
                         sizeof(MPIDI_CH3I_Rma_req_state_t), mpi_errno, "req-based RMA state");
@@ -243,9 +248,12 @@ int MPIDI_Rget(void *origin_addr, int origin_count,
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_RGET);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_LOCK &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_LOCK_ALL &&
-                        target_rank != MPI_PROC_NULL, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+    /* request-based RMA operations are only valid within a passive epoch */
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_PER_TARGET &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_CALLED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_ISSUED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_GRANTED,
+                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
     MPIU_CHKPMEM_MALLOC(req_state, MPIDI_CH3I_Rma_req_state_t *,
                         sizeof(MPIDI_CH3I_Rma_req_state_t), mpi_errno, "req-based RMA state");
@@ -330,9 +338,12 @@ int MPIDI_Raccumulate(const void *origin_addr, int origin_count,
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_RACCUMULATE);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_LOCK &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_LOCK_ALL &&
-                        target_rank != MPI_PROC_NULL, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+    /* request-based RMA operations are only valid within a passive epoch */
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_PER_TARGET &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_CALLED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_ISSUED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_GRANTED,
+                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
     MPIU_CHKPMEM_MALLOC(req_state, MPIDI_CH3I_Rma_req_state_t *,
                         sizeof(MPIDI_CH3I_Rma_req_state_t), mpi_errno, "req-based RMA state");
@@ -418,9 +429,12 @@ int MPIDI_Rget_accumulate(const void *origin_addr, int origin_count,
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_RGET_ACCUMULATE);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_LOCK &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_LOCK_ALL &&
-                        target_rank != MPI_PROC_NULL, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+    /* request-based RMA operations are only valid within a passive epoch */
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_PER_TARGET &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_CALLED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_ISSUED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_GRANTED,
+                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
     MPIU_CHKPMEM_MALLOC(req_state, MPIDI_CH3I_Rma_req_state_t *,
                         sizeof(MPIDI_CH3I_Rma_req_state_t), mpi_errno, "req-based RMA state");
diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index b63e956..13c88ff 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -2102,8 +2102,10 @@ int MPIDI_Win_sync(MPID_Win * win_ptr)
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_SYNC);
 
-    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state != MPIDI_EPOCH_LOCK &&
-                        win_ptr->epoch_state != MPIDI_EPOCH_LOCK_ALL,
+    MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_PER_TARGET &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_CALLED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_ISSUED &&
+                        win_ptr->states.access_state != MPIDI_RMA_LOCK_ALL_GRANTED,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
     OPA_read_write_barrier();
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index 512757a..d683d1a 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -329,6 +329,8 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->at_rma_ops_list = NULL;
     (*win_ptr)->at_rma_ops_list_tail = NULL;
     (*win_ptr)->shm_allocated = FALSE;
+    (*win_ptr)->states.access_state = MPIDI_RMA_NONE;
+    (*win_ptr)->states.exposure_state = MPIDI_RMA_NONE;
 
     /* Initialize the passive target lock state */
     MPIU_CHKPMEM_MALLOC((*win_ptr)->targets, struct MPIDI_Win_target_state *,
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index e8743a6..b2d851d 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -138,7 +138,8 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FREE);
 
-    MPIU_ERR_CHKANDJUMP((*win_ptr)->epoch_state != MPIDI_EPOCH_NONE,
+    MPIU_ERR_CHKANDJUMP((*win_ptr)->states.access_state != MPIDI_RMA_NONE ||
+                        (*win_ptr)->states.exposure_state != MPIDI_RMA_NONE,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
     if (!(*win_ptr)->shm_allocated) {

http://git.mpich.org/mpich.git/commitdiff/7eac974fa6cd23ddf63457b0b964b3724fefa1f2

commit 7eac974fa6cd23ddf63457b0b964b3724fefa1f2
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Oct 27 00:58:40 2014 -0500

    Add a flag in op struct to indicate derived datatype.
    
    Add flag is_dt in op structure which is set when any
    buffers involved in RMA operations contain derived
    datatype data. It is convenient for us to enqueue
    issued but not completed operations to the DT specific
    list.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 87e64bf..117d2ce 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -39,6 +39,7 @@ static inline MPIDI_RMA_Op_t *MPIDI_CH3I_Win_op_alloc(MPID_Win * win_ptr)
 
     e->dataloop = NULL;
     e->request = NULL;
+    e->is_dt = 0;
 
     return e;
 }
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index c378037..9425152 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -80,6 +80,7 @@ typedef struct MPIDI_RMA_Op {
 
     MPIDI_CH3_Pkt_t pkt;
     MPIDI_RMA_Pool_type_t pool_type;
+    int is_dt;
 } MPIDI_RMA_Op_t;
 
 typedef struct MPIDI_RMA_Target {
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index e862b29..d4e54e6 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -104,10 +104,12 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
         if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
             MPID_Datatype_get_ptr(origin_datatype, dtp);
             MPID_Datatype_add_ref(dtp);
+            new_ptr->is_dt = 1;
         }
         if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
             MPID_Datatype_get_ptr(target_datatype, dtp);
             MPID_Datatype_add_ref(dtp);
+            new_ptr->is_dt = 1;
         }
     }
 
@@ -216,10 +218,12 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
         if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
             MPID_Datatype_get_ptr(origin_datatype, dtp);
             MPID_Datatype_add_ref(dtp);
+            new_ptr->is_dt = 1;
         }
         if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
             MPID_Datatype_get_ptr(target_datatype, dtp);
             MPID_Datatype_add_ref(dtp);
+            new_ptr->is_dt = 1;
         }
     }
 
@@ -364,10 +368,12 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
         if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
             MPID_Datatype_get_ptr(origin_datatype, dtp);
             MPID_Datatype_add_ref(dtp);
+            new_ptr->is_dt = 1;
         }
         if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
             MPID_Datatype_get_ptr(target_datatype, dtp);
             MPID_Datatype_add_ref(dtp);
+            new_ptr->is_dt = 1;
         }
     }
 
@@ -504,14 +510,17 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
         if (op != MPI_NO_OP && !MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
             MPID_Datatype_get_ptr(origin_datatype, dtp);
             MPID_Datatype_add_ref(dtp);
+            new_ptr->is_dt = 1;
         }
         if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
             MPID_Datatype_get_ptr(result_datatype, dtp);
             MPID_Datatype_add_ref(dtp);
+            new_ptr->is_dt = 1;
         }
         if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
             MPID_Datatype_get_ptr(target_datatype, dtp);
             MPID_Datatype_add_ref(dtp);
+            new_ptr->is_dt = 1;
         }
     }
 

http://git.mpich.org/mpich.git/commitdiff/1d873639602a045120b35a960d9068ecc5b4647b

commit 1d873639602a045120b35a960d9068ecc5b4647b
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Nov 2 22:12:43 2014 -0600

    Add global window list.
    
    Add a list of created windows on this process,
    so that we can make progress on all windows in
    the progress engine.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index 7f18385..c378037 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -124,6 +124,13 @@ typedef struct MPIDI_RMA_Slot {
     struct MPIDI_RMA_Target *target_list_tail;
 } MPIDI_RMA_Slot_t;
 
+typedef struct MPIDI_RMA_Win_list {
+    MPID_Win *win_ptr;                  /* window this list element refers to */
+    struct MPIDI_RMA_Win_list *next;    /* next element in the window list */
+} MPIDI_RMA_Win_list_t;
+
+extern MPIDI_RMA_Win_list_t *MPIDI_RMA_Win_list, *MPIDI_RMA_Win_list_tail;
+
 typedef struct MPIDI_PT_single_op {
     MPIDI_CH3_Pkt_type_t type;  /* put, get, or accum. */
     void *addr;
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index ba39bdc..512757a 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -29,6 +29,8 @@ cvars:
 
 MPIU_THREADSAFE_INIT_DECL(initRMAoptions);
 
+MPIDI_RMA_Win_list_t *MPIDI_RMA_Win_list = NULL, *MPIDI_RMA_Win_list_tail = NULL;
+
 static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
                     MPID_Comm * comm_ptr, MPID_Win ** win_ptr);
 
@@ -277,7 +279,8 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     int i;
     MPID_Comm *win_comm_ptr;
     int win_target_pool_size;
-    MPIU_CHKPMEM_DECL(4);
+    MPIDI_RMA_Win_list_t *win_elem;
+    MPIU_CHKPMEM_DECL(5);
     MPIDI_STATE_DECL(MPID_STATE_WIN_INIT);
 
     MPIDI_FUNC_ENTER(MPID_STATE_WIN_INIT);
@@ -378,6 +381,12 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
 
     MPID_WIN_FTABLE_SET_DEFAULTS(win_ptr);
 
+    /* enqueue window into the global list */
+    MPIU_CHKPMEM_MALLOC(win_elem, MPIDI_RMA_Win_list_t *, sizeof(MPIDI_RMA_Win_list_t), mpi_errno,
+                        "Window list element");
+    win_elem->win_ptr = *win_ptr;
+    MPL_LL_APPEND(MPIDI_RMA_Win_list, MPIDI_RMA_Win_list_tail, win_elem);
+
   fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_WIN_INIT);
     return mpi_errno;
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index 29a90ff..e8743a6 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -133,6 +133,7 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
     int in_use;
     MPID_Comm *comm_ptr;
     int errflag = FALSE;
+    MPIDI_RMA_Win_list_t *win_elem;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FREE);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FREE);
@@ -148,6 +149,13 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
             MPIU_ERR_POP(mpi_errno);
     }
 
+    /* dequeue window from the global list */
+    for (win_elem = MPIDI_RMA_Win_list; win_elem && win_elem->win_ptr != *win_ptr;
+         win_elem = win_elem->next);
+    MPIU_ERR_CHKANDJUMP(win_elem == NULL, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
+    MPL_LL_DELETE(MPIDI_RMA_Win_list, MPIDI_RMA_Win_list_tail, win_elem);
+    MPIU_Free(win_elem);
+
     comm_ptr = (*win_ptr)->comm_ptr;
     mpi_errno = MPIR_Comm_free_impl(comm_ptr);
     if (mpi_errno)

http://git.mpich.org/mpich.git/commitdiff/079a516bf9233f5c8b560a99479e5bfb73f64a52

commit 079a516bf9233f5c8b560a99479e5bfb73f64a52
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Nov 2 22:08:37 2014 -0600

    Add routine to enqueue op to RMA slots.
    
    Given an RMA op, find the correct slot and target, and
    enqueue the op to the pending op list in that target object.
    If the target does not exist, create one in that slot.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 12853c5..87e64bf 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -210,6 +210,57 @@ static inline int MPIDI_CH3I_Win_find_target(MPID_Win * win_ptr, int target_rank
     goto fn_exit;
 }
 
+/* MPIDI_CH3I_Win_enqueue_op(): append op to the pending list of the target
+ * element for op->target_rank, creating that element if none exists yet.
+ * This routine is only called from the operation routines. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Win_enqueue_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_Win_enqueue_op(MPID_Win * win_ptr,
+                                            MPIDI_RMA_Op_t * op)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_RMA_Target_t *target = NULL;
+
+    mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, op->target_rank, &target);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (target == NULL) {
+        mpi_errno = MPIDI_CH3I_Win_create_target(win_ptr, op->target_rank, &target);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        if (win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
+            win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_GRANTED) {
+            /* If global state is MPIDI_RMA_PER_TARGET, this cannot be
+             * the first time this target is created (the first time
+             * is in Win_lock). We are recreating it and set the access
+             * state to LOCK_GRANTED because the lock should already have
+             * been granted before the previous element was freed. */
+            /* If global state is MPIDI_RMA_LOCK_ALL_GRANTED, all locks
+             * should already be granted, so the access state for this
+             * target is also set to MPIDI_RMA_LOCK_GRANTED. */
+            target->access_state = MPIDI_RMA_LOCK_GRANTED;
+        }
+        else if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
+            /* If global state is MPIDI_RMA_LOCK_ALL_CALLED, this must be
+               the first time this target is created; mark it as
+               MPIDI_RMA_LOCK_CALLED with a shared lock type. */
+            target->access_state = MPIDI_RMA_LOCK_CALLED;
+            target->lock_type = MPI_LOCK_SHARED;
+        }
+    }
+
+    /* Append op to the pending list; if no op is waiting to be issued, it becomes the next. */
+    MPL_LL_APPEND(target->pending_op_list, target->pending_op_list_tail, op);
+    if (target->next_op_to_issue == NULL)
+        target->next_op_to_issue = op;
+
+ fn_exit:
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
 
 /* MPIDI_CH3I_Win_target_dequeue_and_free(): dequeue and free
  * the target in RMA slots. */
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index a9ee171..e862b29 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -72,16 +72,12 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
             MPIU_ERR_POP(mpi_errno);
     }
     else {
-        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
-        MPIDI_RMA_Ops_list_t *ops_list_tail = MPIDI_CH3I_RMA_Get_ops_list_tail(win_ptr, target_rank);
         MPIDI_RMA_Op_t *new_ptr = NULL;
         MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
 
         /* queue it up */
-        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(win_ptr, ops_list, ops_list_tail, &new_ptr);
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
-        }
+        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
+        MPIU_ERR_CHKANDJUMP(new_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
 
         put_pkt = &(new_ptr->pkt.put);
         MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
@@ -99,6 +95,10 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
         new_ptr->origin_datatype = origin_datatype;
         new_ptr->target_rank = target_rank;
 
+        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+
         /* if source or target datatypes are derived, increment their
          * reference counts */
         if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
@@ -184,16 +184,12 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
             MPIU_ERR_POP(mpi_errno);
     }
     else {
-        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
-        MPIDI_RMA_Ops_list_t *ops_list_tail = MPIDI_CH3I_RMA_Get_ops_list_tail(win_ptr, target_rank);
         MPIDI_RMA_Op_t *new_ptr = NULL;
         MPIDI_CH3_Pkt_get_t *get_pkt = NULL;
 
         /* queue it up */
-        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(win_ptr, ops_list, ops_list_tail, &new_ptr);
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
-        }
+        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
+        MPIU_ERR_CHKANDJUMP(new_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
 
         get_pkt = &(new_ptr->pkt.get);
         MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
@@ -211,6 +207,10 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
         new_ptr->origin_datatype = origin_datatype;
         new_ptr->target_rank = target_rank;
 
+        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+
         /* if source or target datatypes are derived, increment their
          * reference counts */
         if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
@@ -297,16 +297,12 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
             MPIU_ERR_POP(mpi_errno);
     }
     else {
-        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
-        MPIDI_RMA_Ops_list_t *ops_list_tail = MPIDI_CH3I_RMA_Get_ops_list_tail(win_ptr, target_rank);
         MPIDI_RMA_Op_t *new_ptr = NULL;
         MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
 
         /* queue it up */
-        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(win_ptr, ops_list, ops_list_tail, &new_ptr);
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
-        }
+        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
+        MPIU_ERR_CHKANDJUMP(new_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
 
         /* If predefined and contiguous, use a simplified element */
         if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
@@ -333,6 +329,11 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                 new_ptr->origin_count = origin_count;
                 new_ptr->origin_datatype = origin_datatype;
                 new_ptr->target_rank = target_rank;
+
+                mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
+                if (mpi_errno)
+                    MPIU_ERR_POP(mpi_errno);
+
                 goto fn_exit;
             }
         }
@@ -354,6 +355,10 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
         new_ptr->origin_datatype = origin_datatype;
         new_ptr->target_rank = target_rank;
 
+        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+
         /* if source or target datatypes are derived, increment their
          * reference counts */
         if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
@@ -443,15 +448,11 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
             MPIU_ERR_POP(mpi_errno);
     }
     else {
-        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
-        MPIDI_RMA_Ops_list_t *ops_list_tail = MPIDI_CH3I_RMA_Get_ops_list_tail(win_ptr, target_rank);
         MPIDI_RMA_Op_t *new_ptr = NULL;
 
         /* Append the operation to the window's RMA ops queue */
-        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(win_ptr, ops_list, ops_list_tail, &new_ptr);
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
-        }
+        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
+        MPIU_ERR_CHKANDJUMP(new_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
 
         /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */
 
@@ -494,6 +495,10 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
             new_ptr->target_rank = target_rank;
         }
 
+        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+
         /* if source or target datatypes are derived, increment their
          * reference counts */
         if (op != MPI_NO_OP && !MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
@@ -578,16 +583,12 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
             MPIU_ERR_POP(mpi_errno);
     }
     else {
-        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
-        MPIDI_RMA_Ops_list_t *ops_list_tail = MPIDI_CH3I_RMA_Get_ops_list_tail(win_ptr, target_rank);
         MPIDI_RMA_Op_t *new_ptr = NULL;
         MPIDI_CH3_Pkt_cas_t *cas_pkt = NULL;
 
         /* Append this operation to the RMA ops queue */
-        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(win_ptr, ops_list, ops_list_tail, &new_ptr);
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
-        }
+        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
+        MPIU_ERR_CHKANDJUMP(new_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
 
         cas_pkt = &(new_ptr->pkt.cas);
         MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS);
@@ -605,6 +606,10 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
         new_ptr->compare_addr = (void *) compare_addr;
         new_ptr->compare_datatype = datatype;
         new_ptr->target_rank = target_rank;
+
+        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
     }
 
   fn_exit:
@@ -673,16 +678,12 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             MPIU_ERR_POP(mpi_errno);
     }
     else {
-        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
-        MPIDI_RMA_Ops_list_t *ops_list_tail = MPIDI_CH3I_RMA_Get_ops_list_tail(win_ptr, target_rank);
         MPIDI_RMA_Op_t *new_ptr = NULL;
         MPIDI_CH3_Pkt_fop_t *fop_pkt = NULL;
 
         /* Append this operation to the RMA ops queue */
-        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(win_ptr, ops_list, ops_list_tail, &new_ptr);
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
-        }
+        new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
+        MPIU_ERR_CHKANDJUMP(new_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
 
         fop_pkt = &(new_ptr->pkt.fop);
         MPIDI_Pkt_init(fop_pkt, MPIDI_CH3_PKT_FOP);
@@ -699,6 +700,10 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
         new_ptr->result_addr = result_addr;
         new_ptr->result_datatype = datatype;
         new_ptr->target_rank = target_rank;
+
+        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, new_ptr);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
     }
 
   fn_exit:

http://git.mpich.org/mpich.git/commitdiff/0f596c489a435d175af333f024195ca661b9b2d1

commit 0f596c489a435d175af333f024195ca661b9b2d1
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Oct 27 00:17:47 2014 -0500

    Add RMA slots and related APIs.
    
    We allocate a fixed size of targets array on window
    during window creation. The size can be configured
    by the user via CVAR. Each slot entry contains a list
    of target elements.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 7a3cba5..12853c5 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -142,6 +142,104 @@ static inline int MPIDI_CH3I_Win_target_free(MPID_Win * win_ptr, MPIDI_RMA_Targe
     return mpi_errno;
 }
 
+/* MPIDI_CH3I_Win_create_target(): allocate a target element for target_rank,
+ * append it to that rank's slot list and return it in *e ("**nomem" on failure). */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Win_create_target
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_Win_create_target(MPID_Win * win_ptr, int target_rank,
+                                               MPIDI_RMA_Target_t **e)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_RMA_Slot_t *slot = NULL;
+    MPIDI_RMA_Target_t *t = NULL;
+
+    if (win_ptr->num_slots < win_ptr->comm_ptr->local_size)
+        slot = &(win_ptr->slots[target_rank % win_ptr->num_slots]);
+    else
+        slot = &(win_ptr->slots[target_rank]);
+
+    t = MPIDI_CH3I_Win_target_alloc(win_ptr);
+    MPIU_ERR_CHKANDJUMP(t == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
+
+    t->target_rank = target_rank;
+
+    /* Append the new target to the tail of its slot's target list. */
+    MPL_LL_APPEND(slot->target_list, slot->target_list_tail, t);
+
+    assert(t != NULL);
+
+    (*e) = t;
+
+ fn_exit:
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+/* MPIDI_CH3I_Win_find_target(): look up the target element for target_rank in
+ * its RMA slot; *e is set to the element, or to NULL if the rank has none. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Win_find_target
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_Win_find_target(MPID_Win * win_ptr, int target_rank,
+                                             MPIDI_RMA_Target_t **e)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_RMA_Slot_t *slot = NULL;
+    MPIDI_RMA_Target_t *t = NULL;
+
+    if (win_ptr->num_slots < win_ptr->comm_ptr->local_size)
+        slot = &(win_ptr->slots[target_rank % win_ptr->num_slots]);
+    else
+        slot = &(win_ptr->slots[target_rank]);
+
+    /* Walk the slot's list; the cursor must advance or a non-matching head spins forever. */
+    for (t = slot->target_list; t != NULL; t = t->next) {
+        if (t->target_rank == target_rank)
+            break;
+    }
+
+    (*e) = t;
+
+ fn_exit:
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+
+/* MPIDI_CH3I_Win_target_dequeue_and_free(): unlink target e from the list of
+ * the slot its rank maps to, then hand it to MPIDI_CH3I_Win_target_free(). */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Win_target_dequeue_and_free
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_Win_target_dequeue_and_free(MPID_Win * win_ptr,
+                                                         MPIDI_RMA_Target_t * e)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int target_rank = e->target_rank;
+    MPIDI_RMA_Slot_t *slot;
+
+    if (win_ptr->num_slots < win_ptr->comm_ptr->local_size)
+        slot = &(win_ptr->slots[target_rank % win_ptr->num_slots]);
+    else
+        slot = &(win_ptr->slots[target_rank]);
+
+    MPL_LL_DELETE(slot->target_list, slot->target_list_tail, e);
+
+    mpi_errno = MPIDI_CH3I_Win_target_free(win_ptr, e);
+    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+ fn_exit:
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
 /* Return nonzero if the RMA operations list is empty.
  */
 #undef FUNCNAME
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index 9624f22..7f18385 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -119,6 +119,11 @@ typedef struct MPIDI_RMA_Target {
     MPIDI_RMA_Pool_type_t pool_type;
 } MPIDI_RMA_Target_t;
 
+typedef struct MPIDI_RMA_Slot {
+    struct MPIDI_RMA_Target *target_list;       /* head of this slot's list of target elements */
+    struct MPIDI_RMA_Target *target_list_tail;  /* tail of this slot's list of target elements */
+} MPIDI_RMA_Slot_t;
+
 typedef struct MPIDI_PT_single_op {
     MPIDI_CH3_Pkt_type_t type;  /* put, get, or accum. */
     void *addr;
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index 03a905a..3826592 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -339,6 +339,8 @@ struct MPIDI_Win_target_state {
     struct MPIDI_RMA_Target *target_pool_start; /* start pointer used for freeing */\
     struct MPIDI_RMA_Target *target_pool; /* pool of targets */          \
     struct MPIDI_RMA_Target *target_pool_tail; /* tail pointer to pool of targets */\
+    struct MPIDI_RMA_Slot *slots;                                        \
+    int num_slots;                                                       \
 
 #ifdef MPIDI_CH3_WIN_DECL
 #define MPID_DEV_WIN_DECL \
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index fe3b7ad..ba39bdc 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -7,6 +7,25 @@
 #include "mpidimpl.h"
 #include "mpidrma.h"
 
+/*
+=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
+
+cvars:
+    - name        : MPIR_CVAR_CH3_RMA_SLOTS_SIZE
+      category    : CH3
+      type        : int
+      default     : 262144
+      class       : none
+      verbosity   : MPI_T_VERBOSITY_USER_BASIC
+      scope       : MPI_T_SCOPE_ALL_EQ
+      description : >-
+        Number of RMA slots during window creation. Each slot contains
+        a linked list of target elements. The distribution of ranks among
+        slots follows a round-robin pattern. Requires a positive value.
+
+=== END_MPI_T_CVAR_INFO_BLOCK ===
+*/
+
 
 MPIU_THREADSAFE_INIT_DECL(initRMAoptions);
 
@@ -258,7 +277,7 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     int i;
     MPID_Comm *win_comm_ptr;
     int win_target_pool_size;
-    MPIU_CHKPMEM_DECL(3);
+    MPIU_CHKPMEM_DECL(4);
     MPIDI_STATE_DECL(MPID_STATE_WIN_INIT);
 
     MPIDI_FUNC_ENTER(MPID_STATE_WIN_INIT);
@@ -349,6 +368,14 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
         MPL_LL_APPEND((*win_ptr)->target_pool, (*win_ptr)->target_pool_tail, &((*win_ptr)->target_pool_start[i]));
     }
 
+    (*win_ptr)->num_slots = MPIR_MIN(MPIR_CVAR_CH3_RMA_SLOTS_SIZE, MPIR_Comm_size(win_comm_ptr));
+    MPIU_CHKPMEM_MALLOC((*win_ptr)->slots, struct MPIDI_RMA_Slot *,
+                        sizeof(MPIDI_RMA_Slot_t) * (*win_ptr)->num_slots, mpi_errno, "RMA slots");
+    for (i = 0; i < (*win_ptr)->num_slots; i++) {
+        (*win_ptr)->slots[i].target_list = NULL;
+        (*win_ptr)->slots[i].target_list_tail = NULL;
+    }
+
     MPID_WIN_FTABLE_SET_DEFAULTS(win_ptr);
 
   fn_exit:
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index 2c71ebd..29a90ff 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -160,6 +160,7 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
     MPIU_Free((*win_ptr)->all_win_handles);
     MPIU_Free((*win_ptr)->op_pool_start);
     MPIU_Free((*win_ptr)->target_pool_start);
+    MPIU_Free((*win_ptr)->slots);
 
     /* Free the attached buffer for windows created with MPI_Win_allocate() */
     if ((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||

http://git.mpich.org/mpich.git/commitdiff/5dd8a0a4244d17cebebf7ddff4c687f87e13556d

commit 5dd8a0a4244d17cebebf7ddff4c687f87e13556d
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Oct 28 08:40:20 2014 -0500

    Add target element and global / local pools and related APIs.
    
    Here we add a data structure to store information of active target.
    The information includes operation lists, passive lock state,
    sync state, etc.
    
    The target element is created by origin on-demand, and can
    be freed after the remote completion of all previous operations
    is detected. After RMA ending synchronization calls, all
    target elements should be freed.
    
    Similarly to operation pools, we create two-level target
    pools for target elements: one per-window target pool and
    one global target pool.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index bc64150..7a3cba5 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -11,6 +11,7 @@
 #include "mpid_rma_types.h"
 
 extern struct MPIDI_RMA_Op *global_rma_op_pool, *global_rma_op_pool_tail, *global_rma_op_pool_start;
+extern struct MPIDI_RMA_Target *global_rma_target_pool, *global_rma_target_pool_tail, *global_rma_target_pool_start;
 
 /* MPIDI_CH3I_Win_op_alloc(): get a new op element from op pool and
  * initialize it. If we cannot get one, return NULL. */
@@ -70,6 +71,77 @@ static inline int MPIDI_CH3I_Win_op_free(MPID_Win * win_ptr, MPIDI_RMA_Op_t * e)
     return mpi_errno;
 }
 
+/* MPIDI_CH3I_Win_target_alloc(): take a target element from win_ptr's own target pool, or
+ * from the global pool if that one is empty; reset it and return it (NULL if both are empty). */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Win_target_alloc
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline MPIDI_RMA_Target_t *MPIDI_CH3I_Win_target_alloc(MPID_Win * win_ptr)
+{
+    MPIDI_RMA_Target_t *e;
+
+    if (win_ptr->target_pool == NULL) {
+        /* local pool is empty, try to find something in the global pool */
+        if (global_rma_target_pool == NULL)
+            return NULL;        /* both pools exhausted; caller must handle NULL */
+        else {
+            e = global_rma_target_pool;
+            MPL_LL_DELETE(global_rma_target_pool, global_rma_target_pool_tail, e);
+        }
+    }
+    else {
+        e = win_ptr->target_pool;
+        MPL_LL_DELETE(win_ptr->target_pool, win_ptr->target_pool_tail, e);
+    }
+
+    e->read_op_list = e->read_op_list_tail = NULL;      /* start with empty operation lists */
+    e->write_op_list = e->write_op_list_tail = NULL;
+    e->dt_op_list = e->dt_op_list_tail = NULL;
+    e->pending_op_list = e->pending_op_list_tail = NULL;
+    e->next_op_to_issue = NULL;
+
+    e->target_rank = -1;        /* not bound to a rank yet; the caller fills it in */
+    e->lock_type = MPIDI_RMA_LOCK_TYPE_NONE;
+    e->lock_mode = 0;
+    e->outstanding_lock = 0;
+
+    e->sync.sync_flag = MPIDI_RMA_SYNC_NONE;    /* enumerator of enum MPIDI_RMA_sync_types */
+    e->sync.outstanding_acks = 0;
+    e->sync.have_remote_incomplete_ops = 1; /* When I create a new target, there must be
+                                               incomplete ops until a FLUSH/UNLOCK packet
+                                               is sent. */
+    return e;
+}
+
+/* MPIDI_CH3I_Win_target_free(): put target element e back on the pool selected by e->pool_type
+ * (win_ptr's pool or the global pool).  Returns mpi_errno, which this body leaves MPI_SUCCESS. */
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Win_target_free
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_Win_target_free(MPID_Win * win_ptr, MPIDI_RMA_Target_t * e)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    /* An element handed back here is expected to have no operations
+     * left on any of its lists (read, write, dt, pending); the
+     * asserts below check the four list heads. */
+    MPIU_Assert(e->read_op_list == NULL);
+    MPIU_Assert(e->write_op_list == NULL);
+    MPIU_Assert(e->dt_op_list == NULL);
+    MPIU_Assert(e->pending_op_list == NULL);
+
+    /* Enqueue e to the pool it belongs to, so that freeing a window's pool cannot conflict with
+       the global pool or other windows; use PREPEND to improve cache performance. */
+    if (e->pool_type == MPIDI_RMA_POOL_WIN)
+        MPL_LL_PREPEND(win_ptr->target_pool, win_ptr->target_pool_tail, e);
+    else
+        MPL_LL_PREPEND(global_rma_target_pool, global_rma_target_pool_tail, e);
+
+    return mpi_errno;
+}
+
 /* Return nonzero if the RMA operations list is empty.
  */
 #undef FUNCNAME
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index 0176c96..9624f22 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -24,6 +24,12 @@ enum MPID_Lock_state {
     MPID_LOCK_SHARED_ALL
 };
 
+enum MPIDI_RMA_Lock_type {     /* lock type recorded in MPIDI_RMA_Target.lock_type */
+    MPIDI_RMA_LOCK_TYPE_NONE = 23,      /* set on a newly allocated target; NOTE(review): 23 looks arbitrary - confirm */
+    MPIDI_RMA_LOCK_TYPE_SHARED,
+    MPIDI_RMA_LOCK_TYPE_EXCLUSIVE
+};
+
 /*
  * RMA Declarations.  We should move these into something separate from
  * a Request.
@@ -76,6 +82,43 @@ typedef struct MPIDI_RMA_Op {
     MPIDI_RMA_Pool_type_t pool_type;
 } MPIDI_RMA_Op_t;
 
+typedef struct MPIDI_RMA_Target {      /* state the origin keeps for one active RMA target */
+    struct MPIDI_RMA_Op *read_op_list, *read_op_list_tail;      /* head/tail pairs of op lists; */
+    struct MPIDI_RMA_Op *write_op_list, *write_op_list_tail;    /* all are NULL on a freshly   */
+    struct MPIDI_RMA_Op *dt_op_list, *dt_op_list_tail;          /* allocated target and must   */
+    struct MPIDI_RMA_Op *pending_op_list, *pending_op_list_tail;        /* be NULL again when freed */
+    struct MPIDI_RMA_Op *next_op_to_issue;
+    struct MPIDI_RMA_Target *next;      /* presumably the link used by the MPL_LL_* pool lists - confirm */
+    int target_rank;            /* -1 until assigned */
+    enum MPIDI_RMA_Lock_type lock_type; /* NONE, SHARED or EXCLUSIVE */
+    int lock_mode;              /* e.g., MODE_NO_CHECK */
+    int outstanding_lock;
+
+    /* The target structure is free to be cleaned up when all of the
+     * following conditions hold true:
+     *   - No operations are queued up (all of the op lists above are NULL)
+     *   - There are no outstanding acks (outstanding_acks == 0)
+     *   - There are no incomplete ops (have_remote_incomplete_ops == 0)
+     *   - There are no sync messages to be sent (sync_flag == MPIDI_RMA_SYNC_NONE)
+     */
+    struct {
+        /* next synchronization flag to be sent to the target (either
+         * piggybacked or as a separate packet) */
+        enum MPIDI_RMA_sync_types sync_flag;    /* NONE, FLUSH_LOCAL, FLUSH or UNLOCK */
+
+        /* packets sent out that we are expecting an ack for */
+        int outstanding_acks;
+
+        /* if we sent out any operations, but have not waited for
+         * their remote completion, this flag is set.  When the next
+         * FLUSH or UNLOCK sync flag is set, we will clear this
+         * variable. */
+        int have_remote_incomplete_ops; /* have ops that have not completed remotely */
+    } sync;
+
+    MPIDI_RMA_Pool_type_t pool_type;    /* which pool (window or global) this element returns to */
+} MPIDI_RMA_Target_t;
+
 typedef struct MPIDI_PT_single_op {
     MPIDI_CH3_Pkt_type_t type;  /* put, get, or accum. */
     void *addr;
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index a16f003..03a905a 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -199,6 +199,41 @@ typedef struct MPIDI_VC * MPID_VCR;
 #   define MPIDI_REQUEST_SEQNUM
 #endif
 
+/* Here we add RMA sync types to specify the types
+ * of synchronization the origin is going to
+ * perform to the target. */
+
+/* There are four kinds of synchronizations: NONE,
+ * FLUSH_LOCAL, FLUSH, UNLOCK.
+ * (1) NONE means there is no special synchronization,
+ * origin just issues as many operations as it can,
+ * excluding the last operation which is a piggyback
+ * candidate;
+ * (2) FLUSH_LOCAL means origin wants to do a
+ * FLUSH_LOCAL sync and issues out all pending
+ * operations including the piggyback candidate;
+ * (3) FLUSH means origin wants to do a FLUSH sync
+ * and issues out all pending operations including
+ * the last op piggybacked with a FLUSH flag to
+ * detect remote completion;
+ * (4) UNLOCK means origin issues all pending operations
+ * including the last op piggybacked with an UNLOCK
+ * flag to release the lock on target and detect remote
+ * completion.
+ * Note that FLUSH_LOCAL is a superset of NONE, FLUSH
+ * is a superset of FLUSH_LOCAL, and UNLOCK is a superset
+ * of FLUSH.
+ */
+/* We start with an arbitrarily chosen number (42), to help with
+ * debugging when a sync type is not initialized or wrongly
+ * initialized. */
+enum MPIDI_RMA_sync_types {
+    MPIDI_RMA_SYNC_NONE = 42,
+    MPIDI_RMA_SYNC_FLUSH_LOCAL,
+    MPIDI_RMA_SYNC_FLUSH,
+    MPIDI_RMA_SYNC_UNLOCK
+};
+
 /* We start with an arbitrarily chosen number (42), to help with
  * debugging when a packet type is not initialized or wrongly
  * initialized. */
@@ -301,6 +336,9 @@ struct MPIDI_Win_target_state {
     struct MPIDI_RMA_Op *op_pool_start; /* start pointer used for freeing */\
     struct MPIDI_RMA_Op *op_pool;  /* pool of operations */              \
     struct MPIDI_RMA_Op *op_pool_tail; /* tail pointer to pool of operations. */ \
+    struct MPIDI_RMA_Target *target_pool_start; /* start pointer used for freeing */\
+    struct MPIDI_RMA_Target *target_pool; /* pool of targets */          \
+    struct MPIDI_RMA_Target *target_pool_tail; /* tail pointer to pool of targets */\
 
 #ifdef MPIDI_CH3_WIN_DECL
 #define MPID_DEV_WIN_DECL \
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index 7ab57a1..fe3b7ad 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -257,7 +257,8 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     int mpi_errno = MPI_SUCCESS;
     int i;
     MPID_Comm *win_comm_ptr;
-    MPIU_CHKPMEM_DECL(2);
+    int win_target_pool_size;
+    MPIU_CHKPMEM_DECL(3);
     MPIDI_STATE_DECL(MPID_STATE_WIN_INIT);
 
     MPIDI_FUNC_ENTER(MPID_STATE_WIN_INIT);
@@ -337,6 +338,17 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
         MPL_LL_APPEND((*win_ptr)->op_pool, (*win_ptr)->op_pool_tail, &((*win_ptr)->op_pool_start[i]));
     }
 
+    win_target_pool_size = MPIR_MIN(MPIR_CVAR_CH3_RMA_TARGET_WIN_POOL_SIZE, MPIR_Comm_size(win_comm_ptr));
+    MPIU_CHKPMEM_MALLOC((*win_ptr)->target_pool_start, struct MPIDI_RMA_Target *,
+                        sizeof(MPIDI_RMA_Target_t) * win_target_pool_size,
+                        mpi_errno, "RMA target pool");
+    (*win_ptr)->target_pool = NULL;
+    (*win_ptr)->target_pool_tail = NULL;
+    for (i = 0; i < win_target_pool_size; i++) {
+        (*win_ptr)->target_pool_start[i].pool_type = MPIDI_RMA_POOL_WIN;
+        MPL_LL_APPEND((*win_ptr)->target_pool, (*win_ptr)->target_pool_tail, &((*win_ptr)->target_pool_start[i]));
+    }
+
     MPID_WIN_FTABLE_SET_DEFAULTS(win_ptr);
 
   fn_exit:
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index 2cb9bd9..2c71ebd 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -35,11 +35,36 @@ cvars:
         operations) that stores information about RMA operations that
         could not be issued immediatly.  Requires a positive value.
 
+    - name        : MPIR_CVAR_CH3_RMA_TARGET_WIN_POOL_SIZE
+      category    : CH3
+      type        : int
+      default     : 256
+      class       : none
+      verbosity   : MPI_T_VERBOSITY_USER_BASIC
+      scope       : MPI_T_SCOPE_ALL_EQ
+      description : >-
+        Size of the window-private RMA target pool (in number of
+        targets) that stores information about RMA targets that
+        could not be issued immediately.  Requires a positive value.
+
+    - name        : MPIR_CVAR_CH3_RMA_TARGET_GLOBAL_POOL_SIZE
+      category    : CH3
+      type        : int
+      default     : 16384
+      class       : none
+      verbosity   : MPI_T_VERBOSITY_USER_BASIC
+      scope       : MPI_T_SCOPE_ALL_EQ
+      description : >-
+        Size of the Global RMA targets pool (in number of
+        targets) that stores information about RMA targets that
+        could not be issued immediately.  Requires a positive value.
+
 === END_MPI_T_CVAR_INFO_BLOCK ===
 */
 
 
 struct MPIDI_RMA_Op *global_rma_op_pool = NULL, *global_rma_op_pool_tail = NULL, *global_rma_op_pool_start = NULL;
+struct MPIDI_RMA_Target *global_rma_target_pool = NULL, *global_rma_target_pool_tail = NULL, *global_rma_target_pool_start = NULL;
 
 #undef FUNCNAME
 #define FUNCNAME MPIDI_RMA_init
@@ -49,7 +74,7 @@ int MPIDI_RMA_init(void)
 {
     int mpi_errno = MPI_SUCCESS;
     int i;
-    MPIU_CHKPMEM_DECL(1);
+    MPIU_CHKPMEM_DECL(2);
 
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_RMA_INIT);
 
@@ -63,6 +88,14 @@ int MPIDI_RMA_init(void)
         MPL_LL_APPEND(global_rma_op_pool, global_rma_op_pool_tail, &(global_rma_op_pool_start[i]));
     }
 
+    MPIU_CHKPMEM_MALLOC(global_rma_target_pool_start, struct MPIDI_RMA_Target *,
+                        sizeof(struct MPIDI_RMA_Target) * MPIR_CVAR_CH3_RMA_TARGET_GLOBAL_POOL_SIZE,
+                        mpi_errno, "RMA target pool");
+    for (i = 0; i < MPIR_CVAR_CH3_RMA_TARGET_GLOBAL_POOL_SIZE; i++) {
+        global_rma_target_pool_start[i].pool_type = MPIDI_RMA_POOL_GLOBAL;
+        MPL_LL_APPEND(global_rma_target_pool, global_rma_target_pool_tail, &(global_rma_target_pool_start[i]));
+    }
+
   fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_RMA_INIT);
     return mpi_errno;
@@ -84,6 +117,7 @@ void MPIDI_RMA_finalize(void)
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_RMA_FINALIZE);
 
     MPIU_Free(global_rma_op_pool_start);
+    MPIU_Free(global_rma_target_pool_start);
 
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_RMA_FINALIZE);
 }
@@ -125,6 +159,7 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
     MPIU_Free((*win_ptr)->disp_units);
     MPIU_Free((*win_ptr)->all_win_handles);
     MPIU_Free((*win_ptr)->op_pool_start);
+    MPIU_Free((*win_ptr)->target_pool_start);
 
     /* Free the attached buffer for windows created with MPI_Win_allocate() */
     if ((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||

-----------------------------------------------------------------------

Summary of changes:
 src/mpi/errhan/errnames.txt                        |    2 +
 .../ch3/channels/nemesis/include/mpid_nem_inline.h |    5 +-
 src/mpid/ch3/channels/nemesis/src/ch3_progress.c   |    9 +
 src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c    |    4 +
 .../ch3/channels/sock/include/mpidi_ch3_impl.h     |    2 +
 src/mpid/ch3/channels/sock/src/ch3_progress.c      |   18 +
 src/mpid/ch3/include/mpid_rma_issue.h              |  874 +++++++
 src/mpid/ch3/include/mpid_rma_oplist.h             |  578 ++++-
 src/mpid/ch3/include/mpid_rma_types.h              |   76 +-
 src/mpid/ch3/include/mpidimpl.h                    |   52 +-
 src/mpid/ch3/include/mpidpkt.h                     |  347 ++-
 src/mpid/ch3/include/mpidpre.h                     |  125 +-
 src/mpid/ch3/include/mpidrma.h                     |  217 ++-
 src/mpid/ch3/src/ch3u_handle_recv_pkt.c            |   12 +-
 src/mpid/ch3/src/ch3u_handle_recv_req.c            | 1014 +++-----
 src/mpid/ch3/src/ch3u_handle_send_req.c            |   66 +-
 src/mpid/ch3/src/ch3u_request.c                    |    3 +-
 src/mpid/ch3/src/ch3u_rma_oplist.c                 | 1199 +++++-----
 src/mpid/ch3/src/ch3u_rma_ops.c                    |  609 ++++--
 src/mpid/ch3/src/ch3u_rma_pkthandler.c             | 1479 +++++------
 src/mpid/ch3/src/ch3u_rma_reqops.c                 |  156 +-
 src/mpid/ch3/src/ch3u_rma_sync.c                   | 2651 +++++++-------------
 src/mpid/ch3/src/ch3u_win_fns.c                    |    4 +
 src/mpid/ch3/src/mpid_rma.c                        |   87 +-
 src/mpid/ch3/src/mpidi_printf.c                    |   31 -
 src/mpid/ch3/src/mpidi_rma.c                       |   73 +-
 26 files changed, 5285 insertions(+), 4408 deletions(-)
 create mode 100644 src/mpid/ch3/include/mpid_rma_issue.h


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list