[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.2a2-241-gbfac684

Service Account noreply at mpich.org
Wed Mar 4 00:55:30 CST 2015


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  bfac68477a32dfa49e69715235807d205400bc17 (commit)
       via  04deb8809ec47e9469d246fd8319d7bb851e0405 (commit)
       via  b796fa1e74f426e7996464af8f1430403ea8dbb4 (commit)
       via  98c76f78106563b1531ce51704f91b789a2fa4d3 (commit)
       via  5132e070e0499b3a6fa2955710bef4c699d531fc (commit)
       via  7c890ab2a746d59040aaa8e73d2d620e27d01f0d (commit)
       via  002ce8c8490b9277aa77491c0a4edc06b819db41 (commit)
       via  efad963ae16890de7681f9e0dae6879927335d64 (commit)
       via  0d5146ba091121a898de4ba807c170fa93c3dc0e (commit)
       via  c9435750d06e7a26418f3d018d1dfd8a75bee36c (commit)
       via  f75eb4eb666e1cbf249ab829e357ec1fcb8d49ca (commit)
       via  d8eb8de278149d663b1ef191f10692a86fd3f834 (commit)
       via  c986b927c32b09b98b1b740882c3bbf1a9aa04bc (commit)
       via  421f4359e9acd4a9ac986850b9f1c2028117febe (commit)
       via  0641e2f192208f8c0c0049c5ff61bbd0b5d9d9bb (commit)
       via  382b04c46893fa5aea9ed6516909f24a9dfda087 (commit)
       via  d3cbeab3e73089ffb8669bf10b26a5fdeffc04b1 (commit)
       via  ca223da02146c11283eb4f8b4b37b41e5a4b8fc8 (commit)
       via  bb0e602c8ae6c5a4edd660813f7d5c84080b7618 (commit)
       via  ab8386e765337f7c7e67e4411aa42080508f0c63 (commit)
       via  1a3e661f611a69cc59ea45435abfca79a2b59029 (commit)
       via  6c81f6cd6f2a8d5a53fae581a99daaa8824678df (commit)
       via  9fa6582a0887a3242f9cee494dd95949e71abc90 (commit)
       via  a36fd9dd81c4fae3f4aaf1b8f8c897131381ec24 (commit)
       via  fd92b7bc82dbb1bbfc06a8f43d882067a37f6796 (commit)
       via  a3af53c3bc848159c5e450419516746279322827 (commit)
       via  45cdb28256eaf9dfd561c8088e4e87dbd9c7ec16 (commit)
       via  ce8bc3105907988579ed548cd02c306a9dd345e7 (commit)
       via  67b69b2a199e8d7f81e6bf63d169441f22a80c16 (commit)
       via  49dd90f4565e0c97397c25cf19444f4aefbdeab1 (commit)
       via  7899a60219495a8c6e45299facdf15121ba167ae (commit)
       via  fa7fe99923c40f216d0709f55ad841ab537e5155 (commit)
       via  2317b31de63148e3248eecac923dd5c4fa87b001 (commit)
       via  344bf9589fe22aedfa6f270d2f7054be0eff5cc4 (commit)
       via  42b5fcf179fb3ad0b4fac74b8417d8ec2ca30e5e (commit)
       via  9dbcae0c71c803d75fe00854ad4a265b91c1902d (commit)
       via  7c1a8fb119c8c903d1ecd6f45b02cf3d2688f9dc (commit)
       via  eddd8b9145e4c5411065784edad0fa6780a38742 (commit)
       via  9404e953b68ede106f2253e91301226adcbb5aa6 (commit)
       via  131e06ef98bee8437ad47570381ac3e5f5205f73 (commit)
       via  03d4c77b2148769436b8ce0b682b9b7a662baeef (commit)
       via  ee446c5c2972d7886ead5ac6e746c5d0a3a6f22d (commit)
      from  dec3ed276c5b4f5b8430a89b379dc26475d00437 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/bfac68477a32dfa49e69715235807d205400bc17

commit bfac68477a32dfa49e69715235807d205400bc17
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Mar 3 19:58:45 2015 -0800

    Add a test for Accumulate operations working with pair types.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/test/mpi/rma/Makefile.am b/test/mpi/rma/Makefile.am
index 5c0a5e8..1c42b1c 100644
--- a/test/mpi/rma/Makefile.am
+++ b/test/mpi/rma/Makefile.am
@@ -143,7 +143,8 @@ noinst_PROGRAMS =          \
     at_complete            \
     atomic_rmw_fop         \
     atomic_rmw_cas         \
-    atomic_rmw_gacc
+    atomic_rmw_gacc	   \
+    acc-pairtype
 
 if BUILD_MPIX_TESTS
 noinst_PROGRAMS += aint
diff --git a/test/mpi/rma/acc-pairtype.c b/test/mpi/rma/acc-pairtype.c
new file mode 100644
index 0000000..5c23435
--- /dev/null
+++ b/test/mpi/rma/acc-pairtype.c
@@ -0,0 +1,87 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2015 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* This test checks that the Accumulate operation works correctly
+ * with pair types. */
+
+#include "mpi.h"
+#include <stdio.h>
+
+#define DATA_SIZE 25
+
+typedef struct long_double_int {       /* C-side element type of the buffers main() describes as MPI_LONG_DOUBLE_INT */
+    long double a;              /* long double half of the pair */
+    int b;                      /* int half of the pair */
+} long_double_int_t;
+
+int main(int argc, char *argv[])       /* rank 0 accumulates a strided vector of pairs into rank 1's window; rank 1 verifies it */
+{
+    MPI_Win win;
+    int errors = 0;             /* mismatches counted by rank 1 in the check loop */
+    int rank, nproc, i;
+    long_double_int_t *orig_buf;        /* origin data: every element is set to {1.0, 1} */
+    long_double_int_t *tar_buf; /* window memory: every element starts as {0, 0} */
+    MPI_Datatype vector_dtp;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);      /* nproc is read here but not used afterwards */
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    MPI_Alloc_mem(sizeof(long_double_int_t) * DATA_SIZE, MPI_INFO_NULL, &orig_buf);
+    MPI_Alloc_mem(sizeof(long_double_int_t) * DATA_SIZE, MPI_INFO_NULL, &tar_buf);
+
+    for (i = 0; i < DATA_SIZE; i++) {
+        orig_buf[i].a = 1.0;
+        orig_buf[i].b = 1;
+        tar_buf[i].a = 0;
+        tar_buf[i].b = 0;
+    }
+
+    MPI_Type_vector(5 /* count */ , 3 /* blocklength */ , 5 /* stride */ , MPI_LONG_DOUBLE_INT,
+                    &vector_dtp);       /* 5 blocks of 3 pairs, stride 5: elements i with i % 5 < 3 */
+    MPI_Type_commit(&vector_dtp);
+
+    MPI_Win_create(tar_buf, sizeof(long_double_int_t) * DATA_SIZE, sizeof(long_double_int_t),
+                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);        /* displacement unit is one pair element */
+
+    if (rank == 0) {            /* rank 0 is the only origin; the target is rank 1 */
+        MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win);
+        MPI_Accumulate(orig_buf, 1, vector_dtp, 1, 0, 1, vector_dtp, MPI_MAXLOC, win);  /* same vector layout on origin and target */
+        MPI_Win_unlock(1, win);
+    }
+
+    MPI_Win_free(&win);         /* tar_buf is inspected only after the window has been freed */
+
+    if (rank == 1) {
+        for (i = 0; i < DATA_SIZE; i++) {
+            if (i % 5 < 3) {    /* element covered by vector_dtp: expect the accumulated {1.0, 1} */
+                if (tar_buf[i].a != 1.0 || tar_buf[i].b != 1) {
+                    errors++;
+                }
+            }
+            else {              /* gap between vector blocks: must still hold the initial {0, 0} */
+                if (tar_buf[i].a != 0.0 || tar_buf[i].b != 0) {
+                    errors++;
+                }
+            }
+        }
+    }
+
+    MPI_Type_free(&vector_dtp);
+
+    MPI_Free_mem(orig_buf);
+    MPI_Free_mem(tar_buf);
+
+    if (rank == 1) {            /* only rank 1 reports the result */
+        if (errors == 0)        /* NOTE(review): nothing is printed when errors != 0 - confirm the harness treats missing output as failure */
+            printf(" No Errors\n");
+    }
+
+    MPI_Finalize();
+    return 0;                   /* exit status is 0 regardless of the error count */
+}
diff --git a/test/mpi/rma/testlist.in b/test/mpi/rma/testlist.in
index 72ab28f..a836908 100644
--- a/test/mpi/rma/testlist.in
+++ b/test/mpi/rma/testlist.in
@@ -132,6 +132,7 @@ atomic_rmw_fop 3
 atomic_rmw_cas 3
 atomic_rmw_gacc 3
 aint 2 strict=false
+acc-pairtype 2
 
 ## This test is not strictly correct.  This was meant to test out the
 ## case when MPI_Test is not nonblocking.  However, we ended up

http://git.mpich.org/mpich.git/commitdiff/04deb8809ec47e9469d246fd8319d7bb851e0405

commit 04deb8809ec47e9469d246fd8319d7bb851e0405
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Mar 3 23:37:22 2015 -0600

    Rename predefined_type / predef_type to basic_type.
    
    In the MPI standard, a predefined datatype is called a basic type.
    It is better for the names in the code to match the terminology
    of the standard.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_shm.h b/src/mpid/ch3/include/mpid_rma_shm.h
index e2c108b..103e8cc 100644
--- a/src/mpid/ch3/include/mpid_rma_shm.h
+++ b/src/mpid/ch3/include/mpid_rma_shm.h
@@ -306,7 +306,7 @@ static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_coun
     int disp_unit, shm_op = 0;
     int mpi_errno = MPI_SUCCESS;
     int i;
-    MPI_Datatype predefined_type;
+    MPI_Datatype basic_type;
     MPI_Aint stream_elem_count, stream_unit_count;
     MPI_Aint predefined_dtp_size, predefined_dtp_extent, predefined_dtp_count;
     MPI_Aint total_len, rest_len;
@@ -350,10 +350,10 @@ static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_coun
 
     MPID_Datatype_get_ptr(origin_datatype, origin_dtp_ptr);
     MPIU_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->basic_type != MPI_DATATYPE_NULL);
-    predefined_type = origin_dtp_ptr->basic_type;
-    MPID_Datatype_get_size_macro(predefined_type, predefined_dtp_size);
+    basic_type = origin_dtp_ptr->basic_type;
+    MPID_Datatype_get_size_macro(basic_type, predefined_dtp_size);
     predefined_dtp_count = total_len / predefined_dtp_size;
-    MPID_Datatype_get_extent_macro(predefined_type, predefined_dtp_extent);
+    MPID_Datatype_get_extent_macro(basic_type, predefined_dtp_extent);
     MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 && predefined_dtp_extent > 0);
 
     stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
@@ -384,11 +384,11 @@ static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_coun
         MPID_Segment_pack(seg, first, &last, packed_buf);
         MPID_Segment_free(seg);
 
-        MPID_Datatype_is_contig(predefined_type, &is_predef_contig);
+        MPID_Datatype_is_contig(basic_type, &is_predef_contig);
 
         if (!is_predef_contig) {
             void *tmpbuf = MPIU_Malloc(stream_count * predefined_dtp_extent);
-            mpi_errno = MPIR_Localcopy(tmpbuf, stream_count, predefined_type,
+            mpi_errno = MPIR_Localcopy(tmpbuf, stream_count, basic_type,
                                        packed_buf, stream_size, MPI_BYTE);
             if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
             MPIU_Free(packed_buf);
@@ -399,7 +399,7 @@ static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_coun
             MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
         }
 
-        mpi_errno = do_accumulate_op((void*)packed_buf, stream_count, predefined_type,
+        mpi_errno = do_accumulate_op((void*)packed_buf, stream_count, basic_type,
                                      (void*)((char*)base+disp_unit*target_disp), target_count, target_datatype,
                                      stream_offset, op);
 
@@ -436,7 +436,7 @@ static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_
     int disp_unit, shm_locked = 0;
     void *base = NULL;
     int i;
-    MPI_Datatype predefined_type;
+    MPI_Datatype basic_type;
     MPI_Aint stream_elem_count, stream_unit_count;
     MPI_Aint predefined_dtp_size, predefined_dtp_extent, predefined_dtp_count;
     MPI_Aint total_len, rest_len;
@@ -497,10 +497,10 @@ static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_
 
     MPID_Datatype_get_ptr(origin_datatype, origin_dtp_ptr);
     MPIU_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->basic_type != MPI_DATATYPE_NULL);
-    predefined_type = origin_dtp_ptr->basic_type;
-    MPID_Datatype_get_size_macro(predefined_type, predefined_dtp_size);
+    basic_type = origin_dtp_ptr->basic_type;
+    MPID_Datatype_get_size_macro(basic_type, predefined_dtp_size);
     predefined_dtp_count = total_len / predefined_dtp_size;
-    MPID_Datatype_get_extent_macro(predefined_type, predefined_dtp_extent);
+    MPID_Datatype_get_extent_macro(basic_type, predefined_dtp_extent);
     MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 && predefined_dtp_extent > 0);
 
     stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
@@ -531,18 +531,18 @@ static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_
         MPID_Segment_pack(seg, first, &last, packed_buf);
         MPID_Segment_free(seg);
 
-        MPID_Datatype_is_contig(predefined_type, &is_predef_contig);
+        MPID_Datatype_is_contig(basic_type, &is_predef_contig);
 
         if (!is_predef_contig) {
             void *tmpbuf = MPIU_Malloc(stream_count * predefined_dtp_extent);
-            mpi_errno = MPIR_Localcopy(tmpbuf, stream_count, predefined_type,
+            mpi_errno = MPIR_Localcopy(tmpbuf, stream_count, basic_type,
                                        packed_buf, stream_size, MPI_BYTE);
             if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
             MPIU_Free(packed_buf);
             packed_buf = tmpbuf;
         }
 
-        mpi_errno = do_accumulate_op((void*)packed_buf, stream_count, predefined_type,
+        mpi_errno = do_accumulate_op((void*)packed_buf, stream_count, basic_type,
                                      (void*)((char*)base+disp_unit*target_disp), target_count, target_datatype,
                                      stream_offset, op);
 
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 2be4bed..6e0d943 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -137,7 +137,7 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq,
     MPID_Win *win_ptr;
     MPI_Win source_win_handle = rreq->dev.source_win_handle;
     MPIDI_CH3_Pkt_flags_t flags = rreq->dev.flags;
-    MPI_Datatype predef_datatype;
+    MPI_Datatype basic_type;
     MPI_Aint predef_count, predef_dtp_size;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRECVCOMPLETE);
 
@@ -166,20 +166,20 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq,
     MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RECV);
 
     if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype))
-        predef_datatype = rreq->dev.datatype;
+        basic_type = rreq->dev.datatype;
     else {
-        predef_datatype = rreq->dev.datatype_ptr->basic_type;
+        basic_type = rreq->dev.datatype_ptr->basic_type;
     }
-    MPIU_Assert(predef_datatype != MPI_DATATYPE_NULL);
+    MPIU_Assert(basic_type != MPI_DATATYPE_NULL);
 
-    MPID_Datatype_get_size_macro(predef_datatype, predef_dtp_size);
+    MPID_Datatype_get_size_macro(basic_type, predef_dtp_size);
     predef_count = rreq->dev.recv_data_sz / predef_dtp_size;
     MPIU_Assert(predef_count > 0);
 
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
     /* accumulate data from tmp_buf into user_buf */
-    mpi_errno = do_accumulate_op(rreq->dev.user_buf, predef_count, predef_datatype,
+    mpi_errno = do_accumulate_op(rreq->dev.user_buf, predef_count, basic_type,
                                  rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype,
                                  rreq->dev.stream_offset, rreq->dev.op);
     if (win_ptr->shm_allocated == TRUE)
@@ -230,7 +230,7 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
     MPID_IOV iov[MPID_IOV_LIMIT];
     int iovcnt;
     int is_contig;
-    MPI_Datatype predef_datatype;
+    MPI_Datatype basic_type;
     MPI_Aint predef_count, predef_dtp_size;
     MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMRECVCOMPLETE);
@@ -240,13 +240,13 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
     if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype))
-        predef_datatype = rreq->dev.datatype;
+        basic_type = rreq->dev.datatype;
     else {
-        predef_datatype = rreq->dev.datatype_ptr->basic_type;
+        basic_type = rreq->dev.datatype_ptr->basic_type;
     }
-    MPIU_Assert(predef_datatype != MPI_DATATYPE_NULL);
+    MPIU_Assert(basic_type != MPI_DATATYPE_NULL);
 
-    MPID_Datatype_get_size_macro(predef_datatype, predef_dtp_size);
+    MPID_Datatype_get_size_macro(basic_type, predef_dtp_size);
     predef_count = rreq->dev.recv_data_sz / predef_dtp_size;
     MPIU_Assert(predef_count > 0);
 
@@ -298,7 +298,7 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
     }
 
     /* accumulate data from tmp_buf into user_buf */
-    mpi_errno = do_accumulate_op(rreq->dev.user_buf, predef_count, predef_datatype,
+    mpi_errno = do_accumulate_op(rreq->dev.user_buf, predef_count, basic_type,
                                  rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype,
                                  rreq->dev.stream_offset, rreq->dev.op);
 
@@ -532,7 +532,7 @@ int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((u
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Datatype *new_dtp = NULL;
-    MPI_Aint predef_type_extent, predef_type_size;
+    MPI_Aint basic_type_extent, basic_type_size;
     MPI_Aint total_len, rest_len, stream_elem_count;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);
 
@@ -544,8 +544,8 @@ int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((u
     /* update new request to get the data */
     MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_ACCUM_RECV);
 
-    MPID_Datatype_get_size_macro(new_dtp->basic_type, predef_type_size);
-    MPID_Datatype_get_extent_macro(new_dtp->basic_type, predef_type_extent);
+    MPID_Datatype_get_size_macro(new_dtp->basic_type, basic_type_size);
+    MPID_Datatype_get_extent_macro(new_dtp->basic_type, basic_type_extent);
 
     MPIU_Assert(!MPIDI_Request_get_srbuf_flag(rreq));
     /* allocate a SRBuf for receiving stream unit */
@@ -566,9 +566,9 @@ int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((u
 
     total_len = new_dtp->size * rreq->dev.user_count;
     rest_len = total_len - rreq->dev.stream_offset;
-    stream_elem_count = MPIDI_CH3U_SRBuf_size / predef_type_extent;
+    stream_elem_count = MPIDI_CH3U_SRBuf_size / basic_type_extent;
 
-    rreq->dev.recv_data_sz = MPIR_MIN(rest_len, stream_elem_count * predef_type_size);
+    rreq->dev.recv_data_sz = MPIR_MIN(rest_len, stream_elem_count * basic_type_size);
     rreq->dev.datatype_ptr = new_dtp;
     /* this will cause the datatype to be freed when the
      * request is freed. free dtype_info here. */
@@ -579,7 +579,7 @@ int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((u
                          "**nomem %s", "MPID_Segment_alloc");
 
     MPID_Segment_init(rreq->dev.user_buf,
-                      (rreq->dev.recv_data_sz / predef_type_size),
+                      (rreq->dev.recv_data_sz / basic_type_size),
                       new_dtp->basic_type, rreq->dev.segment_ptr, 0);
     rreq->dev.segment_first = 0;
     rreq->dev.segment_size = rreq->dev.recv_data_sz;
@@ -607,7 +607,7 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Datatype *new_dtp = NULL;
-    MPI_Aint predef_type_extent, predef_type_size;
+    MPI_Aint basic_type_extent, basic_type_size;
     MPI_Aint total_len, rest_len, stream_elem_count;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMDERIVEDDTRECVCOMPLETE);
 
@@ -619,8 +619,8 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((
     /* update new request to get the data */
     MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_GET_ACCUM_RECV);
 
-    MPID_Datatype_get_size_macro(new_dtp->basic_type, predef_type_size);
-    MPID_Datatype_get_extent_macro(new_dtp->basic_type, predef_type_extent);
+    MPID_Datatype_get_size_macro(new_dtp->basic_type, basic_type_size);
+    MPID_Datatype_get_extent_macro(new_dtp->basic_type, basic_type_extent);
 
     MPIU_Assert(!MPIDI_Request_get_srbuf_flag(rreq));
     /* allocate a SRBuf for receiving stream unit */
@@ -641,9 +641,9 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((
 
     total_len = new_dtp->size * rreq->dev.user_count;
     rest_len = total_len - rreq->dev.stream_offset;
-    stream_elem_count = MPIDI_CH3U_SRBuf_size / predef_type_extent;
+    stream_elem_count = MPIDI_CH3U_SRBuf_size / basic_type_extent;
 
-    rreq->dev.recv_data_sz = MPIR_MIN(rest_len, stream_elem_count * predef_type_size);
+    rreq->dev.recv_data_sz = MPIR_MIN(rest_len, stream_elem_count * basic_type_size);
     rreq->dev.datatype_ptr = new_dtp;
     /* this will cause the datatype to be freed when the
      * request is freed. free dtype_info here. */
@@ -654,7 +654,7 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((
                          "**nomem %s", "MPID_Segment_alloc");
 
     MPID_Segment_init(rreq->dev.user_buf,
-                      (rreq->dev.recv_data_sz / predef_type_size),
+                      (rreq->dev.recv_data_sz / basic_type_size),
                       new_dtp->basic_type, rreq->dev.segment_ptr, 0);
     rreq->dev.segment_first = 0;
     rreq->dev.segment_size = rreq->dev.recv_data_sz;
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 15704eb..2c72406 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -1505,29 +1505,29 @@ int MPIDI_CH3_PktHandler_Get_AccumResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     else {
         MPIU_Assert(pkt->type == MPIDI_CH3_PKT_GET_ACCUM_RESP);
 
-        MPI_Datatype predef_type;
-        MPI_Aint predef_type_extent, predef_type_size;
+        MPI_Datatype basic_type;
+        MPI_Aint basic_type_extent, basic_type_size;
         MPI_Aint stream_elem_count;
         MPI_Aint total_len, rest_len;
         MPI_Aint real_stream_offset;
 
         if (MPIR_DATATYPE_IS_PREDEFINED(req->dev.datatype)) {
-            predef_type = req->dev.datatype;
+            basic_type = req->dev.datatype;
         }
         else {
             MPIU_Assert(req->dev.datatype_ptr != NULL);
-            predef_type = req->dev.datatype_ptr->basic_type;
+            basic_type = req->dev.datatype_ptr->basic_type;
         }
 
-        MPID_Datatype_get_extent_macro(predef_type, predef_type_extent);
-        MPID_Datatype_get_size_macro(predef_type, predef_type_size);
+        MPID_Datatype_get_extent_macro(basic_type, basic_type_extent);
+        MPID_Datatype_get_size_macro(basic_type, basic_type_size);
 
         total_len = type_size * req->dev.user_count;
         rest_len = total_len - req->dev.stream_offset;
-        stream_elem_count = MPIDI_CH3U_SRBuf_size / predef_type_extent;
+        stream_elem_count = MPIDI_CH3U_SRBuf_size / basic_type_extent;
 
-        req->dev.recv_data_sz = MPIR_MIN(rest_len, stream_elem_count * predef_type_size);
-        real_stream_offset = (req->dev.stream_offset / predef_type_size) * predef_type_extent;
+        req->dev.recv_data_sz = MPIR_MIN(rest_len, stream_elem_count * basic_type_size);
+        real_stream_offset = (req->dev.stream_offset / basic_type_size) * basic_type_extent;
 
         if (MPIR_DATATYPE_IS_PREDEFINED(req->dev.datatype)) {
             req->dev.user_buf = (void *) ((char *) req->dev.user_buf + real_stream_offset);

http://git.mpich.org/mpich.git/commitdiff/b796fa1e74f426e7996464af8f1430403ea8dbb4

commit b796fa1e74f426e7996464af8f1430403ea8dbb4
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Mar 3 19:34:20 2015 -0800

    Modify comments about basic_type, n_builtin_elements and builtin_element_size.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/common/datatype/mpid_datatype.h b/src/mpid/common/datatype/mpid_datatype.h
index 6cf8ac0..7af878d 100644
--- a/src/mpid/common/datatype/mpid_datatype.h
+++ b/src/mpid/common/datatype/mpid_datatype.h
@@ -374,12 +374,15 @@ typedef struct MPID_Datatype {
     int is_committed;
 
     /* element information; used for accumulate and get elements
-     *
-     * if type is composed of more than one element type, then
-     * basic_type == MPI_DATATYPE_NULL and builtin_element_size == -1
+     * basic_type: describes basic type (predefined type). If the
+     *             type is composed of the same basic type, it is
+     *             set to that type, otherwise it is set to MPI_DATATYPE_NULL.
+     * n_builtin_elements: refers to the number of builtin type elements.
+     * builtin_element_size: refers to the size of builtin type. If the
+     *                       type is composed of the same builtin type,
+     *                       it is set to size of that type, otherwise it
+     *                       is set to -1.
      */
-    /* Note that here basic_type refers to predefined type, not the builtin
-       type, whereas n_builtin_elements and builtin_element_size refers to builtin type. */
     int      basic_type;
     MPI_Aint n_builtin_elements;
     MPI_Aint builtin_element_size;

http://git.mpich.org/mpich.git/commitdiff/98c76f78106563b1531ce51704f91b789a2fa4d3

commit 98c76f78106563b1531ce51704f91b789a2fa4d3
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Mar 3 19:30:11 2015 -0800

    Rename eltype, n_elements and element_size to better names.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/binding/fortran/use_mpi/create_f90_util.c b/src/binding/fortran/use_mpi/create_f90_util.c
index f81c285..4050025 100644
--- a/src/binding/fortran/use_mpi/create_f90_util.c
+++ b/src/binding/fortran/use_mpi/create_f90_util.c
@@ -112,7 +112,7 @@ int MPIR_Create_unnamed_predefined( MPI_Datatype old, int combiner,
             MPI_Datatype old_basic = MPI_DATATYPE_NULL;
             MPI_Datatype new_basic = MPI_DATATYPE_NULL;
             /* we used MPID_Type_contiguous and then stomped it's contents
-             * information, so make sure that the eltype is usable by
+             * information, so make sure that the basic_type is usable by
              * MPID_Type_commit */
             MPID_Datatype_get_basic_type(old, old_basic);
             MPID_Datatype_get_basic_type(new_dtp->handle, new_basic);
diff --git a/src/mpi/datatype/get_elements_x.c b/src/mpi/datatype/get_elements_x.c
index 9c09702..8a84ca5 100644
--- a/src/mpi/datatype/get_elements_x.c
+++ b/src/mpi/datatype/get_elements_x.c
@@ -175,11 +175,11 @@ PMPI_LOCAL MPI_Count MPIR_Type_get_elements(MPI_Count *bytes_p,
     {
         return MPIR_Type_get_basic_type_elements(bytes_p, count, datatype);
     }
-    else if (datatype_ptr->element_size >= 0) {
+    else if (datatype_ptr->builtin_element_size >= 0) {
         MPI_Datatype basic_type = MPI_DATATYPE_NULL;
-        MPID_Datatype_get_basic_type(datatype_ptr->eltype, basic_type);
+        MPID_Datatype_get_basic_type(datatype_ptr->basic_type, basic_type);
         return MPIR_Type_get_basic_type_elements(bytes_p,
-                                                 count * datatype_ptr->n_elements,
+                                                 count * datatype_ptr->n_builtin_elements,
                                                  basic_type);
     }
     else {
@@ -295,7 +295,7 @@ int MPIR_Get_elements_x_impl(const MPI_Status *status, MPI_Datatype datatype, MP
      * - type with multiple element types (nastiest)
      */
     if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN ||
-        (datatype_ptr->element_size != -1 && datatype_ptr->size > 0))
+        (datatype_ptr->builtin_element_size != -1 && datatype_ptr->size > 0))
     {
         byte_count = MPIR_STATUS_GET_COUNT(*status);
 
@@ -307,7 +307,7 @@ int MPIR_Get_elements_x_impl(const MPI_Status *status, MPI_Datatype datatype, MP
          */
         if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
             MPI_Datatype basic_type = MPI_DATATYPE_NULL;
-            MPID_Datatype_get_basic_type(datatype_ptr->eltype, basic_type);
+            MPID_Datatype_get_basic_type(datatype_ptr->basic_type, basic_type);
             *elements = MPIR_Type_get_basic_type_elements(&byte_count,
                                                           -1,
                                                           basic_type);
@@ -343,7 +343,7 @@ int MPIR_Get_elements_x_impl(const MPI_Status *status, MPI_Datatype datatype, MP
         }
     }
     else /* derived type with weird element type or weird size */ {
-        MPIU_Assert(datatype_ptr->element_size == -1);
+        MPIU_Assert(datatype_ptr->builtin_element_size == -1);
 
         byte_count = MPIR_STATUS_GET_COUNT(*status);
         *elements = MPIR_Type_get_elements(&byte_count, -1, datatype);
diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index 0ea8520..9700ef6 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -76,7 +76,7 @@ static int fill_in_derived_dtp_info(MPIDI_RMA_Op_t * rma_op, MPID_Datatype * dtp
     rma_op->dtype_info.extent = dtp->extent;
     rma_op->dtype_info.dataloop_size = dtp->dataloop_size;
     rma_op->dtype_info.dataloop_depth = dtp->dataloop_depth;
-    rma_op->dtype_info.eltype = dtp->eltype;
+    rma_op->dtype_info.basic_type = dtp->basic_type;
     rma_op->dtype_info.dataloop = dtp->dataloop;
     rma_op->dtype_info.ub = dtp->ub;
     rma_op->dtype_info.lb = dtp->lb;
@@ -692,10 +692,10 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     }
     else {
         MPID_Datatype_get_ptr(rma_op->origin_datatype, origin_dtp_ptr);
-        MPIU_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->eltype != MPI_DATATYPE_NULL);
-        MPID_Datatype_get_size_macro(origin_dtp_ptr->eltype, predefined_dtp_size);
+        MPIU_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->basic_type != MPI_DATATYPE_NULL);
+        MPID_Datatype_get_size_macro(origin_dtp_ptr->basic_type, predefined_dtp_size);
         predefined_dtp_count = total_len / predefined_dtp_size;
-        MPID_Datatype_get_extent_macro(origin_dtp_ptr->eltype, predefined_dtp_extent);
+        MPID_Datatype_get_extent_macro(origin_dtp_ptr->basic_type, predefined_dtp_extent);
     }
     MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 && predefined_dtp_extent > 0);
 
@@ -867,10 +867,10 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     }
     else {
         MPID_Datatype_get_ptr(rma_op->origin_datatype, origin_dtp_ptr);
-        MPIU_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->eltype != MPI_DATATYPE_NULL);
-        MPID_Datatype_get_size_macro(origin_dtp_ptr->eltype, predefined_dtp_size);
+        MPIU_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->basic_type != MPI_DATATYPE_NULL);
+        MPID_Datatype_get_size_macro(origin_dtp_ptr->basic_type, predefined_dtp_size);
         predefined_dtp_count = total_len / predefined_dtp_size;
-        MPID_Datatype_get_extent_macro(origin_dtp_ptr->eltype, predefined_dtp_extent);
+        MPID_Datatype_get_extent_macro(origin_dtp_ptr->basic_type, predefined_dtp_extent);
     }
     MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 && predefined_dtp_extent > 0);
 
diff --git a/src/mpid/ch3/include/mpid_rma_shm.h b/src/mpid/ch3/include/mpid_rma_shm.h
index beef247..e2c108b 100644
--- a/src/mpid/ch3/include/mpid_rma_shm.h
+++ b/src/mpid/ch3/include/mpid_rma_shm.h
@@ -349,8 +349,8 @@ static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_coun
     total_len = origin_dtp_size * origin_count;
 
     MPID_Datatype_get_ptr(origin_datatype, origin_dtp_ptr);
-    MPIU_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->eltype != MPI_DATATYPE_NULL);
-    predefined_type = origin_dtp_ptr->eltype;
+    MPIU_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->basic_type != MPI_DATATYPE_NULL);
+    predefined_type = origin_dtp_ptr->basic_type;
     MPID_Datatype_get_size_macro(predefined_type, predefined_dtp_size);
     predefined_dtp_count = total_len / predefined_dtp_size;
     MPID_Datatype_get_extent_macro(predefined_type, predefined_dtp_extent);
@@ -496,8 +496,8 @@ static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_
     total_len = origin_dtp_size * origin_count;
 
     MPID_Datatype_get_ptr(origin_datatype, origin_dtp_ptr);
-    MPIU_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->eltype != MPI_DATATYPE_NULL);
-    predefined_type = origin_dtp_ptr->eltype;
+    MPIU_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->basic_type != MPI_DATATYPE_NULL);
+    predefined_type = origin_dtp_ptr->basic_type;
     MPID_Datatype_get_size_macro(predefined_type, predefined_dtp_size);
     predefined_dtp_count = total_len / predefined_dtp_size;
     MPID_Datatype_get_extent_macro(predefined_type, predefined_dtp_extent);
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index 9495271..be0e869 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -35,7 +35,7 @@ typedef struct MPIDI_RMA_dtype_info {   /* for derived datatypes */
     void *dataloop;             /* pointer needed to update pointers
                                  * within dataloop on remote side */
     int dataloop_depth;
-    int eltype;
+    int basic_type;
     MPI_Aint ub, lb, true_ub, true_lb;
     int has_sticky_ub, has_sticky_lb;
 } MPIDI_RMA_dtype_info;
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index 01d0788..cc05214 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -868,7 +868,7 @@ static inline int do_accumulate_op(void *source_buf, int source_count, MPI_Datat
 
         MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);
 
-        type = dtp->eltype;
+        type = dtp->basic_type;
         MPIU_Assert(type != MPI_DATATYPE_NULL);
 
         MPIU_Assert(type == source_dtp);
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 04d5acb..2be4bed 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -168,7 +168,7 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq,
     if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype))
         predef_datatype = rreq->dev.datatype;
     else {
-        predef_datatype = rreq->dev.datatype_ptr->eltype;
+        predef_datatype = rreq->dev.datatype_ptr->basic_type;
     }
     MPIU_Assert(predef_datatype != MPI_DATATYPE_NULL);
 
@@ -242,7 +242,7 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
     if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype))
         predef_datatype = rreq->dev.datatype;
     else {
-        predef_datatype = rreq->dev.datatype_ptr->eltype;
+        predef_datatype = rreq->dev.datatype_ptr->basic_type;
     }
     MPIU_Assert(predef_datatype != MPI_DATATYPE_NULL);
 
@@ -544,8 +544,8 @@ int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((u
     /* update new request to get the data */
     MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_ACCUM_RECV);
 
-    MPID_Datatype_get_size_macro(new_dtp->eltype, predef_type_size);
-    MPID_Datatype_get_extent_macro(new_dtp->eltype, predef_type_extent);
+    MPID_Datatype_get_size_macro(new_dtp->basic_type, predef_type_size);
+    MPID_Datatype_get_extent_macro(new_dtp->basic_type, predef_type_extent);
 
     MPIU_Assert(!MPIDI_Request_get_srbuf_flag(rreq));
     /* allocate a SRBuf for receiving stream unit */
@@ -580,7 +580,7 @@ int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((u
 
     MPID_Segment_init(rreq->dev.user_buf,
                       (rreq->dev.recv_data_sz / predef_type_size),
-                      new_dtp->eltype, rreq->dev.segment_ptr, 0);
+                      new_dtp->basic_type, rreq->dev.segment_ptr, 0);
     rreq->dev.segment_first = 0;
     rreq->dev.segment_size = rreq->dev.recv_data_sz;
 
@@ -619,8 +619,8 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((
     /* update new request to get the data */
     MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_GET_ACCUM_RECV);
 
-    MPID_Datatype_get_size_macro(new_dtp->eltype, predef_type_size);
-    MPID_Datatype_get_extent_macro(new_dtp->eltype, predef_type_extent);
+    MPID_Datatype_get_size_macro(new_dtp->basic_type, predef_type_size);
+    MPID_Datatype_get_extent_macro(new_dtp->basic_type, predef_type_extent);
 
     MPIU_Assert(!MPIDI_Request_get_srbuf_flag(rreq));
     /* allocate a SRBuf for receiving stream unit */
@@ -655,7 +655,7 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((
 
     MPID_Segment_init(rreq->dev.user_buf,
                       (rreq->dev.recv_data_sz / predef_type_size),
-                      new_dtp->eltype, rreq->dev.segment_ptr, 0);
+                      new_dtp->basic_type, rreq->dev.segment_ptr, 0);
     rreq->dev.segment_first = 0;
     rreq->dev.segment_size = rreq->dev.recv_data_sz;
 
@@ -911,7 +911,7 @@ static int create_derived_datatype(MPID_Request * req, MPID_Datatype ** dtp)
     new_dtp->extent = dtype_info->extent;
     new_dtp->dataloop_size = dtype_info->dataloop_size;
     new_dtp->dataloop_depth = dtype_info->dataloop_depth;
-    new_dtp->eltype = dtype_info->eltype;
+    new_dtp->basic_type = dtype_info->basic_type;
     /* set dataloop pointer */
     new_dtp->dataloop = req->dev.dataloop;
 
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 3482b8e..9611716 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -553,10 +553,10 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
             MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
         }
         else {
-            MPIU_Assert(origin_dtp->eltype != MPI_DATATYPE_NULL);
-            MPID_Datatype_get_size_macro(origin_dtp->eltype, predefined_dtp_size);
+            MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
+            MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
             predefined_dtp_count = len / predefined_dtp_size;
-            MPID_Datatype_get_extent_macro(origin_dtp->eltype, predefined_dtp_extent);
+            MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
         }
         MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                     predefined_dtp_extent > 0);
@@ -868,10 +868,10 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                 MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
             }
             else {
-                MPIU_Assert(origin_dtp->eltype != MPI_DATATYPE_NULL);
-                MPID_Datatype_get_size_macro(origin_dtp->eltype, predefined_dtp_size);
+                MPIU_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
+                MPID_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
                 predefined_dtp_count = orig_len / predefined_dtp_size;
-                MPID_Datatype_get_extent_macro(origin_dtp->eltype, predefined_dtp_extent);
+                MPID_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
             }
             MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                         predefined_dtp_extent > 0);
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index f407aec..15704eb 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -1516,7 +1516,7 @@ int MPIDI_CH3_PktHandler_Get_AccumResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         }
         else {
             MPIU_Assert(req->dev.datatype_ptr != NULL);
-            predef_type = req->dev.datatype_ptr->eltype;
+            predef_type = req->dev.datatype_ptr->basic_type;
         }
 
         MPID_Datatype_get_extent_macro(predef_type, predef_type_extent);
diff --git a/src/mpid/common/datatype/mpid_dataloop.h b/src/mpid/common/datatype/mpid_dataloop.h
index 7469792..0bb2456 100644
--- a/src/mpid/common/datatype/mpid_dataloop.h
+++ b/src/mpid/common/datatype/mpid_dataloop.h
@@ -64,8 +64,8 @@
 #define DLOOP_Handle_get_size_macro(handle_,size_) \
     MPID_Datatype_get_size_macro(handle_,size_)
 
-#define DLOOP_Handle_get_basic_type_macro(handle_,eltype_) \
-    MPID_Datatype_get_basic_type(handle_, eltype_)
+#define DLOOP_Handle_get_basic_type_macro(handle_,basic_type_) \
+    MPID_Datatype_get_basic_type(handle_, basic_type_)
 
 #define DLOOP_Handle_get_extent_macro(handle_,extent_) \
     MPID_Datatype_get_extent_macro(handle_,extent_)
diff --git a/src/mpid/common/datatype/mpid_datatype.h b/src/mpid/common/datatype/mpid_datatype.h
index 85887d1..6cf8ac0 100644
--- a/src/mpid/common/datatype/mpid_datatype.h
+++ b/src/mpid/common/datatype/mpid_datatype.h
@@ -24,24 +24,24 @@
 
 #define MPID_Datatype_add_ref(datatype_ptr) MPIU_Object_add_ref((datatype_ptr))
 
-#define MPID_Datatype_get_basic_type(a,eltype_) do {                    \
+#define MPID_Datatype_get_basic_type(a,basic_type_) do {                    \
     void *ptr;								\
     switch (HANDLE_GET_KIND(a)) {					\
         case HANDLE_KIND_DIRECT:					\
             ptr = MPID_Datatype_direct+HANDLE_INDEX(a);			\
-            eltype_ = ((MPID_Datatype *) ptr)->eltype;			\
+            basic_type_ = ((MPID_Datatype *) ptr)->basic_type;			\
             break;							\
         case HANDLE_KIND_INDIRECT:					\
             ptr = ((MPID_Datatype *)					\
 		   MPIU_Handle_get_ptr_indirect(a,&MPID_Datatype_mem));	\
-            eltype_ = ((MPID_Datatype *) ptr)->eltype;			\
+            basic_type_ = ((MPID_Datatype *) ptr)->basic_type;			\
             break;							\
         case HANDLE_KIND_BUILTIN:					\
-            eltype_ = a;						\
+            basic_type_ = a;						\
             break;							\
         case HANDLE_KIND_INVALID:					\
         default:							\
-	    eltype_ = 0;						\
+	    basic_type_ = 0;						\
 	    break;							\
  									\
     }									\
@@ -49,8 +49,8 @@
      * a builtin type, it must be a pair type composed of different     \
      * builtin types, so we return MPI_DATATYPE_NULL here.              \
      */                                                                 \
-    if (HANDLE_GET_KIND(eltype_) != HANDLE_KIND_BUILTIN)                \
-        eltype_ = MPI_DATATYPE_NULL;                                    \
+    if (HANDLE_GET_KIND(basic_type_) != HANDLE_KIND_BUILTIN)                \
+        basic_type_ = MPI_DATATYPE_NULL;                                    \
  } while(0)
 
 /* MPID_Datatype_release decrements the reference count on the MPID_Datatype
@@ -376,13 +376,13 @@ typedef struct MPID_Datatype {
     /* element information; used for accumulate and get elements
      *
      * if type is composed of more than one element type, then
-     * eltype == MPI_DATATYPE_NULL and element_size == -1
+     * basic_type == MPI_DATATYPE_NULL and builtin_element_size == -1
      */
-    /* Note that here eltype refers to predefined type, not the builtin
-       type, whereas n_elements and element_size refers to builtin type. */
-    int      eltype;
-    MPI_Aint n_elements;
-    MPI_Aint element_size;
+    /* Note that here basic_type refers to predefined type, not the builtin
+       type, whereas n_builtin_elements and builtin_element_size refers to builtin type. */
+    int      basic_type;
+    MPI_Aint n_builtin_elements;
+    MPI_Aint builtin_element_size;
 
     /* information on contiguity of type, for processing shortcuts.
      *
diff --git a/src/mpid/common/datatype/mpid_type_blockindexed.c b/src/mpid/common/datatype/mpid_type_blockindexed.c
index f90f81f..a3caf24 100644
--- a/src/mpid/common/datatype/mpid_type_blockindexed.c
+++ b/src/mpid/common/datatype/mpid_type_blockindexed.c
@@ -97,9 +97,9 @@ int MPID_Type_blockindexed(int count,
 	new_dtp->has_sticky_ub = 0;
 
 	new_dtp->alignsize    = el_sz; /* ??? */
-	new_dtp->n_elements   = count * blocklength;
-	new_dtp->element_size = el_sz;
-	new_dtp->eltype       = el_type;
+	new_dtp->n_builtin_elements   = count * blocklength;
+	new_dtp->builtin_element_size = el_sz;
+	new_dtp->basic_type       = el_type;
 
 	new_dtp->max_contig_blocks = count;
     }
@@ -109,8 +109,8 @@ int MPID_Type_blockindexed(int count,
 	MPID_Datatype *old_dtp;
 
 	MPID_Datatype_get_ptr(oldtype, old_dtp);
-	el_sz   = old_dtp->element_size;
-	el_type = old_dtp->eltype;
+	el_sz   = old_dtp->builtin_element_size;
+	el_type = old_dtp->basic_type;
 
 	old_lb        = old_dtp->lb;
 	old_true_lb   = old_dtp->true_lb;
@@ -126,9 +126,9 @@ int MPID_Type_blockindexed(int count,
 	new_dtp->has_sticky_ub  = old_dtp->has_sticky_ub;
 
 	new_dtp->alignsize    = old_dtp->alignsize;
-	new_dtp->n_elements   = count * blocklength * old_dtp->n_elements;
-	new_dtp->element_size = el_sz;
-	new_dtp->eltype       = el_type;
+	new_dtp->n_builtin_elements   = count * blocklength * old_dtp->n_builtin_elements;
+	new_dtp->builtin_element_size = el_sz;
+	new_dtp->basic_type       = el_type;
 
 	new_dtp->max_contig_blocks = old_dtp->max_contig_blocks * count * blocklength;
     }
diff --git a/src/mpid/common/datatype/mpid_type_contiguous.c b/src/mpid/common/datatype/mpid_type_contiguous.c
index 565eb72..2aa7c39 100644
--- a/src/mpid/common/datatype/mpid_type_contiguous.c
+++ b/src/mpid/common/datatype/mpid_type_contiguous.c
@@ -82,9 +82,9 @@ int MPID_Type_contiguous(int count,
 	new_dtp->extent        = new_dtp->ub - new_dtp->lb;
 
 	new_dtp->alignsize     = el_sz;
-	new_dtp->n_elements    = count;
-	new_dtp->element_size  = el_sz;
-        new_dtp->eltype        = el_type;
+	new_dtp->n_builtin_elements    = count;
+	new_dtp->builtin_element_size  = el_sz;
+        new_dtp->basic_type        = el_type;
 	new_dtp->is_contig     = 1;
         new_dtp->max_contig_blocks = 1;
 
@@ -95,8 +95,8 @@ int MPID_Type_contiguous(int count,
 	MPID_Datatype *old_dtp;
 
 	MPID_Datatype_get_ptr(oldtype, old_dtp);
-	el_sz   = old_dtp->element_size;
-	el_type = old_dtp->eltype;
+	el_sz   = old_dtp->builtin_element_size;
+	el_type = old_dtp->basic_type;
 
 	new_dtp->size           = count * old_dtp->size;
 	new_dtp->has_sticky_ub  = old_dtp->has_sticky_ub;
@@ -117,9 +117,9 @@ int MPID_Type_contiguous(int count,
 	new_dtp->extent  = new_dtp->ub - new_dtp->lb;
 
 	new_dtp->alignsize    = old_dtp->alignsize;
-	new_dtp->n_elements   = count * old_dtp->n_elements;
-	new_dtp->element_size = old_dtp->element_size;
-        new_dtp->eltype       = el_type;
+	new_dtp->n_builtin_elements   = count * old_dtp->n_builtin_elements;
+	new_dtp->builtin_element_size = old_dtp->builtin_element_size;
+        new_dtp->basic_type       = el_type;
 
 	new_dtp->is_contig    = old_dtp->is_contig;
         if(old_dtp->is_contig)
diff --git a/src/mpid/common/datatype/mpid_type_create_pairtype.c b/src/mpid/common/datatype/mpid_type_create_pairtype.c
index 28ea16f..2db382c 100644
--- a/src/mpid/common/datatype/mpid_type_create_pairtype.c
+++ b/src/mpid/common/datatype/mpid_type_create_pairtype.c
@@ -119,9 +119,9 @@ int MPID_Type_create_pairtype(MPI_Datatype type,
 	    /* --END ERROR HANDLING-- */
     }
 
-    new_dtp->n_elements      = 2;
-    new_dtp->element_size    = el_size;
-    new_dtp->eltype          = type;
+    new_dtp->n_builtin_elements      = 2;
+    new_dtp->builtin_element_size    = el_size;
+    new_dtp->basic_type          = type;
 
     new_dtp->has_sticky_lb   = 0;
     new_dtp->true_lb         = 0;
diff --git a/src/mpid/common/datatype/mpid_type_create_resized.c b/src/mpid/common/datatype/mpid_type_create_resized.c
index 75b535f..05945ae 100644
--- a/src/mpid/common/datatype/mpid_type_create_resized.c
+++ b/src/mpid/common/datatype/mpid_type_create_resized.c
@@ -56,10 +56,10 @@ int MPID_Type_create_resized(MPI_Datatype oldtype,
 	new_dtp->ub             = lb + extent;
 	new_dtp->extent         = extent;
 	new_dtp->alignsize      = oldsize; /* FIXME ??? */
-	new_dtp->n_elements     = 1;
-	new_dtp->element_size   = oldsize;
+	new_dtp->n_builtin_elements     = 1;
+	new_dtp->builtin_element_size   = oldsize;
 	new_dtp->is_contig      = (extent == oldsize) ? 1 : 0;
-        new_dtp->eltype         = oldtype;
+        new_dtp->basic_type         = oldtype;
 	new_dtp->max_contig_blocks = 3;  /* lb, data, ub */
     }
     else
@@ -79,9 +79,9 @@ int MPID_Type_create_resized(MPI_Datatype oldtype,
 	new_dtp->ub             = lb + extent;
 	new_dtp->extent         = extent;
 	new_dtp->alignsize      = old_dtp->alignsize;
-	new_dtp->n_elements     = old_dtp->n_elements;
-	new_dtp->element_size   = old_dtp->element_size;
-        new_dtp->eltype         = old_dtp->eltype;
+	new_dtp->n_builtin_elements     = old_dtp->n_builtin_elements;
+	new_dtp->builtin_element_size   = old_dtp->builtin_element_size;
+        new_dtp->basic_type         = old_dtp->basic_type;
 
 	new_dtp->is_contig      =
 	    (extent == old_dtp->size) ? old_dtp->is_contig : 0;
diff --git a/src/mpid/common/datatype/mpid_type_debug.c b/src/mpid/common/datatype/mpid_type_debug.c
index 25a81ed..6c607be 100644
--- a/src/mpid/common/datatype/mpid_type_debug.c
+++ b/src/mpid/common/datatype/mpid_type_debug.c
@@ -474,9 +474,9 @@ void MPIDU_Datatype_debug(MPI_Datatype type,
 		    (MPI_Aint) dtp->ub,
 		    (dtp->has_sticky_ub) ? "(sticky)" : "",
 		    (MPI_Aint) dtp->extent,
-		    (MPI_Aint) dtp->element_size,
-		    dtp->element_size == -1 ? "multiple types" :
-		    MPIDU_Datatype_builtin_to_string(dtp->eltype),
+		    (MPI_Aint) dtp->builtin_element_size,
+		    dtp->builtin_element_size == -1 ? "multiple types" :
+		    MPIDU_Datatype_builtin_to_string(dtp->basic_type),
 		    dtp->is_contig ? "is N contig" : "is not N contig"));
 
     MPIU_DBG_OUT(DATATYPE,"# Contents:");
diff --git a/src/mpid/common/datatype/mpid_type_dup.c b/src/mpid/common/datatype/mpid_type_dup.c
index 2fdae83..ae24dad 100644
--- a/src/mpid/common/datatype/mpid_type_dup.c
+++ b/src/mpid/common/datatype/mpid_type_dup.c
@@ -72,9 +72,9 @@ int MPID_Type_dup(MPI_Datatype oldtype,
 	new_dtp->cache_id      = -1;   /* ??? */
 	new_dtp->name[0]       = 0;    /* The Object name is not copied on
 					  a dup */
-	new_dtp->n_elements    = old_dtp->n_elements;
-	new_dtp->element_size  = old_dtp->element_size;
-	new_dtp->eltype        = old_dtp->eltype;
+	new_dtp->n_builtin_elements    = old_dtp->n_builtin_elements;
+	new_dtp->builtin_element_size  = old_dtp->builtin_element_size;
+	new_dtp->basic_type        = old_dtp->basic_type;
 	
 	new_dtp->dataloop       = NULL;
 	new_dtp->dataloop_size  = old_dtp->dataloop_size;
diff --git a/src/mpid/common/datatype/mpid_type_indexed.c b/src/mpid/common/datatype/mpid_type_indexed.c
index f64acea..6674335 100644
--- a/src/mpid/common/datatype/mpid_type_indexed.c
+++ b/src/mpid/common/datatype/mpid_type_indexed.c
@@ -111,8 +111,8 @@ int MPID_Type_indexed(int count,
 	new_dtp->has_sticky_lb = 0;
 
         MPIU_Assign_trunc(new_dtp->alignsize, el_sz, MPI_Aint);
-	new_dtp->element_size = el_sz;
-	new_dtp->eltype       = el_type;
+	new_dtp->builtin_element_size = el_sz;
+	new_dtp->basic_type       = el_type;
 
 	new_dtp->max_contig_blocks = count;
     }
@@ -123,13 +123,13 @@ int MPID_Type_indexed(int count,
 
 	MPID_Datatype_get_ptr(oldtype, old_dtp);
 
-	/* Ensure that "element_size" fits into an int datatype. */
-	MPID_Ensure_Aint_fits_in_int(old_dtp->element_size);
+	/* Ensure that "builtin_element_size" fits into an int datatype. */
+	MPID_Ensure_Aint_fits_in_int(old_dtp->builtin_element_size);
 
-	el_sz   = old_dtp->element_size;
+	el_sz   = old_dtp->builtin_element_size;
 	old_sz  = old_dtp->size;
-	el_ct   = old_dtp->n_elements;
-	el_type = old_dtp->eltype;
+	el_ct   = old_dtp->n_builtin_elements;
+	el_type = old_dtp->basic_type;
 
 	old_lb        = old_dtp->lb;
 	old_true_lb   = old_dtp->true_lb;
@@ -140,8 +140,8 @@ int MPID_Type_indexed(int count,
 
 	new_dtp->has_sticky_lb = old_dtp->has_sticky_lb;
 	new_dtp->has_sticky_ub = old_dtp->has_sticky_ub;
-	new_dtp->element_size  = (MPI_Aint) el_sz;
-	new_dtp->eltype        = el_type;
+	new_dtp->builtin_element_size  = (MPI_Aint) el_sz;
+	new_dtp->basic_type        = el_type;
 
         new_dtp->max_contig_blocks = 0;
         for(i=0; i<count; i++)
@@ -207,7 +207,7 @@ int MPID_Type_indexed(int count,
     new_dtp->true_ub = max_ub + (old_true_ub - old_ub);
     new_dtp->extent  = max_ub - min_lb;
 
-    new_dtp->n_elements = old_ct * el_ct;
+    new_dtp->n_builtin_elements = old_ct * el_ct;
 
     /* new type is only contig for N types if it's all one big
      * block, its size and extent are the same, and the old type
diff --git a/src/mpid/common/datatype/mpid_type_struct.c b/src/mpid/common/datatype/mpid_type_struct.c
index 72e23fd..057959c 100644
--- a/src/mpid/common/datatype/mpid_type_struct.c
+++ b/src/mpid/common/datatype/mpid_type_struct.c
@@ -248,11 +248,11 @@ int MPID_Type_struct(int count,
 	{
 	    MPID_Datatype_get_ptr(oldtype_array[i], old_dtp);
 
-	    /* Ensure that "element_size" fits into an int datatype. */
-	    MPID_Ensure_Aint_fits_in_int(old_dtp->element_size);
+	    /* Ensure that "builtin_element_size" fits into an int datatype. */
+	    MPID_Ensure_Aint_fits_in_int(old_dtp->builtin_element_size);
 
-	    tmp_el_sz   = old_dtp->element_size;
-	    tmp_el_type = old_dtp->eltype;
+	    tmp_el_sz   = old_dtp->builtin_element_size;
+	    tmp_el_type = old_dtp->basic_type;
 
 	    MPID_DATATYPE_BLOCK_LB_UB((MPI_Aint) blocklength_array[i],
 				      displacement_array[i],
@@ -386,9 +386,9 @@ int MPID_Type_struct(int count,
 	}
     }
 
-    new_dtp->n_elements = -1; /* TODO */
-    new_dtp->element_size = el_sz;
-    new_dtp->eltype = el_type;
+    new_dtp->n_builtin_elements = -1; /* TODO */
+    new_dtp->builtin_element_size = el_sz;
+    new_dtp->basic_type = el_type;
 
     new_dtp->has_sticky_lb = found_sticky_lb;
     new_dtp->true_lb       = true_lb_disp;
diff --git a/src/mpid/common/datatype/mpid_type_vector.c b/src/mpid/common/datatype/mpid_type_vector.c
index 2b3aa07..67e398f 100644
--- a/src/mpid/common/datatype/mpid_type_vector.c
+++ b/src/mpid/common/datatype/mpid_type_vector.c
@@ -93,9 +93,9 @@ int MPID_Type_vector(int count,
 	new_dtp->has_sticky_ub  = 0;
 
 	new_dtp->alignsize    = el_sz; /* ??? */
-	new_dtp->n_elements   = count * blocklength;
-	new_dtp->element_size = el_sz;
-	new_dtp->eltype       = el_type;
+	new_dtp->n_builtin_elements   = count * blocklength;
+	new_dtp->builtin_element_size = el_sz;
+	new_dtp->basic_type       = el_type;
 
 	new_dtp->max_contig_blocks = count;
 
@@ -105,8 +105,8 @@ int MPID_Type_vector(int count,
 	MPID_Datatype *old_dtp;
 
 	MPID_Datatype_get_ptr(oldtype, old_dtp);
-	el_sz   = old_dtp->element_size;
-	el_type = old_dtp->eltype;
+	el_sz   = old_dtp->builtin_element_size;
+	el_type = old_dtp->basic_type;
 
 	old_lb        = old_dtp->lb;
 	old_true_lb   = old_dtp->true_lb;
@@ -121,9 +121,9 @@ int MPID_Type_vector(int count,
 	new_dtp->has_sticky_ub  = old_dtp->has_sticky_ub;
 
 	new_dtp->alignsize    = old_dtp->alignsize;
-	new_dtp->n_elements   = count * blocklength * old_dtp->n_elements;
-	new_dtp->element_size = el_sz;
-	new_dtp->eltype       = el_type;
+	new_dtp->n_builtin_elements   = count * blocklength * old_dtp->n_builtin_elements;
+	new_dtp->builtin_element_size = el_sz;
+	new_dtp->basic_type       = el_type;
 
 	new_dtp->max_contig_blocks = old_dtp->max_contig_blocks * count * blocklength;
 
diff --git a/src/mpid/common/datatype/mpid_type_zerolen.c b/src/mpid/common/datatype/mpid_type_zerolen.c
index 509268f..630eb46 100644
--- a/src/mpid/common/datatype/mpid_type_zerolen.c
+++ b/src/mpid/common/datatype/mpid_type_zerolen.c
@@ -68,9 +68,9 @@ int MPID_Type_zerolen(MPI_Datatype *newtype)
     new_dtp->extent        = 0;
     
     new_dtp->alignsize     = 0;
-    new_dtp->element_size  = 0;
-    new_dtp->eltype        = 0;
-    new_dtp->n_elements    = 0;
+    new_dtp->builtin_element_size  = 0;
+    new_dtp->basic_type        = 0;
+    new_dtp->n_builtin_elements    = 0;
     new_dtp->is_contig     = 1;
 
     *newtype = new_dtp->handle;
diff --git a/src/mpid/common/hcoll/hcoll_rte.c b/src/mpid/common/hcoll/hcoll_rte.c
index b6a3ea7..7dc1c3b 100644
--- a/src/mpid/common/hcoll/hcoll_rte.c
+++ b/src/mpid/common/hcoll/hcoll_rte.c
@@ -103,7 +103,7 @@ static inline int count_total_dte_repeat_entries(struct dte_data_representation_
     struct dte_generalized_iovec_t *dte_iovec = data->rep.general_rep->data_representation.data;
     int total_entries_number = 0;
     for (i = 0; i < dte_iovec->repeat_count; i++) {
-        total_entries_number += dte_iovec->repeat[i].n_elements;
+        total_entries_number += dte_iovec->repeat[i].n_builtin_elements;
     }
     return total_entries_number;
 }
@@ -161,7 +161,7 @@ static int recv_nb(struct dte_data_representation_t data,
         repeat = data.rep.general_rep->data_representation.data->repeat;
         repeat_count = data.rep.general_rep->data_representation.data->repeat_count;
         for (i = 0; i < repeat_count; i++) {
-            for (j = 0; j < repeat[i].n_elements; j++) {
+            for (j = 0; j < repeat[i].n_builtin_elements; j++) {
                 char *repeat_unit = (char *) &repeat[i];
                 buf = (void *) (repeat_unit + repeat[i].elements[j].base_offset);
                 len = repeat[i].elements[j].packed_size;
@@ -228,7 +228,7 @@ static int send_nb(dte_data_representation_t data,
         repeat = data.rep.general_rep->data_representation.data->repeat;
         repeat_count = data.rep.general_rep->data_representation.data->repeat_count;
         for (i = 0; i < repeat_count; i++) {
-            for (j = 0; j < repeat[i].n_elements; j++) {
+            for (j = 0; j < repeat[i].n_builtin_elements; j++) {
                 char *repeat_unit = (char *) &repeat[i];
                 buf = (void *) (repeat_unit + repeat[i].elements[j].base_offset);
                 len = repeat[i].elements[j].packed_size;

http://git.mpich.org/mpich.git/commitdiff/5132e070e0499b3a6fa2955710bef4c699d531fc

commit 5132e070e0499b3a6fa2955710bef4c699d531fc
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Feb 24 11:25:03 2015 -0800

    Correct the usage of req's segment_first and segment_size in sendNonContig
    
    The implementations of sendNoncontig for intra-node communication in
    Nemesis and inter-node communication in network modules (except for
    TCP and SCIF) assume that req->dev.segment_first is zero and
    req->dev.segment_size is the size of data, which is not always true.
    When an RMA operation is streamed and only part of the derived-datatype
    data is issued at a time, req->dev.segment_first specifies the current
    starting offset of the data and req->dev.segment_size specifies the
    current ending offset of the data. The size of the data to be sent is
    therefore (req->dev.segment_size - req->dev.segment_first).
    This patch corrects this issue in Nemesis and network modules.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/include/mpid_nem_inline.h b/src/mpid/ch3/channels/nemesis/include/mpid_nem_inline.h
index 4d2dadd..9601eac 100644
--- a/src/mpid/ch3/channels/nemesis/include/mpid_nem_inline.h
+++ b/src/mpid/ch3/channels/nemesis/include/mpid_nem_inline.h
@@ -431,7 +431,6 @@ MPID_nem_mpich_send_seg_header (MPID_Segment *segment, MPIDI_msg_sz_t *segment_f
 
     MPIU_Assert(vc_ch->is_local); /* netmods will have their own implementation */
     MPIU_Assert(header_sz <= sizeof(MPIDI_CH3_Pkt_t));
-    MPIU_Assert(*segment_first == 0); /* this routine is only called for new messages */
     
     
     DO_PAPI (PAPI_reset (PAPI_EventSet));
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
index 45c2006..0813a47 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c
@@ -772,10 +772,9 @@ static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq,
     MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG_CORE);
     MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_SENDNONCONTIG_CORE);
 
-    MPIU_Assert(sreq->dev.segment_first == 0);
     last = sreq->dev.segment_size;      /* segment_size is byte offset */
     if (last > 0) {
-        REQ_FIELD(sreq, lmt_pack_buf) = MPIU_Malloc((size_t) sreq->dev.segment_size);
+        REQ_FIELD(sreq, lmt_pack_buf) = MPIU_Malloc((size_t) (sreq->dev.segment_size - sreq->dev.segment_first));
         MPIU_ERR_CHKANDJUMP(!REQ_FIELD(sreq, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER,
                             "**outofmemory");
         MPID_Segment_pack(sreq->dev.segment_ptr, sreq->dev.segment_first, &last,
@@ -792,7 +791,7 @@ static int MPID_nem_ib_SendNoncontig_core(MPIDI_VC_t * vc, MPID_Request * sreq,
             || (((MPIDI_CH3_Pkt_t *) hdr)->type == MPIDI_CH3_PKT_ACCUMULATE))) {
 	/* If request length is too long, create LMT packet */
 	if ( MPID_NEM_IB_NETMOD_HDR_SIZEOF(vc_ib->ibcom->local_ringbuf_type)
-               + sizeof(MPIDI_CH3_Pkt_t) + sreq->dev.segment_size
+               + sizeof(MPIDI_CH3_Pkt_t) + sreq->dev.segment_size - sreq->dev.segment_first
                  > MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_netmod_trailer_t)) {
             pkt_netmod.type = MPIDI_NEM_PKT_NETMOD;
 
diff --git a/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_send.c b/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_send.c
index 69f3adc..fd52108 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/mxm/mxm_send.c
@@ -168,7 +168,7 @@ int MPID_nem_mxm_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
     _dbg_mxm_output(5,
                     "SendNoncontig ========> Sending ADI msg (to=%d type=%d) for req %p (data_size %d, %d) \n",
                     vc->pg_rank, sreq->dev.pending_pkt.type, sreq, sizeof(MPIDI_CH3_Pkt_t),
-                    sreq->dev.segment_size);
+                    sreq->dev.segment_size-sreq->dev.segment_first);
 
     vc_area = VC_BASE(vc);
     req_area = REQ_BASE(sreq);
@@ -179,17 +179,16 @@ int MPID_nem_mxm_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
     req_area->iov_buf[0].ptr = (void *) &(sreq->dev.pending_pkt);
     req_area->iov_buf[0].length = sizeof(MPIDI_CH3_Pkt_t);
 
-    MPIU_Assert(sreq->dev.segment_first == 0);
     last = sreq->dev.segment_size;
     if (last > 0) {
-        sreq->dev.tmpbuf = MPIU_Malloc((size_t) sreq->dev.segment_size);
+        sreq->dev.tmpbuf = MPIU_Malloc((size_t) (sreq->dev.segment_size - sreq->dev.segment_first));
         MPIU_Assert(sreq->dev.tmpbuf);
         MPID_Segment_pack(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, sreq->dev.tmpbuf);
         MPIU_Assert(last == sreq->dev.segment_size);
 
         req_area->iov_count = 2;
         req_area->iov_buf[1].ptr = sreq->dev.tmpbuf;
-        req_area->iov_buf[1].length = last;
+        req_area->iov_buf[1].length = last - sreq->dev.segment_first;
     }
 
     vc_area->pending_sends += 1;
diff --git a/src/mpid/ch3/channels/nemesis/netmod/newmad/newmad_send.c b/src/mpid/ch3/channels/nemesis/netmod/newmad/newmad_send.c
index 8c83d50..23ee225 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/newmad/newmad_send.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/newmad/newmad_send.c
@@ -127,6 +127,7 @@ int MPID_nem_newmad_SendNoncontig(MPIDI_VC_t *vc, MPID_Request *sreq, void *head
     struct iovec   newmad_iov[2];
     int            num_iov = 1;
     MPIDI_msg_sz_t last;
+    MPIDI_msg_sz_t data_sz;
 
     /*
     struct iovec  *newmad_iov = (struct iovec *)MPIU_Malloc(NMAD_IOV_MAX_DEPTH*sizeof(struct iovec));
@@ -147,11 +148,11 @@ int MPID_nem_newmad_SendNoncontig(MPIDI_VC_t *vc, MPID_Request *sreq, void *head
     newmad_iov[0].iov_base = (char *)&(sreq->dev.pending_pkt);
     newmad_iov[0].iov_len  = sizeof(MPIDI_CH3_Pkt_t);
 
-    MPIU_Assert(sreq->dev.segment_first == 0);
+    data_sz = sreq->dev.segment_size - sreq->dev.segment_first;
     last = sreq->dev.segment_size;
-    if (last > 0)
+    if (data_sz > 0)
     {
-	sreq->dev.tmpbuf = MPIU_Malloc((size_t)sreq->dev.segment_size);
+	sreq->dev.tmpbuf = MPIU_Malloc((size_t) data_sz);
         REQ_FIELD(sreq,deltmpbuf) = TMP_DEL_VALUE;
         MPID_Segment_pack(sreq->dev.segment_ptr,sreq->dev.segment_first, &last,(char *)(sreq->dev.tmpbuf));
 	MPIU_Assert(last == sreq->dev.segment_size);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/ofi/ofi_msg.c b/src/mpid/ch3/channels/nemesis/netmod/ofi/ofi_msg.c
index 49a6277..2a67e31 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/ofi/ofi_msg.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/ofi/ofi_msg.c
@@ -190,17 +190,19 @@ int MPID_nem_ofi_SendNoncontig(MPIDI_VC_t * vc,
     MPI_Aint data_sz;
     uint64_t match_bits;
     MPID_Request *cts_req;
+    MPIDI_msg_sz_t first, last;
 
     BEGIN_FUNC(FCNAME);
     MPIU_Assert(hdr_sz <= (MPIDI_msg_sz_t) sizeof(MPIDI_CH3_Pkt_t));
-    MPIU_Assert(sreq->dev.segment_first == 0);
 
-    data_sz = sreq->dev.segment_size;
+    first = sreq->dev.segment_first;
+    last = sreq->dev.segment_size;
+    data_sz = sreq->dev.segment_size - sreq->dev.segment_first;
     pkt_len = sizeof(MPIDI_CH3_Pkt_t) + data_sz;
     pack_buffer = MPIU_Malloc(pkt_len);
     MPIU_Assert(pack_buffer);
     MPIU_Memcpy(pack_buffer, hdr, hdr_sz);
-    MPID_Segment_pack(sreq->dev.segment_ptr, 0, &data_sz, pack_buffer + sizeof(MPIDI_CH3_Pkt_t));
+    MPID_Segment_pack(sreq->dev.segment_ptr, first, &last, pack_buffer + sizeof(MPIDI_CH3_Pkt_t));
     START_COMM();
     MPID_nem_ofi_poll(MPID_NONBLOCKING_POLL);
     END_FUNC_RC(FCNAME);
diff --git a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_nm.c b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_nm.c
index 0564ac0..fcc5e01 100644
--- a/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_nm.c
+++ b/src/mpid/ch3/channels/nemesis/netmod/portals4/ptl_nm.c
@@ -205,15 +205,14 @@ static int send_noncontig_pkt(MPIDI_VC_t *vc, MPID_Request *sreq, void *hdr_p)
     MPID_nem_ptl_vc_area *const vc_ptl = VC_PTL(vc);
     int ret;
     char *sendbuf;
-    const size_t sent_sz = sreq->dev.segment_size < PAYLOAD_SIZE ? sreq->dev.segment_size : PAYLOAD_SIZE;
+    const size_t data_sz = sreq->dev.segment_size - sreq->dev.segment_first;
+    const size_t sent_sz = data_sz < PAYLOAD_SIZE ? data_sz : PAYLOAD_SIZE;
     const size_t sendbuf_sz = SENDBUF_SIZE(sent_sz);
-    const size_t remaining = sreq->dev.segment_size - sent_sz;
+    const size_t remaining = data_sz - sent_sz;
     ptl_match_bits_t match_bits = NPTL_MATCH(CTL_TAG, 0, MPIDI_Process.my_pg_rank);
     MPIDI_STATE_DECL(MPID_STATE_SEND_NONCONTIG_PKT);
     MPIDI_FUNC_ENTER(MPID_STATE_SEND_NONCONTIG_PKT);
 
-    MPIU_Assert(sreq->dev.segment_first == 0);
-
     sendbuf = MPIU_Malloc(sendbuf_sz);
     MPIU_Assert(sendbuf != NULL);
     MPIU_Memcpy(sendbuf, hdr_p, sizeof(MPIDI_CH3_Pkt_t));
@@ -221,15 +220,16 @@ static int send_noncontig_pkt(MPIDI_VC_t *vc, MPID_Request *sreq, void *hdr_p)
     REQ_PTL(sreq)->num_gets = 0;
     REQ_PTL(sreq)->put_done = 0;
 
-    if (sreq->dev.segment_size) {
-        MPIDI_msg_sz_t last = sent_sz;
-        MPID_Segment_pack(sreq->dev.segment_ptr, 0, &last, sendbuf + sizeof(MPIDI_CH3_Pkt_t));
+    if (data_sz) {
+        MPIDI_msg_sz_t first = sreq->dev.segment_first;
+        MPIDI_msg_sz_t last = sreq->dev.segment_first + sent_sz;
+        MPID_Segment_pack(sreq->dev.segment_ptr, first, &last, sendbuf + sizeof(MPIDI_CH3_Pkt_t));
 
         if (remaining) {  /* Post MEs for the remote gets */
             TMPBUF(sreq) = MPIU_Malloc(remaining);
-            sreq->dev.segment_first = last;
+            first = last;
             last = sreq->dev.segment_size;
-            MPID_Segment_pack(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, TMPBUF(sreq));
+            MPID_Segment_pack(sreq->dev.segment_ptr, first, &last, TMPBUF(sreq));
             MPIU_Assert(last == sreq->dev.segment_size);
 
             mpi_errno = meappend_large(vc_ptl->id, sreq, NPTL_MATCH(GET_TAG, 0, MPIDI_Process.my_pg_rank), TMPBUF(sreq), remaining);

http://git.mpich.org/mpich.git/commitdiff/7c890ab2a746d59040aaa8e73d2d620e27d01f0d

commit 7c890ab2a746d59040aaa8e73d2d620e27d01f0d
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Mar 3 05:16:26 2015 -0600

    Modify SHM ACC/GACC to avoid allocating a large buffer.
    
    The original implementation of ACC/GACC on SHM first
    allocates a temporary buffer which has the same data
    layout as the target data, copies the entire origin
    data to that temporary buffer, and then performs the
    ACC computation between the temporary buffer and the
    target buffer. The temporary buffer can potentially use
    a large amount of memory.
    
    This patch fixes this issue as follows: (1) SHM ACC/GACC
    routines directly call the do_accumulate_op() function, which
    requires the origin data to be in packed form;
    (2) if the origin data is of a basic type, we directly perform
    do_accumulate_op() between the origin buffer and the target
    buffer; if the origin data is of a derived type, we stream the
    origin data by copying one part of it at a time into a packed
    streaming buffer and performing do_accumulate_op() between the
    streaming buffer and the target buffer each time.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_shm.h b/src/mpid/ch3/include/mpid_rma_shm.h
index f1ef81f..beef247 100644
--- a/src/mpid/ch3/include/mpid_rma_shm.h
+++ b/src/mpid/ch3/include/mpid_rma_shm.h
@@ -10,6 +10,13 @@
 #include "mpl_utlist.h"
 #include "mpid_rma_types.h"
 
+/* define ACC stream size as the SRBuf size */
+#define MPIDI_CH3U_Acc_stream_size MPIDI_CH3U_SRBuf_size
+
+static inline int do_accumulate_op(void *source_buf, int source_count, MPI_Datatype source_dtp,
+                                   void *target_buf, int target_count, MPI_Datatype target_dtp,
+                                   MPI_Aint stream_offset, MPI_Op acc_op);
+
 #define ASSIGN_COPY(src, dest, count, type)     \
     {                                           \
         type *src_ = (type *) src;              \
@@ -297,10 +304,14 @@ static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_coun
 {
     void *base = NULL;
     int disp_unit, shm_op = 0;
-    MPI_User_function *uop = NULL;
-    MPID_Datatype *dtp;
     int mpi_errno = MPI_SUCCESS;
-    MPIU_CHKLMEM_DECL(2);
+    int i;
+    MPI_Datatype predefined_type;
+    MPI_Aint stream_elem_count, stream_unit_count;
+    MPI_Aint predefined_dtp_size, predefined_dtp_extent, predefined_dtp_count;
+    MPI_Aint total_len, rest_len;
+    MPI_Aint origin_dtp_size;
+    MPID_Datatype *origin_dtp_ptr = NULL;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_ACC_OP);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_ACC_OP);
@@ -317,156 +328,95 @@ static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_coun
         disp_unit = win_ptr->disp_unit;
     }
 
-    if (op == MPI_REPLACE) {
-        if (shm_op)
+    if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
+        if (shm_op) {
             MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-        mpi_errno = shm_copy(origin_addr, origin_count, origin_datatype,
-                             (char *) base + disp_unit * target_disp, target_count,
-                             target_datatype);
-        if (shm_op)
+        }
+        mpi_errno = do_accumulate_op((void*)origin_addr, origin_count, origin_datatype,
+                                     (void*)((char *)base+disp_unit*target_disp), target_count, target_datatype,
+                                     0, op);
+        if (shm_op) {
             MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
         }
+
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
         goto fn_exit;
     }
 
-    MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN),
-                         mpi_errno, MPI_ERR_OP, "**opnotpredefined", "**opnotpredefined %d", op);
+    /* Get total length of origin data */
+    MPID_Datatype_get_size_macro(origin_datatype, origin_dtp_size);
+    total_len = origin_dtp_size * origin_count;
 
-    /* get the function by indexing into the op table */
-    uop = MPIR_OP_HDL_TO_FN(op);
+    MPID_Datatype_get_ptr(origin_datatype, origin_dtp_ptr);
+    MPIU_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->eltype != MPI_DATATYPE_NULL);
+    predefined_type = origin_dtp_ptr->eltype;
+    MPID_Datatype_get_size_macro(predefined_type, predefined_dtp_size);
+    predefined_dtp_count = total_len / predefined_dtp_size;
+    MPID_Datatype_get_extent_macro(predefined_type, predefined_dtp_extent);
+    MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 && predefined_dtp_extent > 0);
 
-    if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
-        MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
-        /* Cast away const'ness for origin_address in order to
-         * avoid changing the prototype for MPI_User_function */
-        if (shm_op)
-            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-        (*uop) ((void *) origin_addr, (char *) base + disp_unit * target_disp,
-                &target_count, &target_datatype);
-        if (shm_op)
-            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-    }
-    else {
-        /* derived datatype */
+    stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
+    stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
+    MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);
 
-        MPID_Segment *segp;
-        DLOOP_VECTOR *dloop_vec;
+    rest_len = total_len;
+    for (i = 0; i < stream_unit_count; i++) {
+        MPID_Segment *seg = NULL;
+        void *packed_buf = NULL;
         MPI_Aint first, last;
-        int vec_len, i, type_size, count;
-        MPI_Aint type_extent;
-        MPI_Datatype type;
-        MPI_Aint true_lb, true_extent, extent;
-        void *tmp_buf = NULL, *target_buf;
-        const void *source_buf;
-        MPI_Aint curr_len;
-        void *curr_loc;
-
-        if (origin_datatype != target_datatype) {
-            /* first copy the data into a temporary buffer with
-             * the same datatype as the target. Then do the
-             * accumulate operation. */
-
-            MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent);
-            MPID_Datatype_get_extent_macro(target_datatype, extent);
-
-            MPIU_CHKLMEM_MALLOC(tmp_buf, void *,
-                                target_count * (MPIR_MAX(extent, true_extent)),
-                                mpi_errno, "temporary buffer");
-            /* adjust for potential negative lower bound in datatype */
-            tmp_buf = (void *) ((char *) tmp_buf - true_lb);
-
-            mpi_errno = MPIR_Localcopy(origin_addr, origin_count,
-                                       origin_datatype, tmp_buf, target_count, target_datatype);
-            if (mpi_errno) {
-                MPIU_ERR_POP(mpi_errno);
-            }
+        int is_predef_contig;
+        MPI_Aint stream_offset, stream_size, stream_count;
+
+        stream_offset = i * stream_elem_count * predefined_dtp_size;
+        stream_size = MPIR_MIN(stream_elem_count * predefined_dtp_size, rest_len);
+        stream_count = stream_size / predefined_dtp_size;
+        rest_len -= stream_size;
+
+        first = stream_offset;
+        last = stream_offset + stream_size;
+
+        packed_buf = MPIU_Malloc(stream_size);
+
+        seg = MPID_Segment_alloc();
+        MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment");
+        MPID_Segment_init(origin_addr, origin_count, origin_datatype, seg, 0);
+        MPID_Segment_pack(seg, first, &last, packed_buf);
+        MPID_Segment_free(seg);
+
+        MPID_Datatype_is_contig(predefined_type, &is_predef_contig);
+
+        if (!is_predef_contig) {
+            void *tmpbuf = MPIU_Malloc(stream_count * predefined_dtp_extent);
+            mpi_errno = MPIR_Localcopy(tmpbuf, stream_count, predefined_type,
+                                       packed_buf, stream_size, MPI_BYTE);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            MPIU_Free(packed_buf);
+            packed_buf = tmpbuf;
         }
 
-        if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
-            /* target predefined type, origin derived datatype */
-
-            if (shm_op)
-                MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-            (*uop) (tmp_buf, (char *) base + disp_unit * target_disp,
-                    &target_count, &target_datatype);
-            if (shm_op)
-                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+        if (shm_op) {
+            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
         }
-        else {
-
-            segp = MPID_Segment_alloc();
-            MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER,
-                                 "**nomem", "**nomem %s", "MPID_Segment_alloc");
-            MPID_Segment_init(NULL, target_count, target_datatype, segp, 0);
-            first = 0;
-            last = SEGMENT_IGNORE_LAST;
-
-            MPID_Datatype_get_ptr(target_datatype, dtp);
-            vec_len = dtp->max_contig_blocks * target_count + 1;
-            /* +1 needed because Rob says so */
-            MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *,
-                                vec_len * sizeof(DLOOP_VECTOR), mpi_errno, "dloop vector");
-
-            MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);
-
-            source_buf = (tmp_buf != NULL) ? (const void *) tmp_buf : origin_addr;
-            target_buf = (char *) base + disp_unit * target_disp;
-            type = dtp->eltype;
-
-            MPIU_Assert(type != MPI_DATATYPE_NULL);
-
-            MPID_Datatype_get_size_macro(type, type_size);
-            MPID_Datatype_get_extent_macro(type, type_extent);
-
-            if (shm_op)
-                MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-
-            i = 0;
-            curr_loc = dloop_vec[0].DLOOP_VECTOR_BUF;
-            curr_len = dloop_vec[0].DLOOP_VECTOR_LEN;
-            while (i != vec_len) {
-                if (curr_len < type_size) {
-                    MPIU_Assert(i != vec_len);
-                    i++;
-                    curr_len += dloop_vec[i].DLOOP_VECTOR_LEN;
-                    continue;
-                }
-
-                MPIU_Assign_trunc(count, curr_len/type_size, int);
-                (*uop)((char *)source_buf + MPIU_PtrToAint(curr_loc),
-                       (char *)target_buf + MPIU_PtrToAint(curr_loc),
-                       &count, &type);
-
-                if (curr_len % type_size == 0) {
-                    i++;
-                    if (i != vec_len) {
-                        curr_loc = dloop_vec[i].DLOOP_VECTOR_BUF;
-                        curr_len = dloop_vec[i].DLOOP_VECTOR_LEN;
-                    }
-                }
-                else {
-                    curr_loc = (void *)((char *)curr_loc + type_extent * count);
-                    curr_len -= type_size * count;
-                }
-            }
-
-            if (shm_op)
-                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-
-            MPID_Segment_free(segp);
+
+        mpi_errno = do_accumulate_op((void*)packed_buf, stream_count, predefined_type,
+                                     (void*)((char*)base+disp_unit*target_disp), target_count, target_datatype,
+                                     stream_offset, op);
+
+        if (shm_op) {
+            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
         }
+
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        MPIU_Free(packed_buf);
     }
 
   fn_exit:
-    MPIU_CHKLMEM_FREEALL();
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SHM_ACC_OP);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
   fn_fail:
-    if (shm_op)
-        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -485,10 +435,14 @@ static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_
 {
     int disp_unit, shm_locked = 0;
     void *base = NULL;
-    MPI_User_function *uop = NULL;
-    MPID_Datatype *dtp;
+    int i;
+    MPI_Datatype predefined_type;
+    MPI_Aint stream_elem_count, stream_unit_count;
+    MPI_Aint predefined_dtp_size, predefined_dtp_extent, predefined_dtp_count;
+    MPI_Aint total_len, rest_len;
+    MPI_Aint origin_dtp_size;
+    MPID_Datatype *origin_dtp_ptr = NULL;
     int mpi_errno = MPI_SUCCESS;
-    MPIU_CHKLMEM_DECL(2);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_GET_ACC_OP);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_GET_ACC_OP);
@@ -523,135 +477,78 @@ static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_
         goto fn_exit;
     }
 
-    if (op == MPI_REPLACE) {
-        mpi_errno = shm_copy(origin_addr, origin_count, origin_datatype,
-                             (char *) base + disp_unit * target_disp, target_count,
-                             target_datatype);
-
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
-        }
+    if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
 
+        mpi_errno = do_accumulate_op((void*)origin_addr, origin_count, origin_datatype,
+                                     (void*)((char *)base+disp_unit*target_disp), target_count, target_datatype,
+                                     0, op);
         if (shm_locked) {
             MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-            shm_locked = 0;
         }
 
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
         goto fn_exit;
     }
 
-    MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN),
-                         mpi_errno, MPI_ERR_OP, "**opnotpredefined", "**opnotpredefined %d", op);
+    /* Get total length of origin data */
+    MPID_Datatype_get_size_macro(origin_datatype, origin_dtp_size);
+    total_len = origin_dtp_size * origin_count;
 
-    /* get the function by indexing into the op table */
-    uop = MPIR_OP_HDL_TO_FN(op);
+    MPID_Datatype_get_ptr(origin_datatype, origin_dtp_ptr);
+    MPIU_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->eltype != MPI_DATATYPE_NULL);
+    predefined_type = origin_dtp_ptr->eltype;
+    MPID_Datatype_get_size_macro(predefined_type, predefined_dtp_size);
+    predefined_dtp_count = total_len / predefined_dtp_size;
+    MPID_Datatype_get_extent_macro(predefined_type, predefined_dtp_extent);
+    MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 && predefined_dtp_extent > 0);
 
-    if ((op == MPI_NO_OP || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
-        MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
-        /* Cast away const'ness for origin_address in order to
-         * avoid changing the prototype for MPI_User_function */
-        (*uop) ((void *) origin_addr, (char *) base + disp_unit * target_disp,
-                &target_count, &target_datatype);
-    }
-    else {
-        /* derived datatype */
+    stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
+    stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
+    MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);
 
-        MPID_Segment *segp;
-        DLOOP_VECTOR *dloop_vec;
+    rest_len = total_len;
+    for (i = 0; i < stream_unit_count; i++) {
+        MPID_Segment *seg = NULL;
+        void *packed_buf = NULL;
         MPI_Aint first, last;
-        int vec_len, i, type_size, count;
-        MPI_Datatype type;
-        MPI_Aint true_lb, true_extent, extent;
-        void *tmp_buf = NULL, *target_buf;
-        const void *source_buf;
-        MPI_Aint type_extent;
-        MPI_Aint curr_len;
-        void *curr_loc;
-
-        if (origin_datatype != target_datatype) {
-            /* first copy the data into a temporary buffer with
-             * the same datatype as the target. Then do the
-             * accumulate operation. */
-
-            MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent);
-            MPID_Datatype_get_extent_macro(target_datatype, extent);
-
-            MPIU_CHKLMEM_MALLOC(tmp_buf, void *,
-                                target_count * (MPIR_MAX(extent, true_extent)),
-                                mpi_errno, "temporary buffer");
-            /* adjust for potential negative lower bound in datatype */
-            tmp_buf = (void *) ((char *) tmp_buf - true_lb);
-
-            mpi_errno = MPIR_Localcopy(origin_addr, origin_count,
-                                       origin_datatype, tmp_buf, target_count, target_datatype);
-            if (mpi_errno) {
-                MPIU_ERR_POP(mpi_errno);
-            }
+        int is_predef_contig;
+        MPI_Aint stream_offset, stream_size, stream_count;
+
+        stream_offset = i * stream_elem_count * predefined_dtp_size;
+        stream_size = MPIR_MIN(stream_elem_count * predefined_dtp_size, rest_len);
+        stream_count = stream_size / predefined_dtp_size;
+        rest_len -= stream_size;
+
+        first = stream_offset;
+        last = stream_offset + stream_size;
+
+        packed_buf = MPIU_Malloc(stream_size);
+
+        seg = MPID_Segment_alloc();
+        MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment");
+        MPID_Segment_init(origin_addr, origin_count, origin_datatype, seg, 0);
+        MPID_Segment_pack(seg, first, &last, packed_buf);
+        MPID_Segment_free(seg);
+
+        MPID_Datatype_is_contig(predefined_type, &is_predef_contig);
+
+        if (!is_predef_contig) {
+            void *tmpbuf = MPIU_Malloc(stream_count * predefined_dtp_extent);
+            mpi_errno = MPIR_Localcopy(tmpbuf, stream_count, predefined_type,
+                                       packed_buf, stream_size, MPI_BYTE);
+            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            MPIU_Free(packed_buf);
+            packed_buf = tmpbuf;
         }
 
-        if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
-            /* target predefined type, origin derived datatype */
+        mpi_errno = do_accumulate_op((void*)packed_buf, stream_count, predefined_type,
+                                     (void*)((char*)base+disp_unit*target_disp), target_count, target_datatype,
+                                     stream_offset, op);
 
-            (*uop) (tmp_buf, (char *) base + disp_unit * target_disp,
-                    &target_count, &target_datatype);
-        }
-        else {
-
-            segp = MPID_Segment_alloc();
-            MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER,
-                                 "**nomem", "**nomem %s", "MPID_Segment_alloc");
-            MPID_Segment_init(NULL, target_count, target_datatype, segp, 0);
-            first = 0;
-            last = SEGMENT_IGNORE_LAST;
-
-            MPID_Datatype_get_ptr(target_datatype, dtp);
-            vec_len = dtp->max_contig_blocks * target_count + 1;
-            /* +1 needed because Rob says so */
-            MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *,
-                                vec_len * sizeof(DLOOP_VECTOR), mpi_errno, "dloop vector");
-
-            MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);
-
-            source_buf = (tmp_buf != NULL) ? (const void *) tmp_buf : origin_addr;
-            target_buf = (char *) base + disp_unit * target_disp;
-            type = dtp->eltype;
-
-            MPIU_Assert(type != MPI_DATATYPE_NULL);
-
-            MPID_Datatype_get_size_macro(type, type_size);
-            MPID_Datatype_get_extent_macro(type, type_extent);
-
-            i = 0;
-            curr_loc = dloop_vec[0].DLOOP_VECTOR_BUF;
-            curr_len = dloop_vec[0].DLOOP_VECTOR_LEN;
-            while (i != vec_len) {
-                if (curr_len < type_size) {
-                    MPIU_Assert(i != vec_len);
-                    i++;
-                    curr_len += dloop_vec[i].DLOOP_VECTOR_LEN;
-                    continue;
-                }
-
-                MPIU_Assign_trunc(count, curr_len/type_size, int);
-                (*uop)((char *)source_buf + MPIU_PtrToAint(curr_loc),
-                       (char *)target_buf + MPIU_PtrToAint(curr_loc),
-                       &count, &type);
-
-                if (curr_len % type_size == 0) {
-                    i++;
-                    if (i != vec_len) {
-                        curr_loc = dloop_vec[i].DLOOP_VECTOR_BUF;
-                        curr_len = dloop_vec[i].DLOOP_VECTOR_LEN;
-                    }
-                }
-                else {
-                    curr_loc = (void *)((char *)curr_loc + type_extent * count);
-                    curr_len -= type_size * count;
-                }
-            }
-
-            MPID_Segment_free(segp);
-        }
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+
+        MPIU_Free(packed_buf);
     }
 
     if (shm_locked) {
@@ -660,7 +557,6 @@ static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_
     }
 
   fn_exit:
-    MPIU_CHKLMEM_FREEALL();
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SHM_GET_ACC_OP);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */

http://git.mpich.org/mpich.git/commitdiff/002ce8c8490b9277aa77491c0a4edc06b819db41

commit 002ce8c8490b9277aa77491c0a4edc06b819db41
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:27:34 2015 -0800

    Allocate buffer with stream size for ACC/GACC data piggybacked with LOCK.
    
    For queued ACC/GACC data piggybacked with LOCK, we do not
    need to allocate a buffer for the entire operation; we
    only need to allocate a buffer of stream-unit size. This
    patch fixes this issue.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index ad7b841..3351a65 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -538,6 +538,21 @@ MPIDI_CH3_PKT_DEFS
         }                                                               \
     }
 
+#define MPIDI_CH3_PKT_RMA_GET_STREAM_OFFSET(pkt_, stream_offset_, err_) \
+    {                                                                   \
+        err_ = MPI_SUCCESS;                                             \
+        switch((pkt_).type) {                                           \
+        case (MPIDI_CH3_PKT_ACCUMULATE):                                \
+            (stream_offset_) = (pkt_).accum.info.metadata.stream_offset; \
+            break;                                                      \
+        case (MPIDI_CH3_PKT_GET_ACCUM):                                 \
+            (stream_offset_) = (pkt_).get_accum.info.metadata.stream_offset; \
+            break;                                                      \
+        default:                                                        \
+            MPIU_ERR_SETANDJUMP1(err_, MPI_ERR_OTHER, "**invalidpkt", "**invalidpkt %d", (pkt_).type); \
+        }                                                               \
+    }
+
 #define MPIDI_CH3_PKT_RMA_GET_REQUEST_HANDLE(pkt_, request_hdl_, err_)  \
     {                                                                   \
         err_ = MPI_SUCCESS;                                             \
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index bcc9afd..01d0788 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -372,8 +372,22 @@ static inline int enqueue_lock_origin(MPID_Win * win_ptr, MPIDI_VC_t * vc,
 
         MPID_Datatype_get_extent_macro(target_dtp, type_extent);
         MPID_Datatype_get_size_macro(target_dtp, type_size);
-        recv_data_sz = type_size * target_count;
-        buf_size = type_extent * target_count;
+
+        if (pkt->type == MPIDI_CH3_PKT_PUT) {
+            recv_data_sz = type_size * target_count;
+            buf_size = type_extent * target_count;
+        }
+        else {
+            MPI_Aint stream_offset, stream_elem_count;
+            MPI_Aint total_len, rest_len;
+
+            MPIDI_CH3_PKT_RMA_GET_STREAM_OFFSET((*pkt), stream_offset, mpi_errno);
+            stream_elem_count = MPIDI_CH3U_SRBuf_size / type_extent;
+            total_len = type_size * target_count;
+            rest_len = total_len - stream_offset;
+            recv_data_sz = MPIR_MIN(rest_len, type_size * stream_elem_count);
+            buf_size = type_extent * (recv_data_sz / type_size);
+        }
 
         if (new_ptr != NULL) {
             if (win_ptr->current_lock_data_bytes + buf_size < MPIR_CVAR_CH3_RMA_LOCK_DATA_BYTES) {

http://git.mpich.org/mpich.git/commitdiff/efad963ae16890de7681f9e0dae6879927335d64

commit efad963ae16890de7681f9e0dae6879927335d64
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:26:08 2015 -0800

    Modify RMA pkt handlers and req handlers to allow for stream units.
    
    On the target side, we always allocate an SRBuf of 256K, which
    equals the size of a stream unit, to receive ACC/GACC data.
    
    Note that in MPIDI_CH3U_Request_load_recv_iov(), for ACC/GACC
    operations, since we already use an SRBuf to receive the data
    at the beginning, we will not use another SRBuf here, in order
    to avoid one more memory copy.
    
    Also, we pass the stream_offset in the current RMA packet to
    the request struct (when receiving is not finished) and to the
    do_accumulate_op function (when receiving is finished).
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index b6ecbbf..04d5acb 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -134,10 +134,11 @@ int MPIDI_CH3_ReqHandler_PutRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, i
 int MPIDI_CH3_ReqHandler_AccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
-    MPI_Aint true_lb, true_extent;
     MPID_Win *win_ptr;
     MPI_Win source_win_handle = rreq->dev.source_win_handle;
     MPIDI_CH3_Pkt_flags_t flags = rreq->dev.flags;
+    MPI_Datatype predef_datatype;
+    MPI_Aint predef_count, predef_dtp_size;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRECVCOMPLETE);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRECVCOMPLETE);
@@ -164,11 +165,23 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq,
 
     MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RECV);
 
+    if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype))
+        predef_datatype = rreq->dev.datatype;
+    else {
+        predef_datatype = rreq->dev.datatype_ptr->eltype;
+    }
+    MPIU_Assert(predef_datatype != MPI_DATATYPE_NULL);
+
+    MPID_Datatype_get_size_macro(predef_datatype, predef_dtp_size);
+    predef_count = rreq->dev.recv_data_sz / predef_dtp_size;
+    MPIU_Assert(predef_count > 0);
+
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
     /* accumulate data from tmp_buf into user_buf */
-    mpi_errno = do_accumulate_op(rreq->dev.user_buf, rreq->dev.real_user_buf,
-                                 rreq->dev.user_count, rreq->dev.datatype, rreq->dev.op);
+    mpi_errno = do_accumulate_op(rreq->dev.user_buf, predef_count, predef_datatype,
+                                 rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype,
+                                 rreq->dev.stream_offset, rreq->dev.op);
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
     if (mpi_errno) {
@@ -176,8 +189,7 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq,
     }
 
     /* free the temporary buffer */
-    MPIR_Type_get_true_extent_impl(rreq->dev.datatype, &true_lb, &true_extent);
-    MPIU_Free((char *) rreq->dev.user_buf + true_lb);
+    MPIDI_CH3U_SRBuf_free(rreq);
 
     /* mark data transfer as complete and decrement CC */
     MPIDI_CH3U_Request_complete(rreq);
@@ -212,14 +224,14 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Win *win_ptr;
-    MPI_Aint type_size;
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_get_accum_resp_t *get_accum_resp_pkt = &upkt.get_accum_resp;
     MPID_Request *resp_req;
     MPID_IOV iov[MPID_IOV_LIMIT];
-    MPI_Aint true_lb, true_extent;
     int iovcnt;
     int is_contig;
+    MPI_Datatype predef_datatype;
+    MPI_Aint predef_count, predef_dtp_size;
     MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMRECVCOMPLETE);
 
@@ -227,6 +239,17 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
 
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
+    if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype))
+        predef_datatype = rreq->dev.datatype;
+    else {
+        predef_datatype = rreq->dev.datatype_ptr->eltype;
+    }
+    MPIU_Assert(predef_datatype != MPI_DATATYPE_NULL);
+
+    MPID_Datatype_get_size_macro(predef_datatype, predef_dtp_size);
+    predef_count = rreq->dev.recv_data_sz / predef_dtp_size;
+    MPIU_Assert(predef_count > 0);
+
     MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
     get_accum_resp_pkt->request_handle = rreq->dev.resp_request_handle;
     get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
@@ -238,8 +261,6 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
         (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
         get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
 
-    MPID_Datatype_get_size_macro(rreq->dev.datatype, type_size);
-
     MPID_Datatype_is_contig(rreq->dev.datatype, &is_contig);
 
     /* Copy data into a temporary buffer */
@@ -248,19 +269,21 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
     MPIU_Object_set_ref(resp_req, 1);
     MPIDI_Request_set_type(resp_req, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);
 
-    MPIU_CHKPMEM_MALLOC(resp_req->dev.user_buf, void *, rreq->dev.user_count * type_size,
+    MPIU_CHKPMEM_MALLOC(resp_req->dev.user_buf, void *, rreq->dev.recv_data_sz,
                         mpi_errno, "GACC resp. buffer");
 
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 
     if (is_contig) {
-        MPIU_Memcpy(resp_req->dev.user_buf, rreq->dev.real_user_buf,
-                    rreq->dev.user_count * type_size);
+        MPIU_Memcpy(resp_req->dev.user_buf,
+                    (void *) ((char *) rreq->dev.real_user_buf + rreq->dev.stream_offset),
+                    rreq->dev.recv_data_sz);
     }
     else {
         MPID_Segment *seg = MPID_Segment_alloc();
-        MPI_Aint last = type_size * rreq->dev.user_count;
+        MPI_Aint first = rreq->dev.stream_offset;
+        MPI_Aint last = first + rreq->dev.recv_data_sz;
 
         if (seg == NULL) {
             if (win_ptr->shm_allocated == TRUE)
@@ -270,13 +293,14 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
                              "MPID_Segment");
         MPID_Segment_init(rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype, seg,
                           0);
-        MPID_Segment_pack(seg, 0, &last, resp_req->dev.user_buf);
+        MPID_Segment_pack(seg, first, &last, resp_req->dev.user_buf);
         MPID_Segment_free(seg);
     }
 
     /* accumulate data from tmp_buf into user_buf */
-    mpi_errno = do_accumulate_op(rreq->dev.user_buf, rreq->dev.real_user_buf,
-                                 rreq->dev.user_count, rreq->dev.datatype, rreq->dev.op);
+    mpi_errno = do_accumulate_op(rreq->dev.user_buf, predef_count, predef_datatype,
+                                 rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype,
+                                 rreq->dev.stream_offset, rreq->dev.op);
 
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
@@ -296,7 +320,7 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
     iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
     iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
     iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) resp_req->dev.user_buf);
-    iov[1].MPID_IOV_LEN = rreq->dev.user_count * type_size;
+    iov[1].MPID_IOV_LEN = rreq->dev.recv_data_sz;
     iovcnt = 2;
 
     MPIU_THREAD_CS_ENTER(CH3COMM, vc);
@@ -311,8 +335,7 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
     MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RECV);
 
     /* free the temporary buffer */
-    MPIR_Type_get_true_extent_impl(rreq->dev.datatype, &true_lb, &true_extent);
-    MPIU_Free((char *) rreq->dev.user_buf + true_lb);
+    MPIDI_CH3U_SRBuf_free(rreq);
 
     /* mark data transfer as complete and decrement CC */
     MPIDI_CH3U_Request_complete(rreq);
@@ -398,8 +421,9 @@ int MPIDI_CH3_ReqHandler_FOPRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, i
 
     /* Perform accumulate computation */
     if (rreq->dev.op != MPI_NO_OP) {
-        mpi_errno = do_accumulate_op(rreq->dev.user_buf, rreq->dev.real_user_buf,
-                                     1, rreq->dev.datatype, rreq->dev.op);
+        mpi_errno = do_accumulate_op(rreq->dev.user_buf, 1, rreq->dev.datatype,
+                                     rreq->dev.real_user_buf, 1, rreq->dev.datatype, 0,
+                                     rreq->dev.op);
     }
 
     if (win_ptr->shm_allocated == TRUE)
@@ -508,8 +532,8 @@ int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((u
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Datatype *new_dtp = NULL;
-    MPI_Aint true_lb, true_extent, extent;
-    void *tmp_buf;
+    MPI_Aint predef_type_extent, predef_type_size;
+    MPI_Aint total_len, rest_len, stream_elem_count;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);
@@ -520,23 +544,31 @@ int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((u
     /* update new request to get the data */
     MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_ACCUM_RECV);
 
-    /* first need to allocate tmp_buf to recv the data into */
-
-    MPIR_Type_get_true_extent_impl(new_dtp->handle, &true_lb, &true_extent);
-    MPID_Datatype_get_extent_macro(new_dtp->handle, extent);
+    MPID_Datatype_get_size_macro(new_dtp->eltype, predef_type_size);
+    MPID_Datatype_get_extent_macro(new_dtp->eltype, predef_type_extent);
 
-    tmp_buf = MPIU_Malloc(rreq->dev.user_count * (MPIR_MAX(extent, true_extent)));
-    if (!tmp_buf) {
-        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                             rreq->dev.user_count * MPIR_MAX(extent, true_extent));
+    MPIU_Assert(!MPIDI_Request_get_srbuf_flag(rreq));
+    /* allocate a SRBuf for receiving stream unit */
+    MPIDI_CH3U_SRBuf_alloc(rreq, MPIDI_CH3U_SRBuf_size);
+    /* --BEGIN ERROR HANDLING-- */
+    if (rreq->dev.tmpbuf_sz == 0) {
+        MPIU_DBG_MSG(CH3_CHANNEL, TYPICAL, "SRBuf allocation failure");
+        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL,
+                                         FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem",
+                                         "**nomem %d", MPIDI_CH3U_SRBuf_size);
+        rreq->status.MPI_ERROR = mpi_errno;
+        goto fn_fail;
     }
+    /* --END ERROR HANDLING-- */
 
-    /* adjust for potential negative lower bound in datatype */
-    tmp_buf = (void *) ((char *) tmp_buf - true_lb);
-
-    rreq->dev.user_buf = tmp_buf;
+    rreq->dev.user_buf = rreq->dev.tmpbuf;
     rreq->dev.datatype = new_dtp->handle;
-    rreq->dev.recv_data_sz = new_dtp->size * rreq->dev.user_count;
+
+    total_len = new_dtp->size * rreq->dev.user_count;
+    rest_len = total_len - rreq->dev.stream_offset;
+    stream_elem_count = MPIDI_CH3U_SRBuf_size / predef_type_extent;
+
+    rreq->dev.recv_data_sz = MPIR_MIN(rest_len, stream_elem_count * predef_type_size);
     rreq->dev.datatype_ptr = new_dtp;
     /* this will cause the datatype to be freed when the
      * request is freed. free dtype_info here. */
@@ -547,7 +579,8 @@ int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((u
                          "**nomem %s", "MPID_Segment_alloc");
 
     MPID_Segment_init(rreq->dev.user_buf,
-                      rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0);
+                      (rreq->dev.recv_data_sz / predef_type_size),
+                      new_dtp->eltype, rreq->dev.segment_ptr, 0);
     rreq->dev.segment_first = 0;
     rreq->dev.segment_size = rreq->dev.recv_data_sz;
 
@@ -574,8 +607,8 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Datatype *new_dtp = NULL;
-    MPI_Aint true_lb, true_extent, extent;
-    void *tmp_buf;
+    MPI_Aint predef_type_extent, predef_type_size;
+    MPI_Aint total_len, rest_len, stream_elem_count;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMDERIVEDDTRECVCOMPLETE);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMDERIVEDDTRECVCOMPLETE);
@@ -586,23 +619,31 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((
     /* update new request to get the data */
     MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_GET_ACCUM_RECV);
 
-    /* first need to allocate tmp_buf to recv the data into */
-
-    MPIR_Type_get_true_extent_impl(new_dtp->handle, &true_lb, &true_extent);
-    MPID_Datatype_get_extent_macro(new_dtp->handle, extent);
+    MPID_Datatype_get_size_macro(new_dtp->eltype, predef_type_size);
+    MPID_Datatype_get_extent_macro(new_dtp->eltype, predef_type_extent);
 
-    tmp_buf = MPIU_Malloc(rreq->dev.user_count * (MPIR_MAX(extent, true_extent)));
-    if (!tmp_buf) {
-        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                             rreq->dev.user_count * MPIR_MAX(extent, true_extent));
+    MPIU_Assert(!MPIDI_Request_get_srbuf_flag(rreq));
+    /* allocate a SRBuf for receiving stream unit */
+    MPIDI_CH3U_SRBuf_alloc(rreq, MPIDI_CH3U_SRBuf_size);
+    /* --BEGIN ERROR HANDLING-- */
+    if (rreq->dev.tmpbuf_sz == 0) {
+        MPIU_DBG_MSG(CH3_CHANNEL, TYPICAL, "SRBuf allocation failure");
+        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL,
+                                         FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem",
+                                         "**nomem %d", MPIDI_CH3U_SRBuf_size);
+        rreq->status.MPI_ERROR = mpi_errno;
+        goto fn_fail;
     }
+    /* --END ERROR HANDLING-- */
 
-    /* adjust for potential negative lower bound in datatype */
-    tmp_buf = (void *) ((char *) tmp_buf - true_lb);
-
-    rreq->dev.user_buf = tmp_buf;
+    rreq->dev.user_buf = rreq->dev.tmpbuf;
     rreq->dev.datatype = new_dtp->handle;
-    rreq->dev.recv_data_sz = new_dtp->size * rreq->dev.user_count;
+
+    total_len = new_dtp->size * rreq->dev.user_count;
+    rest_len = total_len - rreq->dev.stream_offset;
+    stream_elem_count = MPIDI_CH3U_SRBuf_size / predef_type_extent;
+
+    rreq->dev.recv_data_sz = MPIR_MIN(rest_len, stream_elem_count * predef_type_size);
     rreq->dev.datatype_ptr = new_dtp;
     /* this will cause the datatype to be freed when the
      * request is freed. free dtype_info here. */
@@ -613,7 +654,8 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((
                          "**nomem %s", "MPID_Segment_alloc");
 
     MPID_Segment_init(rreq->dev.user_buf,
-                      rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0);
+                      (rreq->dev.recv_data_sz / predef_type_size),
+                      new_dtp->eltype, rreq->dev.segment_ptr, 0);
     rreq->dev.segment_first = 0;
     rreq->dev.segment_size = rreq->dev.recv_data_sz;
 
@@ -1065,14 +1107,26 @@ static inline int perform_acc_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
 
     if (acc_pkt->type == MPIDI_CH3_PKT_ACCUMULATE_IMMED) {
         /* All data fits in packet header */
-        mpi_errno = do_accumulate_op(acc_pkt->info.data, acc_pkt->addr,
-                                     acc_pkt->count, acc_pkt->datatype, acc_pkt->op);
+        mpi_errno = do_accumulate_op(acc_pkt->info.data, acc_pkt->count, acc_pkt->datatype,
+                                     acc_pkt->addr, acc_pkt->count, acc_pkt->datatype,
+                                     0, acc_pkt->op);
     }
     else {
         MPIU_Assert(acc_pkt->type == MPIDI_CH3_PKT_ACCUMULATE);
+        MPI_Aint type_size, type_extent;
+        MPI_Aint total_len, rest_len, recv_count;
+
+        MPID_Datatype_get_size_macro(acc_pkt->datatype, type_size);
+        MPID_Datatype_get_extent_macro(acc_pkt->datatype, type_extent);
 
-        mpi_errno = do_accumulate_op(lock_entry->data, acc_pkt->addr,
-                                     acc_pkt->count, acc_pkt->datatype, acc_pkt->op);
+        total_len = type_size * acc_pkt->count;
+        rest_len = total_len - acc_pkt->info.metadata.stream_offset;
+        recv_count = MPIR_MIN((rest_len / type_size), (MPIDI_CH3U_SRBuf_size / type_extent));
+        MPIU_Assert(recv_count > 0);
+
+        mpi_errno = do_accumulate_op(lock_entry->data, recv_count, acc_pkt->datatype,
+                                     acc_pkt->addr, acc_pkt->count, acc_pkt->datatype,
+                                     acc_pkt->info.metadata.stream_offset, acc_pkt->op);
     }
 
     if (win_ptr->shm_allocated == TRUE)
@@ -1106,6 +1160,8 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
     MPID_IOV iov[MPID_IOV_LIMIT];
     int is_contig;
     int mpi_errno = MPI_SUCCESS;
+    MPI_Aint type_extent;
+    MPI_Aint total_len, rest_len, recv_count;
 
     /* Piggyback candidate should have basic datatype for target datatype. */
     MPIU_Assert(MPIR_DATATYPE_IS_PREDEFINED(get_accum_pkt->datatype));
@@ -1159,9 +1215,10 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
         }
 
         /* All data fits in packet header */
-        mpi_errno = do_accumulate_op(get_accum_pkt->info.data, get_accum_pkt->addr,
-                                     get_accum_pkt->count, get_accum_pkt->datatype,
-                                     get_accum_pkt->op);
+        mpi_errno =
+            do_accumulate_op(get_accum_pkt->info.data, get_accum_pkt->count,
+                             get_accum_pkt->datatype, get_accum_pkt->addr, get_accum_pkt->count,
+                             get_accum_pkt->datatype, 0, get_accum_pkt->op);
 
         if (win_ptr->shm_allocated == TRUE)
             MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
@@ -1189,7 +1246,14 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
 
     MPIU_Assert(get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM);
 
-    sreq->dev.user_buf = (void *) MPIU_Malloc(get_accum_pkt->count * type_size);
+    MPID_Datatype_get_extent_macro(get_accum_pkt->datatype, type_extent);
+
+    total_len = type_size * get_accum_pkt->count;
+    rest_len = total_len - get_accum_pkt->info.metadata.stream_offset;
+    recv_count = MPIR_MIN((rest_len / type_size), (MPIDI_CH3U_SRBuf_size / type_extent));
+    MPIU_Assert(recv_count > 0);
+
+    sreq->dev.user_buf = (void *) MPIU_Malloc(recv_count * type_size);
 
     MPID_Datatype_is_contig(get_accum_pkt->datatype, &is_contig);
 
@@ -1198,11 +1262,14 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 
     if (is_contig) {
-        MPIU_Memcpy(sreq->dev.user_buf, get_accum_pkt->addr, get_accum_pkt->count * type_size);
+        MPIU_Memcpy(sreq->dev.user_buf,
+                    (void *) ((char *) get_accum_pkt->addr +
+                              get_accum_pkt->info.metadata.stream_offset), recv_count * type_size);
     }
     else {
         MPID_Segment *seg = MPID_Segment_alloc();
-        MPI_Aint last = type_size * get_accum_pkt->count;
+        MPI_Aint first = get_accum_pkt->info.metadata.stream_offset;
+        MPI_Aint last = first + type_size * recv_count;
 
         if (seg == NULL) {
             if (win_ptr->shm_allocated == TRUE)
@@ -1212,12 +1279,13 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
                              "MPID_Segment");
         MPID_Segment_init(get_accum_pkt->addr, get_accum_pkt->count, get_accum_pkt->datatype, seg,
                           0);
-        MPID_Segment_pack(seg, 0, &last, sreq->dev.user_buf);
+        MPID_Segment_pack(seg, first, &last, sreq->dev.user_buf);
         MPID_Segment_free(seg);
     }
 
-    mpi_errno = do_accumulate_op(lock_entry->data, get_accum_pkt->addr,
-                                 get_accum_pkt->count, get_accum_pkt->datatype, get_accum_pkt->op);
+    mpi_errno = do_accumulate_op(lock_entry->data, recv_count, get_accum_pkt->datatype,
+                                 get_accum_pkt->addr, get_accum_pkt->count, get_accum_pkt->datatype,
+                                 get_accum_pkt->info.metadata.stream_offset, get_accum_pkt->op);
 
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
@@ -1243,7 +1311,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
     iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
     iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
     iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) sreq->dev.user_buf);
-    iov[1].MPID_IOV_LEN = get_accum_pkt->count * type_size;
+    iov[1].MPID_IOV_LEN = recv_count * type_size;
     iovcnt = 2;
 
     mpi_errno = MPIDI_CH3_iSendv(lock_entry->vc, sreq, iov, iovcnt);
@@ -1355,12 +1423,12 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     /* Apply the op */
     if (fop_pkt->op != MPI_NO_OP) {
         if (fop_pkt->type == MPIDI_CH3_PKT_FOP_IMMED) {
-            mpi_errno = do_accumulate_op(fop_pkt->info.data, fop_pkt->addr,
-                                         1, fop_pkt->datatype, fop_pkt->op);
+            mpi_errno = do_accumulate_op(fop_pkt->info.data, 1, fop_pkt->datatype,
+                                         fop_pkt->addr, 1, fop_pkt->datatype, 0, fop_pkt->op);
         }
         else {
-            mpi_errno = do_accumulate_op(lock_entry->data, fop_pkt->addr,
-                                         1, fop_pkt->datatype, fop_pkt->op);
+            mpi_errno = do_accumulate_op(lock_entry->data, 1, fop_pkt->datatype,
+                                         fop_pkt->addr, 1, fop_pkt->datatype, 0, fop_pkt->op);
         }
     }
 
diff --git a/src/mpid/ch3/src/ch3u_request.c b/src/mpid/ch3/src/ch3u_request.c
index 65efbb6..6793ce8 100644
--- a/src/mpid/ch3/src/ch3u_request.c
+++ b/src/mpid/ch3/src/ch3u_request.c
@@ -317,7 +317,9 @@ int MPIDI_CH3U_Request_load_recv_iov(MPID_Request * const rreq)
     {
 	/* still reading data that needs to go into the user buffer */
 	
-	if (MPIDI_Request_get_srbuf_flag(rreq))
+	if (MPIDI_Request_get_type(rreq) != MPIDI_REQUEST_TYPE_ACCUM_RECV &&
+            MPIDI_Request_get_type(rreq) != MPIDI_REQUEST_TYPE_GET_ACCUM_RECV &&
+            MPIDI_Request_get_srbuf_flag(rreq))
 	{
 	    MPIDI_msg_sz_t data_sz;
 	    MPIDI_msg_sz_t tmpbuf_sz;
@@ -406,8 +408,10 @@ int MPIDI_CH3U_Request_load_recv_iov(MPID_Request * const rreq)
 	    /* Eventually, use OnFinal for this instead */
 	    rreq->dev.OnDataAvail = rreq->dev.OnFinal;
 	}
-	else if (last == rreq->dev.segment_size || 
-		 (last - rreq->dev.segment_first) / rreq->dev.iov_count >= MPIDI_IOV_DENSITY_MIN)
+	else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RECV ||
+                 MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RECV ||
+                 (last == rreq->dev.segment_size ||
+                  (last - rreq->dev.segment_first) / rreq->dev.iov_count >= MPIDI_IOV_DENSITY_MIN))
 	{
 	    MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,
 	     "updating rreq to read more data directly into the user buffer");
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index c281573..f407aec 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -570,8 +570,7 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 {
     MPIDI_CH3_Pkt_accum_t *accum_pkt = &pkt->accum;
     MPID_Request *req = NULL;
-    MPI_Aint true_lb, true_extent, extent;
-    void *tmp_buf = NULL;
+    MPI_Aint extent;
     int complete = 0;
     char *data_buf = NULL;
     MPIDI_msg_sz_t data_len;
@@ -579,6 +578,7 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     int acquire_lock_fail = 0;
     int mpi_errno = MPI_SUCCESS;
     MPI_Aint type_size;
+    MPI_Aint stream_elem_count, rest_len, total_len;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_ACCUMULATE);
@@ -605,8 +605,9 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
         if (win_ptr->shm_allocated == TRUE)
             MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-        mpi_errno = do_accumulate_op(accum_pkt->info.data, accum_pkt->addr,
-                                     accum_pkt->count, accum_pkt->datatype, accum_pkt->op);
+        mpi_errno = do_accumulate_op(accum_pkt->info.data, accum_pkt->count, accum_pkt->datatype,
+                                     accum_pkt->addr, accum_pkt->count, accum_pkt->datatype,
+                                     0, accum_pkt->op);
         if (win_ptr->shm_allocated == TRUE)
             MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
         if (mpi_errno) {
@@ -648,22 +649,31 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RECV);
             req->dev.datatype = accum_pkt->datatype;
 
-            MPIR_Type_get_true_extent_impl(accum_pkt->datatype, &true_lb, &true_extent);
             MPID_Datatype_get_extent_macro(accum_pkt->datatype, extent);
 
-            /* Predefined types should always have zero lb */
-            MPIU_Assert(true_lb == 0);
-
-            tmp_buf = MPIU_Malloc(accum_pkt->count * (MPIR_MAX(extent, true_extent)));
-            if (!tmp_buf) {
-                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                     accum_pkt->count * MPIR_MAX(extent, true_extent));
+            MPIU_Assert(!MPIDI_Request_get_srbuf_flag(req));
+            /* allocate a SRBuf for receiving stream unit */
+            MPIDI_CH3U_SRBuf_alloc(req, MPIDI_CH3U_SRBuf_size);
+            /* --BEGIN ERROR HANDLING-- */
+            if (req->dev.tmpbuf_sz == 0) {
+                MPIU_DBG_MSG(CH3_CHANNEL, TYPICAL, "SRBuf allocation failure");
+                mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL,
+                                                 FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem",
+                                                 "**nomem %d", MPIDI_CH3U_SRBuf_size);
+                req->status.MPI_ERROR = mpi_errno;
+                goto fn_fail;
             }
+            /* --END ERROR HANDLING-- */
 
-            req->dev.user_buf = tmp_buf;
+            req->dev.user_buf = req->dev.tmpbuf;
 
             MPID_Datatype_get_size_macro(accum_pkt->datatype, type_size);
-            req->dev.recv_data_sz = type_size * accum_pkt->count;
+
+            total_len = type_size * accum_pkt->count;
+            rest_len = total_len - req->dev.stream_offset;
+            stream_elem_count = MPIDI_CH3U_SRBuf_size / extent;
+
+            req->dev.recv_data_sz = MPIR_MIN(rest_len, stream_elem_count * type_size);
             MPIU_Assert(req->dev.recv_data_sz > 0);
 
             mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
@@ -755,8 +765,7 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 {
     MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &pkt->get_accum;
     MPID_Request *req = NULL;
-    MPI_Aint true_lb, true_extent, extent;
-    void *tmp_buf = NULL;
+    MPI_Aint extent;
     int complete = 0;
     char *data_buf = NULL;
     MPIDI_msg_sz_t data_len;
@@ -764,6 +773,7 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     int acquire_lock_fail = 0;
     int mpi_errno = MPI_SUCCESS;
     MPI_Aint type_size;
+    MPI_Aint stream_elem_count, rest_len, total_len;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PKTHANDLER_GETACCUMULATE);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PKTHANDLER_GETACCUMULATE);
@@ -841,9 +851,10 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         }
 
         /* perform accumulate operation. */
-        mpi_errno = do_accumulate_op(get_accum_pkt->info.data, get_accum_pkt->addr,
-                                     get_accum_pkt->count, get_accum_pkt->datatype,
-                                     get_accum_pkt->op);
+        mpi_errno =
+            do_accumulate_op(get_accum_pkt->info.data, get_accum_pkt->count,
+                             get_accum_pkt->datatype, get_accum_pkt->addr, get_accum_pkt->count,
+                             get_accum_pkt->datatype, 0, get_accum_pkt->op);
 
         if (win_ptr->shm_allocated == TRUE)
             MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
@@ -891,21 +902,29 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_ACCUM_RECV);
             req->dev.datatype = get_accum_pkt->datatype;
 
-            MPIR_Type_get_true_extent_impl(get_accum_pkt->datatype, &true_lb, &true_extent);
             MPID_Datatype_get_extent_macro(get_accum_pkt->datatype, extent);
 
-            /* Predefined types should always have zero lb */
-            MPIU_Assert(true_lb == 0);
-
-            tmp_buf = MPIU_Malloc(get_accum_pkt->count * (MPIR_MAX(extent, true_extent)));
-            if (!tmp_buf) {
-                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                     get_accum_pkt->count * MPIR_MAX(extent, true_extent));
+            MPIU_Assert(!MPIDI_Request_get_srbuf_flag(req));
+            /* allocate a SRBuf for receiving stream unit */
+            MPIDI_CH3U_SRBuf_alloc(req, MPIDI_CH3U_SRBuf_size);
+            /* --BEGIN ERROR HANDLING-- */
+            if (req->dev.tmpbuf_sz == 0) {
+                MPIU_DBG_MSG(CH3_CHANNEL, TYPICAL, "SRBuf allocation failure");
+                mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL,
+                                                 FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem",
+                                                 "**nomem %d", MPIDI_CH3U_SRBuf_size);
+                req->status.MPI_ERROR = mpi_errno;
+                goto fn_fail;
             }
+            /* --END ERROR HANDLING-- */
 
-            req->dev.user_buf = tmp_buf;
+            req->dev.user_buf = req->dev.tmpbuf;
 
-            req->dev.recv_data_sz = type_size * get_accum_pkt->count;
+            total_len = type_size * get_accum_pkt->count;
+            rest_len = total_len - req->dev.stream_offset;
+            stream_elem_count = MPIDI_CH3U_SRBuf_size / extent;
+
+            req->dev.recv_data_sz = MPIR_MIN(rest_len, stream_elem_count * type_size);
             MPIU_Assert(req->dev.recv_data_sz > 0);
 
             mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
@@ -1229,8 +1248,8 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
         /* Apply the op */
         if (fop_pkt->op != MPI_NO_OP) {
-            mpi_errno = do_accumulate_op(fop_pkt->info.data, fop_pkt->addr,
-                                         1, fop_pkt->datatype, fop_pkt->op);
+            mpi_errno = do_accumulate_op(fop_pkt->info.data, 1, fop_pkt->datatype,
+                                         fop_pkt->addr, 1, fop_pkt->datatype, 0, fop_pkt->op);
         }
 
         if (win_ptr->shm_allocated == TRUE)
@@ -1472,11 +1491,12 @@ int MPIDI_CH3_PktHandler_Get_AccumResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
     MPID_Datatype_get_size_macro(req->dev.datatype, type_size);
-    req->dev.recv_data_sz = type_size * req->dev.user_count;
 
     *rreqp = req;
 
     if (get_accum_resp_pkt->type == MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED) {
+        req->dev.recv_data_sz = type_size * req->dev.user_count;
+
         MPIU_Memcpy(req->dev.user_buf, get_accum_resp_pkt->info.data, req->dev.recv_data_sz);
         /* return the number of bytes processed in this function */
         *buflen = sizeof(MPIDI_CH3_Pkt_t);
@@ -1485,12 +1505,56 @@ int MPIDI_CH3_PktHandler_Get_AccumResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     else {
         MPIU_Assert(pkt->type == MPIDI_CH3_PKT_GET_ACCUM_RESP);
 
-        mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
-        MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
-                             "**ch3|postrecv %s", "MPIDI_CH3_PKT_GET_ACCUM_RESP");
+        MPI_Datatype predef_type;
+        MPI_Aint predef_type_extent, predef_type_size;
+        MPI_Aint stream_elem_count;
+        MPI_Aint total_len, rest_len;
+        MPI_Aint real_stream_offset;
 
-        /* return the number of bytes processed in this function */
-        *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
+        if (MPIR_DATATYPE_IS_PREDEFINED(req->dev.datatype)) {
+            predef_type = req->dev.datatype;
+        }
+        else {
+            MPIU_Assert(req->dev.datatype_ptr != NULL);
+            predef_type = req->dev.datatype_ptr->eltype;
+        }
+
+        MPID_Datatype_get_extent_macro(predef_type, predef_type_extent);
+        MPID_Datatype_get_size_macro(predef_type, predef_type_size);
+
+        total_len = type_size * req->dev.user_count;
+        rest_len = total_len - req->dev.stream_offset;
+        stream_elem_count = MPIDI_CH3U_SRBuf_size / predef_type_extent;
+
+        req->dev.recv_data_sz = MPIR_MIN(rest_len, stream_elem_count * predef_type_size);
+        real_stream_offset = (req->dev.stream_offset / predef_type_size) * predef_type_extent;
+
+        if (MPIR_DATATYPE_IS_PREDEFINED(req->dev.datatype)) {
+            req->dev.user_buf = (void *) ((char *) req->dev.user_buf + real_stream_offset);
+            mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
+            MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                                 "**ch3|postrecv %s", "MPIDI_CH3_PKT_GET_ACCUM_RESP");
+
+            /* return the number of bytes processed in this function */
+            *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
+        }
+        else {
+            *buflen = sizeof(MPIDI_CH3_Pkt_t);
+
+            req->dev.segment_ptr = MPID_Segment_alloc();
+            MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype,
+                              req->dev.segment_ptr, 0);
+            req->dev.segment_first = req->dev.stream_offset;
+            req->dev.segment_size = req->dev.stream_offset + req->dev.recv_data_sz;
+
+            mpi_errno = MPIDI_CH3U_Request_load_recv_iov(req);
+            if (mpi_errno != MPI_SUCCESS) {
+                MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|loadrecviov");
+            }
+            if (req->dev.OnDataAvail == NULL) {
+                req->dev.OnDataAvail = req->dev.OnFinal;
+            }
+        }
     }
     if (complete) {
         /* Request-based RMA defines final actions for completing user request. */

http://git.mpich.org/mpich.git/commitdiff/0d5146ba091121a898de4ba807c170fa93c3dc0e

commit 0d5146ba091121a898de4ba807c170fa93c3dc0e
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Mar 3 05:09:42 2015 -0600

    Modify do_accumulate_op to allow for packed basic type data as input.
    
    Originally, do_accumulate_op() was used to perform the ACC
    computation on the target between data from the origin side and
    data in the target window. It required that the target side
    first unpack the received origin data into the same data
    layout as the target data before calling this function, which
    may consume a potentially large amount of memory.
    
    This patch changes the do_accumulate_op() function in the
    following respects:
    
    (1) It requires that the origin data passed to the function
    be "in a packed manner", which means it looks as if all
    basic type elements in the origin data are placed one after
    another. Note that the origin data is not necessarily contiguous,
    since we may use a non-contiguous basic type. If the basic type
    is contiguous, then the origin data must be contiguous.
    
    (2) It adds a new function argument, stream_offset, which
    specifies a starting location in the target data. This allows
    the origin data to be applied to a portion of the target data
    of stream size.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index 12ace6b..bcc9afd 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -761,39 +761,54 @@ static inline int MPIDI_CH3I_RMA_Handle_flush_ack(MPID_Win * win_ptr, int target
 #define FUNCNAME do_accumulate_op
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int do_accumulate_op(void *source_buf, void *target_buf,
-                                   int acc_count, MPI_Datatype acc_dtp, MPI_Op acc_op)
+static inline int do_accumulate_op(void *source_buf, int source_count, MPI_Datatype source_dtp,
+                                   void *target_buf, int target_count, MPI_Datatype target_dtp,
+                                   MPI_Aint stream_offset, MPI_Op acc_op)
 {
     int mpi_errno = MPI_SUCCESS;
-    MPI_User_function *uop;
+    MPI_User_function *uop = NULL;
+    MPI_Aint source_dtp_size, source_dtp_extent;
     MPIDI_STATE_DECL(MPID_STATE_DO_ACCUMULATE_OP);
 
     MPIDI_FUNC_ENTER(MPID_STATE_DO_ACCUMULATE_OP);
 
-    if (acc_op == MPI_REPLACE) {
-        /* simply copy the data */
-        mpi_errno = MPIR_Localcopy(source_buf, acc_count, acc_dtp, target_buf, acc_count, acc_dtp);
-        if (mpi_errno) {
-            MPIU_ERR_POP(mpi_errno);
+    MPIU_Assert(MPIR_DATATYPE_IS_PREDEFINED(source_dtp));
+    MPID_Datatype_get_size_macro(source_dtp, source_dtp_size);
+    MPID_Datatype_get_extent_macro(source_dtp, source_dtp_extent);
+
+    if (acc_op != MPI_REPLACE) {
+        if (HANDLE_GET_KIND(acc_op) == HANDLE_KIND_BUILTIN) {
+            /* get the function by indexing into the op table */
+            uop = MPIR_OP_HDL_TO_FN(acc_op);
+        }
+        else {
+            /* --BEGIN ERROR HANDLING-- */
+            mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+                                             FCNAME, __LINE__, MPI_ERR_OP,
+                                             "**opnotpredefined", "**opnotpredefined %d", acc_op);
+            return mpi_errno;
+            /* --END ERROR HANDLING-- */
         }
-        goto fn_exit;
     }
 
-    if (HANDLE_GET_KIND(acc_op) == HANDLE_KIND_BUILTIN) {
-        /* get the function by indexing into the op table */
-        uop = MPIR_OP_HDL_TO_FN(acc_op);
-    }
-    else {
-        /* --BEGIN ERROR HANDLING-- */
-        mpi_errno =
-            MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OP,
-                                 "**opnotpredefined", "**opnotpredefined %d", acc_op);
-        return mpi_errno;
-        /* --END ERROR HANDLING-- */
-    }
 
-    if (MPIR_DATATYPE_IS_PREDEFINED(acc_dtp)) {
-        (*uop) (source_buf, target_buf, &acc_count, &acc_dtp);
+    if (MPIR_DATATYPE_IS_PREDEFINED(target_dtp)) {
+        /* apply op if target dtp is predefined dtp */
+
+        MPIU_Assert(source_dtp == target_dtp);
+
+        MPI_Aint real_stream_offset = (stream_offset / source_dtp_size) * source_dtp_extent;
+        void *curr_target_buf = (void *) ((char *) target_buf + real_stream_offset);
+
+        if (acc_op == MPI_REPLACE) {
+            mpi_errno = MPIR_Localcopy(source_buf, source_count, source_dtp,
+                                       curr_target_buf, source_count, source_dtp);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
+        else {
+            (*uop) (source_buf, curr_target_buf, &source_count, &source_dtp);
+        }
     }
     else {
         /* derived datatype */
@@ -806,6 +821,7 @@ static inline int do_accumulate_op(void *source_buf, void *target_buf,
         MPID_Datatype *dtp;
         MPI_Aint curr_len;
         void *curr_loc;
+        int accumulated_count;
 
         segp = MPID_Segment_alloc();
         /* --BEGIN ERROR HANDLING-- */
@@ -817,12 +833,12 @@ static inline int do_accumulate_op(void *source_buf, void *target_buf,
             return mpi_errno;
         }
         /* --END ERROR HANDLING-- */
-        MPID_Segment_init(NULL, acc_count, acc_dtp, segp, 0);
-        first = 0;
-        last = SEGMENT_IGNORE_LAST;
+        MPID_Segment_init(NULL, target_count, target_dtp, segp, 0);
+        first = stream_offset;
+        last = first + source_count * source_dtp_size;
 
-        MPID_Datatype_get_ptr(acc_dtp, dtp);
-        vec_len = dtp->max_contig_blocks * acc_count + 1;
+        MPID_Datatype_get_ptr(target_dtp, dtp);
+        vec_len = dtp->max_contig_blocks * target_count + 1;
         /* +1 needed because Rob says so */
         dloop_vec = (DLOOP_VECTOR *)
             MPIU_Malloc(vec_len * sizeof(DLOOP_VECTOR));
@@ -841,12 +857,14 @@ static inline int do_accumulate_op(void *source_buf, void *target_buf,
         type = dtp->eltype;
         MPIU_Assert(type != MPI_DATATYPE_NULL);
 
-        MPID_Datatype_get_size_macro(type, type_size);
-        MPID_Datatype_get_extent_macro(type, type_extent);
+        MPIU_Assert(type == source_dtp);
+        type_size = source_dtp_size;
+        type_extent = source_dtp_extent;
 
         i = 0;
         curr_loc = dloop_vec[0].DLOOP_VECTOR_BUF;
         curr_len = dloop_vec[0].DLOOP_VECTOR_LEN;
+        accumulated_count = 0;
         while (i != vec_len) {
             if (curr_len < type_size) {
                 MPIU_Assert(i != vec_len);
@@ -856,8 +874,19 @@ static inline int do_accumulate_op(void *source_buf, void *target_buf,
             }
 
             MPIU_Assign_trunc(count, curr_len / type_size, int);
-            (*uop) ((char *) source_buf + MPIU_PtrToAint(curr_loc),
-                    (char *) target_buf + MPIU_PtrToAint(curr_loc), &count, &type);
+
+            if (acc_op == MPI_REPLACE) {
+                mpi_errno = MPIR_Localcopy((char *) source_buf + type_extent * accumulated_count,
+                                           count, type,
+                                           (char *) target_buf + MPIU_PtrToAint(curr_loc),
+                                           count, type);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+            }
+            else {
+                (*uop) ((char *) source_buf + type_extent * accumulated_count,
+                        (char *) target_buf + MPIU_PtrToAint(curr_loc), &count, &type);
+            }
 
             if (curr_len % type_size == 0) {
                 i++;
@@ -870,6 +899,8 @@ static inline int do_accumulate_op(void *source_buf, void *target_buf,
                 curr_loc = (void *) ((char *) curr_loc + type_extent * count);
                 curr_len -= type_size * count;
             }
+
+            accumulated_count += count;
         }
 
         MPID_Segment_free(segp);

http://git.mpich.org/mpich.git/commitdiff/c9435750d06e7a26418f3d018d1dfd8a75bee36c

commit c9435750d06e7a26418f3d018d1dfd8a75bee36c
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:24:26 2015 -0800

    Bug-fix: add FOP req types.
    
    This patch adds request types for the FOP operation, and calls the
    FOP request handler after the SRBuf is unpacked.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index 0ec66b9..b9d6a75 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -465,6 +465,8 @@ extern MPIDI_Process_t MPIDI_Process;
 #define MPIDI_REQUEST_TYPE_GET_ACCUM_RECV 11             /* target is receiving GACC data */
 #define MPIDI_REQUEST_TYPE_GET_ACCUM_RECV_DERIVED_DT 12  /* target is receiving derived DT info for GACC data */
 #define MPIDI_REQUEST_TYPE_GET_ACCUM_RESP 13             /* target is sending GACC response data */
+#define MPIDI_REQUEST_TYPE_FOP_RECV 14                   /* target is receiving FOP data */
+#define MPIDI_REQUEST_TYPE_FOP_RESP 15                   /* target is sending FOP response data */
 
 
 #define MPIDI_Request_get_type(req_)						\
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 2696ceb..b6ecbbf 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -350,6 +350,8 @@ int MPIDI_CH3_ReqHandler_FOPRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, i
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_FOPRECVCOMPLETE);
 
+    MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_FOP_RECV);
+
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
     MPID_Datatype_get_size_macro(rreq->dev.datatype, type_size);
@@ -359,6 +361,7 @@ int MPIDI_CH3_ReqHandler_FOPRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, i
     /* Create response request */
     resp_req = MPID_Request_create();
     MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+    MPIDI_Request_set_type(resp_req, MPIDI_REQUEST_TYPE_FOP_RESP);
     MPIU_Object_set_ref(resp_req, 1);
     resp_req->dev.OnFinal = MPIDI_CH3_ReqHandler_FOPSendComplete;
     resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_FOPSendComplete;
@@ -767,6 +770,9 @@ int MPIDI_CH3_ReqHandler_UnpackSRBufComplete(MPIDI_VC_t * vc, MPID_Request * rre
     else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RECV) {
         mpi_errno = MPIDI_CH3_ReqHandler_GaccumRecvComplete(vc, rreq, complete);
     }
+    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_FOP_RECV) {
+        mpi_errno = MPIDI_CH3_ReqHandler_FOPRecvComplete(vc, rreq, complete);
+    }
     else {
         /* mark data transfer as complete and decrement CC */
         MPIDI_CH3U_Request_complete(rreq);
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 250d6cd..c281573 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -1281,6 +1281,7 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
         req = MPID_Request_create();
         MPIU_Object_set_ref(req, 1);
+        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_FOP_RECV);
         *rreqp = req;
 
         req->dev.op = fop_pkt->op;

http://git.mpich.org/mpich.git/commitdiff/f75eb4eb666e1cbf249ab829e357ec1fcb8d49ca

commit f75eb4eb666e1cbf249ab829e357ec1fcb8d49ca
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:23:33 2015 -0800

    Correct the names of RMA request types.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index 684835a..0ec66b9 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -456,16 +456,15 @@ extern MPIDI_Process_t MPIDI_Process;
 #define MPIDI_REQUEST_TYPE_SSEND 3
 /* We need a BSEND type for persistent bsends (see mpid_startall.c) */
 #define MPIDI_REQUEST_TYPE_BSEND 4
-#define MPIDI_REQUEST_TYPE_PUT_RESP 5
-#define MPIDI_REQUEST_TYPE_GET_RESP 6
-#define MPIDI_REQUEST_TYPE_ACCUM_RESP 7
-#define MPIDI_REQUEST_TYPE_PUT_RESP_DERIVED_DT 8
-#define MPIDI_REQUEST_TYPE_GET_RESP_DERIVED_DT 9
-#define MPIDI_REQUEST_TYPE_ACCUM_RESP_DERIVED_DT 10
-#define MPIDI_REQUEST_TYPE_PT_SINGLE_PUT 11
-#define MPIDI_REQUEST_TYPE_PT_SINGLE_ACCUM 12
-#define MPIDI_REQUEST_TYPE_GET_ACCUM_RESP 13
-#define MPIDI_REQUEST_TYPE_GET_ACCUM_RESP_DERIVED_DT 14
+#define MPIDI_REQUEST_TYPE_PUT_RECV 5                    /* target is receiving PUT data */
+#define MPIDI_REQUEST_TYPE_GET_RESP 6                    /* target is sending GET response data */
+#define MPIDI_REQUEST_TYPE_ACCUM_RECV 7                  /* target is receiving ACC data */
+#define MPIDI_REQUEST_TYPE_PUT_RECV_DERIVED_DT 8         /* target is receiving derived DT info for PUT data */
+#define MPIDI_REQUEST_TYPE_GET_RECV_DERIVED_DT 9         /* target is receiving derived DT info for GET data */
+#define MPIDI_REQUEST_TYPE_ACCUM_RECV_DERIVED_DT 10      /* target is receiving derived DT info for ACC data */
+#define MPIDI_REQUEST_TYPE_GET_ACCUM_RECV 11             /* target is receiving GACC data */
+#define MPIDI_REQUEST_TYPE_GET_ACCUM_RECV_DERIVED_DT 12  /* target is receiving derived DT info for GACC data */
+#define MPIDI_REQUEST_TYPE_GET_ACCUM_RESP 13             /* target is sending GACC response data */
 
 
 #define MPIDI_Request_get_type(req_)						\
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 0dd1db2..2696ceb 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -162,7 +162,7 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq,
 
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
-    MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RESP);
+    MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RECV);
 
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
@@ -246,6 +246,7 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
     resp_req = MPID_Request_create();
     MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
     MPIU_Object_set_ref(resp_req, 1);
+    MPIDI_Request_set_type(resp_req, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);
 
     MPIU_CHKPMEM_MALLOC(resp_req->dev.user_buf, void *, rreq->dev.user_count * type_size,
                         mpi_errno, "GACC resp. buffer");
@@ -307,7 +308,7 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
     /* Mark get portion as handled */
     rreq->dev.resp_request_handle = MPI_REQUEST_NULL;
 
-    MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);
+    MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RECV);
 
     /* free the temporary buffer */
     MPIR_Type_get_true_extent_impl(rreq->dev.datatype, &true_lb, &true_extent);
@@ -464,7 +465,7 @@ int MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((unu
     create_derived_datatype(rreq, &new_dtp);
 
     /* update request to get the data */
-    MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_PUT_RESP);
+    MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_PUT_RECV);
     rreq->dev.datatype = new_dtp->handle;
     rreq->dev.recv_data_sz = new_dtp->size * rreq->dev.user_count;
 
@@ -514,7 +515,7 @@ int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((u
     create_derived_datatype(rreq, &new_dtp);
 
     /* update new request to get the data */
-    MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_ACCUM_RESP);
+    MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_ACCUM_RECV);
 
     /* first need to allocate tmp_buf to recv the data into */
 
@@ -580,7 +581,7 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((
     create_derived_datatype(rreq, &new_dtp);
 
     /* update new request to get the data */
-    MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);
+    MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_GET_ACCUM_RECV);
 
     /* first need to allocate tmp_buf to recv the data into */
 
@@ -757,13 +758,13 @@ int MPIDI_CH3_ReqHandler_UnpackSRBufComplete(MPIDI_VC_t * vc, MPID_Request * rre
 
     MPIDI_CH3U_Request_unpack_srbuf(rreq);
 
-    if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_PUT_RESP) {
+    if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_PUT_RECV) {
         mpi_errno = MPIDI_CH3_ReqHandler_PutRecvComplete(vc, rreq, complete);
     }
-    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RESP) {
+    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RECV) {
         mpi_errno = MPIDI_CH3_ReqHandler_AccumRecvComplete(vc, rreq, complete);
     }
-    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RESP) {
+    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RECV) {
         mpi_errno = MPIDI_CH3_ReqHandler_GaccumRecvComplete(vc, rreq, complete);
     }
     else {
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 4264743..250d6cd 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -254,7 +254,7 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         req->dev.OnFinal = MPIDI_CH3_ReqHandler_PutRecvComplete;
 
         if (MPIR_DATATYPE_IS_PREDEFINED(put_pkt->datatype)) {
-            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PUT_RESP);
+            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PUT_RECV);
             req->dev.datatype = put_pkt->datatype;
 
             req->dev.recv_data_sz = type_size * put_pkt->count;
@@ -279,7 +279,7 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         }
         else {
             /* derived datatype */
-            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PUT_RESP_DERIVED_DT);
+            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PUT_RECV_DERIVED_DT);
             req->dev.datatype = MPI_DATATYPE_NULL;
 
             req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
@@ -500,7 +500,7 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     else {
         /* derived datatype. first get the dtype_info and dataloop. */
 
-        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_RESP_DERIVED_DT);
+        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_RECV_DERIVED_DT);
         req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete;
         req->dev.OnFinal = 0;
         req->dev.user_buf = get_pkt->addr;
@@ -645,7 +645,7 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
         if (MPIR_DATATYPE_IS_PREDEFINED(accum_pkt->datatype)) {
-            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RESP);
+            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RECV);
             req->dev.datatype = accum_pkt->datatype;
 
             MPIR_Type_get_true_extent_impl(accum_pkt->datatype, &true_lb, &true_extent);
@@ -684,7 +684,7 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             }
         }
         else {
-            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RESP_DERIVED_DT);
+            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RECV_DERIVED_DT);
             req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete;
             req->dev.datatype = MPI_DATATYPE_NULL;
 
@@ -888,7 +888,7 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
         if (MPIR_DATATYPE_IS_PREDEFINED(get_accum_pkt->datatype)) {
-            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);
+            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_ACCUM_RECV);
             req->dev.datatype = get_accum_pkt->datatype;
 
             MPIR_Type_get_true_extent_impl(get_accum_pkt->datatype, &true_lb, &true_extent);
@@ -926,7 +926,7 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             }
         }
         else {
-            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP_DERIVED_DT);
+            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_ACCUM_RECV_DERIVED_DT);
             req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete;
             req->dev.datatype = MPI_DATATYPE_NULL;
 

http://git.mpich.org/mpich.git/commitdiff/d8eb8de278149d663b1ef191f10692a86fd3f834

commit d8eb8de278149d663b1ef191f10692a86fd3f834
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:22:24 2015 -0800

    Add stream_offset to ACC-related packets and request struct.
    
    Add a stream_offset field to the ACC-related packets and the request
    struct to record the current stream unit's starting position within
    the entire target data.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index 502ad5d..0ea8520 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -730,6 +730,8 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         stream_size = MPIR_MIN(stream_elem_count * predefined_dtp_size, rest_len);
         rest_len -= stream_size;
 
+        accum_pkt->info.metadata.stream_offset = stream_offset;
+
         mpi_errno =
             issue_from_origin_buffer_stream(rma_op, vc, stream_offset, stream_size, &curr_req);
         if (mpi_errno != MPI_SUCCESS)
@@ -937,6 +939,10 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         stream_size = MPIR_MIN(stream_elem_count * predefined_dtp_size, rest_len);
         rest_len -= stream_size;
 
+        get_accum_pkt->info.metadata.stream_offset = stream_offset;
+
+        resp_req->dev.stream_offset = stream_offset;
+
         mpi_errno =
             issue_from_origin_buffer_stream(rma_op, vc, stream_offset, stream_size, &curr_req);
         if (mpi_errno != MPI_SUCCESS)
diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index 0c4ff9e..ad7b841 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -522,16 +522,16 @@ MPIDI_CH3_PKT_DEFS
         err_ = MPI_SUCCESS;                                             \
         switch((pkt_).type) {                                           \
         case (MPIDI_CH3_PKT_PUT):                                       \
-            (pkt_).put.info.dataloop_size = (dataloop_size_);           \
+            (pkt_).put.info.metadata.dataloop_size = (dataloop_size_);  \
             break;                                                      \
         case (MPIDI_CH3_PKT_GET):                                       \
-            (pkt_).get.info.dataloop_size = (dataloop_size_);           \
+            (pkt_).get.info.metadata.dataloop_size = (dataloop_size_);  \
             break;                                                      \
         case (MPIDI_CH3_PKT_ACCUMULATE):                                \
-            (pkt_).accum.info.dataloop_size = (dataloop_size_);         \
+            (pkt_).accum.info.metadata.dataloop_size = (dataloop_size_); \
             break;                                                      \
         case (MPIDI_CH3_PKT_GET_ACCUM):                                 \
-            (pkt_).get_accum.info.dataloop_size = (dataloop_size_);     \
+            (pkt_).get_accum.info.metadata.dataloop_size = (dataloop_size_); \
             break;                                                      \
         default:                                                        \
             MPIU_ERR_SETANDJUMP1(err_, MPI_ERR_OTHER, "**invalidpkt", "**invalidpkt %d", (pkt_).type); \
@@ -594,7 +594,12 @@ typedef struct MPIDI_CH3_Pkt_put {
     MPI_Win target_win_handle;
     MPI_Win source_win_handle;
     union {
-        int dataloop_size;
+        /* note that we use struct here in order
+         * to consistently access dataloop_size
+         * by "pkt->info.metadata.dataloop_size". */
+        struct {
+            int dataloop_size;
+        } metadata;
         char data[MPIDI_RMA_IMMED_BYTES];
     } info;
 } MPIDI_CH3_Pkt_put_t;
@@ -608,8 +613,10 @@ typedef struct MPIDI_CH3_Pkt_get {
     struct {
         /* note that we use struct here in order
          * to consistently access dataloop_size
-         * by "pkt->info.dataloop_size". */
-        int dataloop_size;      /* for derived datatypes */
+         * by "pkt->info.metadata.dataloop_size". */
+        struct {
+            int dataloop_size;  /* for derived datatypes */
+        } metadata;
     } info;
     MPI_Request request_handle;
     MPI_Win target_win_handle;
@@ -640,7 +647,10 @@ typedef struct MPIDI_CH3_Pkt_accum {
     MPI_Win target_win_handle;
     MPI_Win source_win_handle;
     union {
-        int dataloop_size;
+        struct {
+            int dataloop_size;
+            MPI_Aint stream_offset;
+        } metadata;
         char data[MPIDI_RMA_IMMED_BYTES];
     } info;
 } MPIDI_CH3_Pkt_accum_t;
@@ -655,7 +665,10 @@ typedef struct MPIDI_CH3_Pkt_get_accum {
     MPI_Op op;
     MPI_Win target_win_handle;
     union {
-        int dataloop_size;
+        struct {
+            int dataloop_size;
+            MPI_Aint stream_offset;
+        } metadata;
         char data[MPIDI_RMA_IMMED_BYTES];
     } info;
 } MPIDI_CH3_Pkt_get_accum_t;
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index bd5e274..09d5d12 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -447,6 +447,9 @@ typedef struct MPIDI_Request {
     struct MPIDI_RMA_Lock_entry *lock_queue_entry;
     MPI_Request resp_request_handle; /* Handle for get_accumulate response */
 
+    MPI_Aint stream_offset; /* used when streaming ACC/GACC packets, specifying the start
+                               location of the current streaming unit. */
+
     MPIDI_REQUEST_SEQNUM
 
     /* Occasionally, when a message cannot be sent, we need to cache the
diff --git a/src/mpid/ch3/src/ch3u_request.c b/src/mpid/ch3/src/ch3u_request.c
index 910f297..65efbb6 100644
--- a/src/mpid/ch3/src/ch3u_request.c
+++ b/src/mpid/ch3/src/ch3u_request.c
@@ -94,6 +94,7 @@ MPID_Request * MPID_Request_create(void)
         req->dev.OnFinal           = NULL;
         req->dev.user_buf          = NULL;
         req->dev.drop_data         = FALSE;
+        req->dev.stream_offset     = 0;
 #ifdef MPIDI_CH3_REQUEST_INIT
 	MPIDI_CH3_REQUEST_INIT(req);
 #endif
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 0cc5c26..3482b8e 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -198,7 +198,7 @@ int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
             win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
         put_pkt->count = target_count;
         put_pkt->datatype = target_datatype;
-        put_pkt->info.dataloop_size = 0;
+        put_pkt->info.metadata.dataloop_size = 0;
         put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
         put_pkt->source_win_handle = win_ptr->handle;
         put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
@@ -387,7 +387,7 @@ int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
             win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
         get_pkt->count = target_count;
         get_pkt->datatype = target_datatype;
-        get_pkt->info.dataloop_size = 0;
+        get_pkt->info.metadata.dataloop_size = 0;
         get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
         get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
         if (use_immed_resp_pkt)
@@ -612,10 +612,11 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
             win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
         accum_pkt->count = target_count;
         accum_pkt->datatype = target_datatype;
-        accum_pkt->info.dataloop_size = 0;
+        accum_pkt->info.metadata.dataloop_size = 0;
         accum_pkt->op = op;
         accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
         accum_pkt->source_win_handle = win_ptr->handle;
+        accum_pkt->info.metadata.stream_offset = 0;
         accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
         if (use_immed_pkt) {
             void *src = (void *) origin_addr, *dest = (void *) (accum_pkt->info.data);
@@ -809,7 +810,7 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                 win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
             get_pkt->count = target_count;
             get_pkt->datatype = target_datatype;
-            get_pkt->info.dataloop_size = 0;
+            get_pkt->info.metadata.dataloop_size = 0;
             get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
             get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
             if (use_immed_resp_pkt == TRUE)
@@ -930,9 +931,10 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                 win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
             get_accum_pkt->count = target_count;
             get_accum_pkt->datatype = target_datatype;
-            get_accum_pkt->info.dataloop_size = 0;
+            get_accum_pkt->info.metadata.dataloop_size = 0;
             get_accum_pkt->op = op;
             get_accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
+            get_accum_pkt->info.metadata.stream_offset = 0;
             get_accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
             if (use_immed_pkt) {
                 void *src = (void *) origin_addr, *dest = (void *) (get_accum_pkt->info.data);
@@ -1344,7 +1346,7 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
                 win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
             get_pkt->count = 1;
             get_pkt->datatype = datatype;
-            get_pkt->info.dataloop_size = 0;
+            get_pkt->info.metadata.dataloop_size = 0;
             get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
             get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
             if (use_immed_resp_pkt == TRUE)
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 9061bb9..4264743 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -289,24 +289,24 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                                      "MPIDI_RMA_dtype_info");
             }
 
-            req->dev.dataloop = MPIU_Malloc(put_pkt->info.dataloop_size);
+            req->dev.dataloop = MPIU_Malloc(put_pkt->info.metadata.dataloop_size);
             if (!req->dev.dataloop) {
                 MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                     put_pkt->info.dataloop_size);
+                                     put_pkt->info.metadata.dataloop_size);
             }
 
             /* if we received all of the dtype_info and dataloop, copy it
              * now and call the handler, otherwise set the iov and let the
              * channel copy it */
-            if (data_len >= sizeof(MPIDI_RMA_dtype_info) + put_pkt->info.dataloop_size) {
+            if (data_len >= sizeof(MPIDI_RMA_dtype_info) + put_pkt->info.metadata.dataloop_size) {
                 /* copy all of dtype_info and dataloop */
                 MPIU_Memcpy(req->dev.dtype_info, data_buf, sizeof(MPIDI_RMA_dtype_info));
                 MPIU_Memcpy(req->dev.dataloop, data_buf + sizeof(MPIDI_RMA_dtype_info),
-                            put_pkt->info.dataloop_size);
+                            put_pkt->info.metadata.dataloop_size);
 
                 *buflen =
                     sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) +
-                    put_pkt->info.dataloop_size;
+                    put_pkt->info.metadata.dataloop_size;
 
                 /* All dtype data has been received, call req handler */
                 mpi_errno = MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete(vc, req, &complete);
@@ -321,7 +321,7 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                 req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) req->dev.dtype_info);
                 req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
                 req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
-                req->dev.iov[1].MPID_IOV_LEN = put_pkt->info.dataloop_size;
+                req->dev.iov[1].MPID_IOV_LEN = put_pkt->info.metadata.dataloop_size;
                 req->dev.iov_count = 2;
 
                 *buflen = sizeof(MPIDI_CH3_Pkt_t);
@@ -515,24 +515,24 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                                  "MPIDI_RMA_dtype_info");
         }
 
-        req->dev.dataloop = MPIU_Malloc(get_pkt->info.dataloop_size);
+        req->dev.dataloop = MPIU_Malloc(get_pkt->info.metadata.dataloop_size);
         if (!req->dev.dataloop) {
             MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                 get_pkt->info.dataloop_size);
+                                 get_pkt->info.metadata.dataloop_size);
         }
 
         /* if we received all of the dtype_info and dataloop, copy it
          * now and call the handler, otherwise set the iov and let the
          * channel copy it */
-        if (data_len >= sizeof(MPIDI_RMA_dtype_info) + get_pkt->info.dataloop_size) {
+        if (data_len >= sizeof(MPIDI_RMA_dtype_info) + get_pkt->info.metadata.dataloop_size) {
             /* copy all of dtype_info and dataloop */
             MPIU_Memcpy(req->dev.dtype_info, data_buf, sizeof(MPIDI_RMA_dtype_info));
             MPIU_Memcpy(req->dev.dataloop, data_buf + sizeof(MPIDI_RMA_dtype_info),
-                        get_pkt->info.dataloop_size);
+                        get_pkt->info.metadata.dataloop_size);
 
             *buflen =
                 sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) +
-                get_pkt->info.dataloop_size;
+                get_pkt->info.metadata.dataloop_size;
 
             /* All dtype data has been received, call req handler */
             mpi_errno = MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete(vc, req, &complete);
@@ -545,7 +545,7 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dtype_info;
             req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
             req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
-            req->dev.iov[1].MPID_IOV_LEN = get_pkt->info.dataloop_size;
+            req->dev.iov[1].MPID_IOV_LEN = get_pkt->info.metadata.dataloop_size;
             req->dev.iov_count = 2;
 
             *buflen = sizeof(MPIDI_CH3_Pkt_t);
@@ -635,6 +635,7 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         req->dev.target_win_handle = accum_pkt->target_win_handle;
         req->dev.source_win_handle = accum_pkt->source_win_handle;
         req->dev.flags = accum_pkt->flags;
+        req->dev.stream_offset = accum_pkt->info.metadata.stream_offset;
 
         req->dev.resp_request_handle = MPI_REQUEST_NULL;
         req->dev.OnFinal = MPIDI_CH3_ReqHandler_AccumRecvComplete;
@@ -694,21 +695,21 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                                      "MPIDI_RMA_dtype_info");
             }
 
-            req->dev.dataloop = MPIU_Malloc(accum_pkt->info.dataloop_size);
+            req->dev.dataloop = MPIU_Malloc(accum_pkt->info.metadata.dataloop_size);
             if (!req->dev.dataloop) {
                 MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                     accum_pkt->info.dataloop_size);
+                                     accum_pkt->info.metadata.dataloop_size);
             }
 
-            if (data_len >= sizeof(MPIDI_RMA_dtype_info) + accum_pkt->info.dataloop_size) {
+            if (data_len >= sizeof(MPIDI_RMA_dtype_info) + accum_pkt->info.metadata.dataloop_size) {
                 /* copy all of dtype_info and dataloop */
                 MPIU_Memcpy(req->dev.dtype_info, data_buf, sizeof(MPIDI_RMA_dtype_info));
                 MPIU_Memcpy(req->dev.dataloop, data_buf + sizeof(MPIDI_RMA_dtype_info),
-                            accum_pkt->info.dataloop_size);
+                            accum_pkt->info.metadata.dataloop_size);
 
                 *buflen =
                     sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) +
-                    accum_pkt->info.dataloop_size;
+                    accum_pkt->info.metadata.dataloop_size;
 
                 /* All dtype data has been received, call req handler */
                 mpi_errno = MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(vc, req, &complete);
@@ -723,7 +724,7 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                 req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dtype_info;
                 req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
                 req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
-                req->dev.iov[1].MPID_IOV_LEN = accum_pkt->info.dataloop_size;
+                req->dev.iov[1].MPID_IOV_LEN = accum_pkt->info.metadata.dataloop_size;
                 req->dev.iov_count = 2;
                 *buflen = sizeof(MPIDI_CH3_Pkt_t);
             }
@@ -877,6 +878,7 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         req->dev.real_user_buf = get_accum_pkt->addr;
         req->dev.target_win_handle = get_accum_pkt->target_win_handle;
         req->dev.flags = get_accum_pkt->flags;
+        req->dev.stream_offset = get_accum_pkt->info.metadata.stream_offset;
 
         req->dev.resp_request_handle = get_accum_pkt->request_handle;
         req->dev.OnFinal = MPIDI_CH3_ReqHandler_GaccumRecvComplete;
@@ -935,21 +937,22 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                                      "MPIDI_RMA_dtype_info");
             }
 
-            req->dev.dataloop = MPIU_Malloc(get_accum_pkt->info.dataloop_size);
+            req->dev.dataloop = MPIU_Malloc(get_accum_pkt->info.metadata.dataloop_size);
             if (!req->dev.dataloop) {
                 MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                     get_accum_pkt->info.dataloop_size);
+                                     get_accum_pkt->info.metadata.dataloop_size);
             }
 
-            if (data_len >= sizeof(MPIDI_RMA_dtype_info) + get_accum_pkt->info.dataloop_size) {
+            if (data_len >=
+                sizeof(MPIDI_RMA_dtype_info) + get_accum_pkt->info.metadata.dataloop_size) {
                 /* copy all of dtype_info and dataloop */
                 MPIU_Memcpy(req->dev.dtype_info, data_buf, sizeof(MPIDI_RMA_dtype_info));
                 MPIU_Memcpy(req->dev.dataloop, data_buf + sizeof(MPIDI_RMA_dtype_info),
-                            get_accum_pkt->info.dataloop_size);
+                            get_accum_pkt->info.metadata.dataloop_size);
 
                 *buflen =
                     sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) +
-                    get_accum_pkt->info.dataloop_size;
+                    get_accum_pkt->info.metadata.dataloop_size;
 
                 /* All dtype data has been received, call req handler */
                 mpi_errno = MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(vc, req, &complete);
@@ -964,7 +967,7 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                 req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dtype_info;
                 req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
                 req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
-                req->dev.iov[1].MPID_IOV_LEN = get_accum_pkt->info.dataloop_size;
+                req->dev.iov[1].MPID_IOV_LEN = get_accum_pkt->info.metadata.dataloop_size;
                 req->dev.iov_count = 2;
                 *buflen = sizeof(MPIDI_CH3_Pkt_t);
             }
@@ -1931,7 +1934,7 @@ int MPIDI_CH3_PktPrint_Put(FILE * fp, MPIDI_CH3_Pkt_t * pkt)
     MPIU_DBG_PRINTF((" addr ......... %p\n", pkt->put.addr));
     MPIU_DBG_PRINTF((" count ........ %d\n", pkt->put.count));
     MPIU_DBG_PRINTF((" datatype ..... 0x%08X\n", pkt->put.datatype));
-    MPIU_DBG_PRINTF((" dataloop_size. 0x%08X\n", pkt->put.info.dataloop_size));
+    MPIU_DBG_PRINTF((" dataloop_size. 0x%08X\n", pkt->put.info.metadata.dataloop_size));
     MPIU_DBG_PRINTF((" target ....... 0x%08X\n", pkt->put.target_win_handle));
     MPIU_DBG_PRINTF((" source ....... 0x%08X\n", pkt->put.source_win_handle));
     /*MPIU_DBG_PRINTF((" win_ptr ...... 0x%08X\n", pkt->put.win_ptr)); */
@@ -1944,7 +1947,7 @@ int MPIDI_CH3_PktPrint_Get(FILE * fp, MPIDI_CH3_Pkt_t * pkt)
     MPIU_DBG_PRINTF((" addr ......... %p\n", pkt->get.addr));
     MPIU_DBG_PRINTF((" count ........ %d\n", pkt->get.count));
     MPIU_DBG_PRINTF((" datatype ..... 0x%08X\n", pkt->get.datatype));
-    MPIU_DBG_PRINTF((" dataloop_size. %d\n", pkt->get.info.dataloop_size));
+    MPIU_DBG_PRINTF((" dataloop_size. %d\n", pkt->get.info.metadata.dataloop_size));
     MPIU_DBG_PRINTF((" request ...... 0x%08X\n", pkt->get.request_handle));
     MPIU_DBG_PRINTF((" target ....... 0x%08X\n", pkt->get.target_win_handle));
     MPIU_DBG_PRINTF((" source ....... 0x%08X\n", pkt->get.source_win_handle));
@@ -1969,7 +1972,7 @@ int MPIDI_CH3_PktPrint_Accumulate(FILE * fp, MPIDI_CH3_Pkt_t * pkt)
     MPIU_DBG_PRINTF((" addr ......... %p\n", pkt->accum.addr));
     MPIU_DBG_PRINTF((" count ........ %d\n", pkt->accum.count));
     MPIU_DBG_PRINTF((" datatype ..... 0x%08X\n", pkt->accum.datatype));
-    MPIU_DBG_PRINTF((" dataloop_size. %d\n", pkt->accum.info.dataloop_size));
+    MPIU_DBG_PRINTF((" dataloop_size. %d\n", pkt->accum.info.metadata.dataloop_size));
     MPIU_DBG_PRINTF((" op ........... 0x%08X\n", pkt->accum.op));
     MPIU_DBG_PRINTF((" target ....... 0x%08X\n", pkt->accum.target_win_handle));
     MPIU_DBG_PRINTF((" source ....... 0x%08X\n", pkt->accum.source_win_handle));

http://git.mpich.org/mpich.git/commitdiff/c986b927c32b09b98b1b740882c3bbf1a9aa04bc

commit c986b927c32b09b98b1b740882c3bbf1a9aa04bc
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:21:09 2015 -0800

    Add counter in op struct to remember number of stream units issued.
    
    Add a counter in op struct to remember number of stream units
    that have already been issued. For example, when the first stream
    unit piggybacked with LOCK is issued out, we temporarily stop
    issuing the following units. After the origin receives the ACK
    from the target, it can continue to issue the following units.
    This counter helps avoid issuing the first unit again.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index 3ec6042..502ad5d 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -706,10 +706,14 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);
 
     rest_len = total_len;
+    MPIU_Assert(rma_op->issued_stream_count >= 0);
     for (j = 0; j < stream_unit_count; j++) {
         MPIDI_msg_sz_t stream_offset, stream_size;
         MPID_Request *curr_req = NULL;
 
+        if (j < rma_op->issued_stream_count)
+            continue;
+
         accum_pkt->flags |= flags;
 
         if (j != 0) {
@@ -746,6 +750,8 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
             win_ptr->active_req_cnt++;
         }
 
+        rma_op->issued_stream_count++;
+
         if (accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
             accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
             /* if piggybacked with LOCK flag, we
@@ -755,6 +761,9 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         }
     }   /* end of for loop */
 
+    /* Mark that all stream units have been issued */
+    rma_op->issued_stream_count = -1;
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_ACC_OP);
     return mpi_errno;
@@ -877,11 +886,16 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     for (i = 0; i < rma_op->reqs_size; i++)
         rma_op->reqs[i] = NULL;
 
+    MPIU_Assert(rma_op->issued_stream_count >= 0);
+
     for (j = 0; j < stream_unit_count; j++) {
         MPIDI_msg_sz_t stream_offset, stream_size;
         MPID_Request *resp_req = NULL;
         MPID_Request *curr_req = NULL;
 
+        if (j < rma_op->issued_stream_count)
+            continue;
+
         get_accum_pkt->flags |= flags;
 
         if (j != 0) {
@@ -959,6 +973,8 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         rma_op->reqs[j] = curr_req;
         win_ptr->active_req_cnt++;
 
+        rma_op->issued_stream_count++;
+
         if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
             get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
             /* if piggybacked with LOCK flag, we
@@ -968,6 +984,9 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         }
     }   /* end of for loop */
 
+    /* Mark that all stream units have been issued */
+    rma_op->issued_stream_count = -1;
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_GET_ACC_OP);
     return mpi_errno;
diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index d145a92..e04a742 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -52,6 +52,7 @@ static inline MPIDI_RMA_Op_t *MPIDI_CH3I_Win_op_alloc(MPID_Win * win_ptr)
     e->ureq = NULL;
     e->is_dt = 0;
     e->piggyback_lock_candidate = 0;
+    e->issued_stream_count = 0;
 
     return e;
 }
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index ffdc0de..9495271 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -73,6 +73,9 @@ typedef struct MPIDI_RMA_Op {
     int is_dt;
     int piggyback_lock_candidate;
 
+    int issued_stream_count;    /* when >= 0, it specifies number of stream units that have been issued;
+                                 * when < 0, it means all stream units of this operation have been issued. */
+
     MPID_Request *ureq;
 } MPIDI_RMA_Op_t;
 
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index f82763e..12ace6b 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -655,6 +655,9 @@ static inline int adjust_op_piggybacked_with_lock(MPID_Win * win_ptr,
             MPIDI_CH3_PKT_RMA_ERASE_FLAGS(op->pkt, mpi_errno);
 
             target->next_op_to_issue = op;
+
+            op->issued_stream_count = 0;
+
             if (op_flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
                 target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
             else if (op_flags & MPIDI_RMA_SYNC_UNLOCK)
diff --git a/src/mpid/ch3/src/ch3u_rma_oplist.c b/src/mpid/ch3/src/ch3u_rma_oplist.c
index 64a361f..de3c0d2 100644
--- a/src/mpid/ch3/src/ch3u_rma_oplist.c
+++ b/src/mpid/ch3/src/ch3u_rma_oplist.c
@@ -370,7 +370,16 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
                                          * PUT/ACC operation. */
         }
 
-        target->next_op_to_issue = curr_op->next;
+        if ((curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
+             curr_op->pkt.type == MPIDI_CH3_PKT_GET_ACCUM) && curr_op->issued_stream_count > 0) {
+            /* For ACC-like operations, if not all stream units
+             * are issued out, we stick to the current operation,
+             * otherwise we move on to the next operation. */
+            target->next_op_to_issue = curr_op;
+        }
+        else
+            target->next_op_to_issue = curr_op->next;
+
         if (target->next_op_to_issue == NULL) {
             if (flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH || flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
                 /* We are done with ending sync, unset target's sync_flag. */

http://git.mpich.org/mpich.git/commitdiff/421f4359e9acd4a9ac986850b9f1c2028117febe

commit 421f4359e9acd4a9ac986850b9f1c2028117febe
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:20:11 2015 -0800

    Reset flags for stream unit within one RMA operation.
    
    For all stream units within one RMA operation, we only
    need to piggyback flags for the first operation to the
    first stream unit, and piggyback flags for the last
    operation to the last stream unit.
    
    Note that for operations piggybacked with LOCK flag, we
    should just issue the first stream unit, and wait until
    we receive ACK from the target to decide if we continue
    to issue the following units, or re-transmit the first
    unit.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index b68af87..3ec6042 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -712,6 +712,16 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
 
         accum_pkt->flags |= flags;
 
+        if (j != 0) {
+            accum_pkt->flags &= ~MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED;
+            accum_pkt->flags &= ~MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE;
+        }
+        if (j != stream_unit_count - 1) {
+            accum_pkt->flags &= ~MPIDI_CH3_PKT_FLAG_RMA_UNLOCK;
+            accum_pkt->flags &= ~MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
+            accum_pkt->flags &= ~MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER;
+        }
+
         stream_offset = j * stream_elem_count * predefined_dtp_size;
         stream_size = MPIR_MIN(stream_elem_count * predefined_dtp_size, rest_len);
         rest_len -= stream_size;
@@ -735,6 +745,14 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
             rma_op->reqs[j] = curr_req;
             win_ptr->active_req_cnt++;
         }
+
+        if (accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
+            accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
+            /* if piggybacked with LOCK flag, we
+             * only issue the first streaming unit */
+            MPIU_Assert(j == 0);
+            break;
+        }
     }   /* end of for loop */
 
   fn_exit:
@@ -866,6 +884,16 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
 
         get_accum_pkt->flags |= flags;
 
+        if (j != 0) {
+            get_accum_pkt->flags &= ~MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED;
+            get_accum_pkt->flags &= ~MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE;
+        }
+        if (j != stream_unit_count - 1) {
+            get_accum_pkt->flags &= ~MPIDI_CH3_PKT_FLAG_RMA_UNLOCK;
+            get_accum_pkt->flags &= ~MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
+            get_accum_pkt->flags &= ~MPIDI_CH3_PKT_FLAG_RMA_DECR_AT_COUNTER;
+        }
+
         /* Create a request for the GACC response.  Store the response buf, count, and
          * datatype in it, and pass the request's handle in the GACC packet. When the
          * response comes from the target, it will contain the request handle. */
@@ -930,6 +958,14 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
 
         rma_op->reqs[j] = curr_req;
         win_ptr->active_req_cnt++;
+
+        if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
+            get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
+            /* if piggybacked with LOCK flag, we
+             * only issue the first streaming unit */
+            MPIU_Assert(j == 0);
+            break;
+        }
     }   /* end of for loop */
 
   fn_exit:

http://git.mpich.org/mpich.git/commitdiff/0641e2f192208f8c0c0049c5ff61bbd0b5d9d9bb

commit 0641e2f192208f8c0c0049c5ff61bbd0b5d9d9bb
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:19:21 2015 -0800

    Make sure we only copy dataloop once for each RMA operation.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index 8cd2423..b68af87 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -85,6 +85,7 @@ static int fill_in_derived_dtp_info(MPIDI_RMA_Op_t * rma_op, MPID_Datatype * dtp
     rma_op->dtype_info.has_sticky_ub = dtp->has_sticky_ub;
     rma_op->dtype_info.has_sticky_lb = dtp->has_sticky_lb;
 
+    MPIU_Assert(rma_op->dataloop == NULL);
     MPIU_CHKPMEM_MALLOC(rma_op->dataloop, void *, dtp->dataloop_size, mpi_errno, "dataloop");
 
     MPIU_Memcpy(rma_op->dataloop, dtp->dataloop, dtp->dataloop_size);

http://git.mpich.org/mpich.git/commitdiff/382b04c46893fa5aea9ed6516909f24a9dfda087

commit 382b04c46893fa5aea9ed6516909f24a9dfda087
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:18:24 2015 -0800

    Cutting ACC and GACC messages.
    
    In this patch, we define the size of streaming unit the same
    as the SRBuf size (256 * 1024 bytes), and cut the ACC/GACC packet
    according to this size. The streaming unit always contains
    complete basic type data and does not contain partial basic
    type data.
    
    Note that we also increment the ref counter of the pointer
    to the derived datatype since multiple streaming units within
    one RMA operation will refer to it.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index 815a6c5..8cd2423 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -10,6 +10,9 @@
 #include "mpl_utlist.h"
 #include "mpid_rma_types.h"
 
+/* define ACC stream size as the SRBuf size */
+#define MPIDI_CH3U_Acc_stream_size MPIDI_CH3U_SRBuf_size
+
 /* =========================================================== */
 /*                    auxiliary functions                      */
 /* =========================================================== */
@@ -636,49 +639,102 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     MPIDI_VC_t *vc = NULL;
     MPID_Comm *comm_ptr = win_ptr->comm_ptr;
     MPIDI_CH3_Pkt_accum_t *accum_pkt = &rma_op->pkt.accum;
-    MPID_Request *curr_req = NULL;
-    int i, curr_req_index = 0;
+    int i, j;
+    MPI_Aint stream_elem_count, stream_unit_count;
+    MPI_Aint predefined_dtp_size, predefined_dtp_extent, predefined_dtp_count;
+    MPI_Aint total_len, rest_len;
+    MPI_Aint origin_dtp_size;
+    MPID_Datatype *origin_dtp_ptr = NULL;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_ACC_OP);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_ACC_OP);
 
-    accum_pkt->flags |= flags;
-
     MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
 
     if (rma_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE_IMMED) {
+        MPID_Request *curr_req = NULL;
+
+        accum_pkt->flags |= flags;
+
         /* All origin data is in packet header, issue the header. */
         MPIU_THREAD_CS_ENTER(CH3COMM, vc);
         mpi_errno = MPIDI_CH3_iStartMsg(vc, accum_pkt, sizeof(*accum_pkt), &curr_req);
         MPIU_THREAD_CS_EXIT(CH3COMM, vc);
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
+        if (curr_req != NULL) {
+            MPIU_Assert(rma_op->reqs_size == 0 && rma_op->reqs == NULL);
+
+            rma_op->reqs_size = 1;
+
+            rma_op->reqs =
+                (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
+            for (i = 0; i < rma_op->reqs_size; i++)
+                rma_op->reqs[i] = NULL;
+
+            rma_op->reqs[0] = curr_req;
+            win_ptr->active_req_cnt++;
+        }
+        goto fn_exit;
+    }
+
+    /* Get total length of origin data */
+    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_dtp_size);
+    total_len = origin_dtp_size * rma_op->origin_count;
+
+    /* Get size and count for predefined datatype elements */
+    if (MPIR_DATATYPE_IS_PREDEFINED(rma_op->origin_datatype)) {
+        predefined_dtp_size = origin_dtp_size;
+        predefined_dtp_count = rma_op->origin_count;
+        MPID_Datatype_get_extent_macro(rma_op->origin_datatype, predefined_dtp_extent);
     }
     else {
+        MPID_Datatype_get_ptr(rma_op->origin_datatype, origin_dtp_ptr);
+        MPIU_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->eltype != MPI_DATATYPE_NULL);
+        MPID_Datatype_get_size_macro(origin_dtp_ptr->eltype, predefined_dtp_size);
+        predefined_dtp_count = total_len / predefined_dtp_size;
+        MPID_Datatype_get_extent_macro(origin_dtp_ptr->eltype, predefined_dtp_extent);
+    }
+    MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 && predefined_dtp_extent > 0);
+
+    /* Calculate number of predefined elements in each stream unit, and
+     * total number of stream units. */
+    stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
+    stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
+    MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);
+
+    rest_len = total_len;
+    for (j = 0; j < stream_unit_count; j++) {
         MPIDI_msg_sz_t stream_offset, stream_size;
-        MPI_Aint origin_type_size;
+        MPID_Request *curr_req = NULL;
 
-        MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
+        accum_pkt->flags |= flags;
 
-        stream_offset = 0;
-        stream_size = origin_type_size * rma_op->origin_count;
+        stream_offset = j * stream_elem_count * predefined_dtp_size;
+        stream_size = MPIR_MIN(stream_elem_count * predefined_dtp_size, rest_len);
+        rest_len -= stream_size;
 
         mpi_errno =
             issue_from_origin_buffer_stream(rma_op, vc, stream_offset, stream_size, &curr_req);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
-    }
 
-    if (curr_req != NULL) {
-        rma_op->reqs_size = 1;
+        if (curr_req != NULL) {
+            if (rma_op->reqs_size == 0) {
+                MPIU_Assert(rma_op->reqs == NULL);
+                rma_op->reqs_size = stream_unit_count;
 
-        rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
-        for (i = 0; i < rma_op->reqs_size; i++)
-            rma_op->reqs[i] = NULL;
+                rma_op->reqs =
+                    (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
+                for (i = 0; i < rma_op->reqs_size; i++)
+                    rma_op->reqs[i] = NULL;
+            }
 
-        rma_op->reqs[curr_req_index] = curr_req;
-        win_ptr->active_req_cnt++;
-    }
+            rma_op->reqs[j] = curr_req;
+            win_ptr->active_req_cnt++;
+        }
+    }   /* end of for loop */
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_ACC_OP);
@@ -704,102 +760,176 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     MPIDI_VC_t *vc = NULL;
     MPID_Comm *comm_ptr = win_ptr->comm_ptr;
     MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &rma_op->pkt.get_accum;
-    MPID_Request *resp_req = NULL;
-    MPID_Request *curr_req = NULL;
-    int i, curr_req_index = 0;
+    int i, j;
+    MPI_Aint stream_elem_count, stream_unit_count;
+    MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
+    MPI_Aint total_len, rest_len;
+    MPI_Aint origin_dtp_size;
+    MPID_Datatype *origin_dtp_ptr = NULL;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_GET_ACC_OP);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_GET_ACC_OP);
 
-    rma_op->reqs_size = 1;
+    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
 
-    rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
-    for (i = 0; i < rma_op->reqs_size; i++)
-        rma_op->reqs[i] = NULL;
+    if (rma_op->pkt.type == MPIDI_CH3_PKT_GET_ACCUM_IMMED) {
+        MPID_Request *resp_req = NULL;
+        MPID_Request *curr_req = NULL;
 
-    /* Create a request for the GACC response.  Store the response buf, count, and
-     * datatype in it, and pass the request's handle in the GACC packet. When the
-     * response comes from the target, it will contain the request handle. */
-    resp_req = MPID_Request_create();
-    MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+        get_accum_pkt->flags |= flags;
 
-    MPIU_Object_set_ref(resp_req, 2);
+        rma_op->reqs_size = 1;
 
-    resp_req->dev.user_buf = rma_op->result_addr;
-    resp_req->dev.user_count = rma_op->result_count;
-    resp_req->dev.datatype = rma_op->result_datatype;
-    resp_req->dev.target_win_handle = MPI_WIN_NULL;
-    resp_req->dev.source_win_handle = win_ptr->handle;
+        rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
+        for (i = 0; i < rma_op->reqs_size; i++)
+            rma_op->reqs[i] = NULL;
 
-    if (!MPIR_DATATYPE_IS_PREDEFINED(resp_req->dev.datatype)) {
-        MPID_Datatype *result_dtp = NULL;
-        MPID_Datatype_get_ptr(resp_req->dev.datatype, result_dtp);
-        resp_req->dev.datatype_ptr = result_dtp;
-        /* this will cause the datatype to be freed when the
-         * request is freed. */
-    }
+        /* Create a request for the GACC response.  Store the response buf, count, and
+         * datatype in it, and pass the request's handle in the GACC packet. When the
+         * response comes from the target, it will contain the request handle. */
+        resp_req = MPID_Request_create();
+        MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
 
-    /* Note: Get_accumulate uses the same packet type as accumulate */
-    get_accum_pkt->request_handle = resp_req->handle;
+        MPIU_Object_set_ref(resp_req, 2);
 
-    get_accum_pkt->flags |= flags;
+        resp_req->dev.user_buf = rma_op->result_addr;
+        resp_req->dev.user_count = rma_op->result_count;
+        resp_req->dev.datatype = rma_op->result_datatype;
+        resp_req->dev.target_win_handle = MPI_WIN_NULL;
+        resp_req->dev.source_win_handle = win_ptr->handle;
 
-    MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
+        /* Note: Get_accumulate uses the same packet type as accumulate */
+        get_accum_pkt->request_handle = resp_req->handle;
 
-    if (rma_op->pkt.type == MPIDI_CH3_PKT_GET_ACCUM_IMMED) {
         /* All origin data is in packet header, issue the header. */
         MPIU_THREAD_CS_ENTER(CH3COMM, vc);
         mpi_errno = MPIDI_CH3_iStartMsg(vc, get_accum_pkt, sizeof(*get_accum_pkt), &curr_req);
         MPIU_THREAD_CS_EXIT(CH3COMM, vc);
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
+        if (curr_req != NULL) {
+            MPID_Request_release(curr_req);
+            curr_req = resp_req;
+        }
+        else {
+            curr_req = resp_req;
+        }
+
+        /* For error checking */
+        resp_req = NULL;
+
+        rma_op->reqs[0] = curr_req;
+        win_ptr->active_req_cnt++;
+
+        goto fn_exit;
+    }
+
+    /* Get total length of origin data */
+    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_dtp_size);
+    total_len = origin_dtp_size * rma_op->origin_count;
+
+    /* Get size and count for predefined datatype elements */
+    if (MPIR_DATATYPE_IS_PREDEFINED(rma_op->origin_datatype)) {
+        predefined_dtp_size = origin_dtp_size;
+        predefined_dtp_count = rma_op->origin_count;
+        MPID_Datatype_get_extent_macro(rma_op->origin_datatype, predefined_dtp_extent);
     }
     else {
+        MPID_Datatype_get_ptr(rma_op->origin_datatype, origin_dtp_ptr);
+        MPIU_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->eltype != MPI_DATATYPE_NULL);
+        MPID_Datatype_get_size_macro(origin_dtp_ptr->eltype, predefined_dtp_size);
+        predefined_dtp_count = total_len / predefined_dtp_size;
+        MPID_Datatype_get_extent_macro(origin_dtp_ptr->eltype, predefined_dtp_extent);
+    }
+    MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 && predefined_dtp_extent > 0);
+
+    /* Calculate number of predefined elements in each stream unit, and
+     * total number of stream units. */
+    stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
+    stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
+    MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);
+
+    rest_len = total_len;
+
+    rma_op->reqs_size = stream_unit_count;
+
+    rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
+    for (i = 0; i < rma_op->reqs_size; i++)
+        rma_op->reqs[i] = NULL;
+
+    for (j = 0; j < stream_unit_count; j++) {
         MPIDI_msg_sz_t stream_offset, stream_size;
-        MPI_Aint origin_type_size;
+        MPID_Request *resp_req = NULL;
+        MPID_Request *curr_req = NULL;
+
+        get_accum_pkt->flags |= flags;
+
+        /* Create a request for the GACC response.  Store the response buf, count, and
+         * datatype in it, and pass the request's handle in the GACC packet. When the
+         * response comes from the target, it will contain the request handle. */
+        resp_req = MPID_Request_create();
+        MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+
+        MPIU_Object_set_ref(resp_req, 2);
+
+        resp_req->dev.user_buf = rma_op->result_addr;
+        resp_req->dev.user_count = rma_op->result_count;
+        resp_req->dev.datatype = rma_op->result_datatype;
+        resp_req->dev.target_win_handle = MPI_WIN_NULL;
+        resp_req->dev.source_win_handle = win_ptr->handle;
+
+        if (!MPIR_DATATYPE_IS_PREDEFINED(resp_req->dev.datatype)) {
+            MPID_Datatype *result_dtp = NULL;
+            MPID_Datatype_get_ptr(resp_req->dev.datatype, result_dtp);
+            resp_req->dev.datatype_ptr = result_dtp;
+            /* this will cause the datatype to be freed when the
+             * request is freed. */
+        }
 
-        MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
+        /* Note: Get_accumulate uses the same packet type as accumulate */
+        get_accum_pkt->request_handle = resp_req->handle;
 
-        stream_offset = 0;
-        stream_size = origin_type_size * rma_op->origin_count;
+        stream_offset = j * stream_elem_count * predefined_dtp_size;
+        stream_size = MPIR_MIN(stream_elem_count * predefined_dtp_size, rest_len);
+        rest_len -= stream_size;
 
         mpi_errno =
             issue_from_origin_buffer_stream(rma_op, vc, stream_offset, stream_size, &curr_req);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
-    }
 
-    /* This operation can generate two requests; one for inbound and one for
-     * outbound data. */
-    if (curr_req != NULL) {
-        /* If we have both inbound and outbound requests (i.e. GACC
-         * operation), we need to ensure that the source buffer is
-         * available and that the response data has been received before
-         * informing the origin that this operation is complete.  Because
-         * the update needs to be done atomically at the target, they will
-         * not send back data until it has been received.  Therefore,
-         * completion of the response request implies that the send request
-         * has completed.
-         *
-         * Therefore: refs on the response request are set to two: one is
-         * held by the progress engine and the other by the RMA op
-         * completion code.  Refs on the outbound request are set to one;
-         * it will be completed by the progress engine.
-         */
-
-        MPID_Request_release(curr_req);
-        curr_req = resp_req;
-
-    }
-    else {
-        curr_req = resp_req;
-    }
+        /* This operation can generate two requests; one for inbound and one for
+         * outbound data. */
+        if (curr_req != NULL) {
+            /* If we have both inbound and outbound requests (i.e. GACC
+             * operation), we need to ensure that the source buffer is
+             * available and that the response data has been received before
+             * informing the origin that this operation is complete.  Because
+             * the update needs to be done atomically at the target, they will
+             * not send back data until it has been received.  Therefore,
+             * completion of the response request implies that the send request
+             * has completed.
+             *
+             * Therefore: refs on the response request are set to two: one is
+             * held by the progress engine and the other by the RMA op
+             * completion code.  Refs on the outbound request are set to one;
+             * it will be completed by the progress engine.
+             */
+
+            MPID_Request_release(curr_req);
+            curr_req = resp_req;
+        }
+        else {
+            curr_req = resp_req;
+        }
 
-    /* For error checking */
-    resp_req = NULL;
+        /* For error checking */
+        resp_req = NULL;
 
-    rma_op->reqs[curr_req_index] = curr_req;
-    win_ptr->active_req_cnt++;
+        rma_op->reqs[j] = curr_req;
+        win_ptr->active_req_cnt++;
+    }   /* end of for loop */
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_GET_ACC_OP);
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index bbf3d80..0cc5c26 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -508,6 +508,10 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
         size_t immed_len, len;
         int use_immed_pkt = FALSE;
         int is_origin_contig, is_target_contig;
+        MPI_Aint stream_elem_count, stream_unit_count;
+        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
+        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL;
+        int i;
 
         /* queue it up */
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
@@ -531,19 +535,47 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
         /* if source or target datatypes are derived, increment their
          * reference counts */
         if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
-            MPID_Datatype_get_ptr(origin_datatype, dtp);
-            MPID_Datatype_add_ref(dtp);
+            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
             new_ptr->is_dt = 1;
         }
         if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
-            MPID_Datatype_get_ptr(target_datatype, dtp);
-            MPID_Datatype_add_ref(dtp);
+            MPID_Datatype_get_ptr(target_datatype, target_dtp);
             new_ptr->is_dt = 1;
         }
 
         MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
         MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);
 
+        /* Get size and count for predefined datatype elements */
+        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
+            predefined_dtp_size = origin_type_size;
+            predefined_dtp_count = origin_count;
+            MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
+        }
+        else {
+            MPIU_Assert(origin_dtp->eltype != MPI_DATATYPE_NULL);
+            MPID_Datatype_get_size_macro(origin_dtp->eltype, predefined_dtp_size);
+            predefined_dtp_count = len / predefined_dtp_size;
+            MPID_Datatype_get_extent_macro(origin_dtp->eltype, predefined_dtp_extent);
+        }
+        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
+                    predefined_dtp_extent > 0);
+
+        /* Calculate number of predefined elements in each stream unit, and
+         * total number of stream units. */
+        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
+        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
+        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);
+
+        for (i = 0; i < stream_unit_count; i++) {
+            if (origin_dtp != NULL) {
+                MPID_Datatype_add_ref(origin_dtp);
+            }
+            if (target_dtp != NULL) {
+                MPID_Datatype_add_ref(target_dtp);
+            }
+        }
+
         MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
         MPID_Datatype_is_contig(target_datatype, &is_target_contig);
 
@@ -790,6 +822,10 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             size_t immed_len, orig_len;
             int use_immed_pkt = FALSE;
             int is_origin_contig, is_target_contig, is_result_contig;
+            MPI_Aint stream_elem_count, stream_unit_count;
+            MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
+            MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
+            int i;
 
             /******************** Setting operation struct areas ***********************/
 
@@ -809,24 +845,54 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             /* if source or target datatypes are derived, increment their
              * reference counts */
             if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
-                MPID_Datatype_get_ptr(origin_datatype, dtp);
-                MPID_Datatype_add_ref(dtp);
+                MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
                 new_ptr->is_dt = 1;
             }
             if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
-                MPID_Datatype_get_ptr(result_datatype, dtp);
-                MPID_Datatype_add_ref(dtp);
+                MPID_Datatype_get_ptr(result_datatype, target_dtp);
                 new_ptr->is_dt = 1;
             }
             if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
-                MPID_Datatype_get_ptr(target_datatype, dtp);
-                MPID_Datatype_add_ref(dtp);
+                MPID_Datatype_get_ptr(target_datatype, result_dtp);
                 new_ptr->is_dt = 1;
             }
 
             MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
             MPIU_Assign_trunc(orig_len, origin_count * origin_type_size, size_t);
 
+            /* Get size and count for predefined datatype elements */
+            if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
+                predefined_dtp_size = origin_type_size;
+                predefined_dtp_count = origin_count;
+                MPID_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
+            }
+            else {
+                MPIU_Assert(origin_dtp->eltype != MPI_DATATYPE_NULL);
+                MPID_Datatype_get_size_macro(origin_dtp->eltype, predefined_dtp_size);
+                predefined_dtp_count = orig_len / predefined_dtp_size;
+                MPID_Datatype_get_extent_macro(origin_dtp->eltype, predefined_dtp_extent);
+            }
+            MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
+                        predefined_dtp_extent > 0);
+
+            /* Calculate number of predefined elements in each stream unit, and
+             * total number of stream units. */
+            stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
+            stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
+            MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);
+
+            for (i = 0; i < stream_unit_count; i++) {
+                if (origin_dtp != NULL) {
+                    MPID_Datatype_add_ref(origin_dtp);
+                }
+                if (target_dtp != NULL) {
+                    MPID_Datatype_add_ref(target_dtp);
+                }
+                if (result_dtp != NULL) {
+                    MPID_Datatype_add_ref(result_dtp);
+                }
+            }
+
             MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
             MPID_Datatype_is_contig(target_datatype, &is_target_contig);
             MPID_Datatype_is_contig(result_datatype, &is_result_contig);

http://git.mpich.org/mpich.git/commitdiff/d3cbeab3e73089ffb8669bf10b26a5fdeffc04b1

commit d3cbeab3e73089ffb8669bf10b26a5fdeffc04b1
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:17:33 2015 -0800

    Split issue_from_origin_buffer into normal and stream version.
    
    The stream version of issue_from_origin_buffer is used in ACC/GACC
    operations. It allows the user to stream the data by passing
    stream_offset and stream_size to the function.
    
    The normal version of issue_from_origin_buffer is used in other
    RMA operations. It issues all the data as a whole.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index a82e78a..815a6c5 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -344,6 +344,226 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc,
 }
 
 
+#undef FUNCNAME
+#define FUNCNAME issue_from_origin_buffer_stream
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int issue_from_origin_buffer_stream(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc,
+                                           MPIDI_msg_sz_t stream_offset, MPIDI_msg_sz_t stream_size,
+                                           MPID_Request ** req_ptr)
+{
+    MPI_Datatype target_datatype;
+    MPID_Datatype *target_dtp = NULL, *origin_dtp = NULL;
+    int is_origin_contig;
+    MPID_IOV iov[MPID_IOV_LIMIT];
+    MPID_Request *req = NULL;
+    int count;
+    int *ints = NULL;
+    int *blocklens = NULL;
+    MPI_Aint *displaces = NULL;
+    MPI_Datatype *datatypes = NULL;
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_ISSUE_FROM_ORIGIN_BUFFER_STREAM);
+
+    MPIDI_FUNC_ENTER(MPID_STATE_ISSUE_FROM_ORIGIN_BUFFER_STREAM);
+
+    /* Judge if target datatype is derived datatype. */
+    MPIDI_CH3_PKT_RMA_GET_TARGET_DATATYPE(rma_op->pkt, target_datatype, mpi_errno);
+    if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
+        MPID_Datatype_get_ptr(target_datatype, target_dtp);
+
+        if (rma_op->dataloop == NULL) {
+            /* Fill derived datatype info. */
+            mpi_errno = fill_in_derived_dtp_info(rma_op, target_dtp);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+
+            /* Set dataloop size in pkt header */
+            MPIDI_CH3_PKT_RMA_SET_DATALOOP_SIZE(rma_op->pkt, target_dtp->dataloop_size, mpi_errno);
+        }
+    }
+
+    /* Judge if origin datatype is derived datatype. */
+    if (!MPIR_DATATYPE_IS_PREDEFINED(rma_op->origin_datatype)) {
+        MPID_Datatype_get_ptr(rma_op->origin_datatype, origin_dtp);
+    }
+
+    MPID_Datatype_is_contig(rma_op->origin_datatype, &is_origin_contig);
+
+    iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) & (rma_op->pkt);
+    iov[0].MPID_IOV_LEN = sizeof(rma_op->pkt);
+
+    if (target_dtp == NULL) {
+        /* basic datatype on target */
+        if (is_origin_contig) {
+            /* basic datatype on origin */
+            int iovcnt = 2;
+
+            iov[1].MPID_IOV_BUF =
+                (MPID_IOV_BUF_CAST) ((char *) rma_op->origin_addr + stream_offset);
+            iov[1].MPID_IOV_LEN = stream_size;
+
+            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+            mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, iovcnt, &req);
+            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
+            if (origin_dtp != NULL) {
+                if (req == NULL) {
+                    MPID_Datatype_release(origin_dtp);
+                }
+                else {
+                    /* this will cause the datatype to be freed when the request
+                     * is freed. */
+                    req->dev.datatype_ptr = origin_dtp;
+                }
+            }
+        }
+        else {
+            /* derived datatype on origin */
+            req = MPID_Request_create();
+            MPIU_ERR_CHKANDJUMP(req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+
+            MPIU_Object_set_ref(req, 2);
+            req->kind = MPID_REQUEST_SEND;
+
+            req->dev.segment_ptr = MPID_Segment_alloc();
+            MPIU_ERR_CHKANDJUMP1(req->dev.segment_ptr == NULL, mpi_errno,
+                                 MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
+
+            if (origin_dtp != NULL) {
+                req->dev.datatype_ptr = origin_dtp;
+                /* this will cause the datatype to be freed when the request
+                 * is freed. */
+            }
+            MPID_Segment_init(rma_op->origin_addr, rma_op->origin_count,
+                              rma_op->origin_datatype, req->dev.segment_ptr, 0);
+            req->dev.segment_first = stream_offset;
+            req->dev.segment_size = stream_offset + stream_size;
+
+            req->dev.OnFinal = 0;
+            req->dev.OnDataAvail = 0;
+
+            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+            mpi_errno = vc->sendNoncontig_fn(vc, req, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
+            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+        }
+    }
+    else {
+        /* derived datatype on target */
+        MPID_Datatype *combined_dtp = NULL;
+        MPID_Segment *segp = NULL;
+        DLOOP_VECTOR *dloop_vec = NULL;
+        MPID_Datatype *dtp = NULL;
+        int vec_len, i;
+        MPIDI_msg_sz_t first = stream_offset;
+        MPIDI_msg_sz_t last = stream_offset + stream_size;
+
+        req = MPID_Request_create();
+        if (req == NULL) {
+            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+        }
+
+        MPIU_Object_set_ref(req, 2);
+        req->kind = MPID_REQUEST_SEND;
+
+        req->dev.segment_ptr = MPID_Segment_alloc();
+        MPIU_ERR_CHKANDJUMP1(req->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER,
+                             "**nomem", "**nomem %s", "MPID_Segment_alloc");
+
+        /* create a new datatype containing the dtype_info, dataloop, and origin data */
+        segp = MPID_Segment_alloc();
+        MPIU_ERR_CHKANDJUMP1(segp == NULL, mpi_errno, MPI_ERR_OTHER,
+                             "**nomem", "**nomem %s", "MPID_Segment_alloc");
+
+        MPID_Segment_init(rma_op->origin_addr, rma_op->origin_count, rma_op->origin_datatype, segp,
+                          0);
+
+        MPID_Datatype_get_ptr(rma_op->origin_datatype, dtp);
+        vec_len = dtp->max_contig_blocks * rma_op->origin_count + 1;
+        dloop_vec = (DLOOP_VECTOR *) MPIU_Malloc(vec_len * sizeof(DLOOP_VECTOR));
+        /* --BEGIN ERROR HANDLING-- */
+        if (!dloop_vec) {
+            mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
+                                             FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0);
+            goto fn_fail;
+        }
+        /* --END ERROR HANDLING-- */
+
+        MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);
+
+        count = 2 + vec_len;
+
+        ints = (int *) MPIU_Malloc(sizeof(int) * (count + 1));
+        blocklens = &ints[1];
+        displaces = (MPI_Aint *) MPIU_Malloc(sizeof(MPI_Aint) * count);
+        datatypes = (MPI_Datatype *) MPIU_Malloc(sizeof(MPI_Datatype) * count);
+
+        ints[0] = count;
+
+        displaces[0] = MPIU_PtrToAint(&(rma_op->dtype_info));
+        blocklens[0] = sizeof(MPIDI_RMA_dtype_info);
+        datatypes[0] = MPI_BYTE;
+
+        displaces[1] = MPIU_PtrToAint(rma_op->dataloop);
+        MPIU_Assign_trunc(blocklens[1], target_dtp->dataloop_size, int);
+        datatypes[1] = MPI_BYTE;
+
+        for (i = 0; i < vec_len; i++) {
+            displaces[i + 2] = MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF);
+            MPIU_Assign_trunc(blocklens[i + 2], dloop_vec[i].DLOOP_VECTOR_LEN, int);
+            datatypes[i + 2] = MPI_BYTE;
+        }
+
+        MPID_Segment_free(segp);
+        MPIU_Free(dloop_vec);
+
+        mpi_errno = create_datatype(ints, displaces, datatypes, &combined_dtp);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+
+        req->dev.datatype_ptr = combined_dtp;
+        /* combined_datatype will be freed when request is freed */
+
+        MPID_Segment_init(MPI_BOTTOM, 1, combined_dtp->handle, req->dev.segment_ptr, 0);
+        req->dev.segment_first = 0;
+        req->dev.segment_size = combined_dtp->size;
+
+        req->dev.OnFinal = 0;
+        req->dev.OnDataAvail = 0;
+
+        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+        mpi_errno = vc->sendNoncontig_fn(vc, req, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
+        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
+        MPIU_Free(ints);
+        MPIU_Free(displaces);
+        MPIU_Free(datatypes);
+
+        /* we're done with the datatypes */
+        if (origin_dtp != NULL)
+            MPID_Datatype_release(origin_dtp);
+        MPID_Datatype_release(target_dtp);
+    }
+
+    (*req_ptr) = req;
+
+  fn_exit:
+    MPIDI_FUNC_EXIT(MPID_STATE_ISSUE_FROM_ORIGIN_BUFFER_STREAM);
+    return mpi_errno;
+  fn_fail:
+    if ((*req_ptr)) {
+        if ((*req_ptr)->dev.datatype_ptr)
+            MPID_Datatype_release((*req_ptr)->dev.datatype_ptr);
+        MPID_Request_release((*req_ptr));
+    }
+    (*req_ptr) = NULL;
+    goto fn_exit;
+}
+
+
 /* issue_put_op() issues PUT packet header and data. */
 #undef FUNCNAME
 #define FUNCNAME issue_put_op
@@ -435,7 +655,16 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
-        mpi_errno = issue_from_origin_buffer(rma_op, vc, &curr_req);
+        MPIDI_msg_sz_t stream_offset, stream_size;
+        MPI_Aint origin_type_size;
+
+        MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
+
+        stream_offset = 0;
+        stream_size = origin_type_size * rma_op->origin_count;
+
+        mpi_errno =
+            issue_from_origin_buffer_stream(rma_op, vc, stream_offset, stream_size, &curr_req);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -526,7 +755,16 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
-        mpi_errno = issue_from_origin_buffer(rma_op, vc, &curr_req);
+        MPIDI_msg_sz_t stream_offset, stream_size;
+        MPI_Aint origin_type_size;
+
+        MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
+
+        stream_offset = 0;
+        stream_size = origin_type_size * rma_op->origin_count;
+
+        mpi_errno =
+            issue_from_origin_buffer_stream(rma_op, vc, stream_offset, stream_size, &curr_req);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }

http://git.mpich.org/mpich.git/commitdiff/ca223da02146c11283eb4f8b4b37b41e5a4b8fc8

commit ca223da02146c11283eb4f8b4b37b41e5a4b8fc8
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:16:44 2015 -0800

    Make create_datatype be able to create more general new datatype.
    
    The original implementation of create_datatype can only generate
    a new datatype that describes 'dtype_info + dataloop + one data
    layout'. It does not support generating 'dtype_info + dataloop +
    multiple data layouts'. This patch generalizes the create_datatype
    function to support that case.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index 800615c..a82e78a 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -99,15 +99,11 @@ static int fill_in_derived_dtp_info(MPIDI_RMA_Op_t * rma_op, MPID_Datatype * dtp
 }
 
 
-/* create_datatype() creates a new struct datatype for the dtype_info
-   and the dataloop of the target datatype together with the user data */
 #undef FUNCNAME
 #define FUNCNAME create_datatype
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int create_datatype(const MPIDI_RMA_dtype_info * dtype_info,
-                           const void *dataloop, MPI_Aint dataloop_sz,
-                           const void *o_addr, int o_count, MPI_Datatype o_datatype,
+static int create_datatype(int *ints, MPI_Aint * displaces, MPI_Datatype * datatypes,
                            MPID_Datatype ** combined_dtp)
 {
     int mpi_errno = MPI_SUCCESS;
@@ -115,35 +111,17 @@ static int create_datatype(const MPIDI_RMA_dtype_info * dtype_info,
      * blocklens array with count prepended to it.  So blocklens
      * points to the 2nd element of ints to avoid having to copy
      * blocklens into ints later. */
-    int ints[4];
     int *blocklens = &ints[1];
-    MPI_Aint displaces[3];
-    MPI_Datatype datatypes[3];
-    const int count = 3;
     MPI_Datatype combined_datatype;
+    int count = ints[0];
     MPIDI_STATE_DECL(MPID_STATE_CREATE_DATATYPE);
 
     MPIDI_FUNC_ENTER(MPID_STATE_CREATE_DATATYPE);
 
-    /* create datatype */
-    displaces[0] = MPIU_PtrToAint(dtype_info);
-    blocklens[0] = sizeof(*dtype_info);
-    datatypes[0] = MPI_BYTE;
-
-    displaces[1] = MPIU_PtrToAint(dataloop);
-    MPIU_Assign_trunc(blocklens[1], dataloop_sz, int);
-    datatypes[1] = MPI_BYTE;
-
-    displaces[2] = MPIU_PtrToAint(o_addr);
-    blocklens[2] = o_count;
-    datatypes[2] = o_datatype;
-
     mpi_errno = MPID_Type_struct(count, blocklens, displaces, datatypes, &combined_datatype);
     if (mpi_errno)
         MPIU_ERR_POP(mpi_errno);
 
-    ints[0] = count;
-
     MPID_Datatype_get_ptr(combined_datatype, *combined_dtp);
     mpi_errno = MPID_Datatype_set_contents(*combined_dtp, MPI_COMBINER_STRUCT, count + 1,       /* ints (cnt,blklen) */
                                            count,       /* aints (disps) */
@@ -192,6 +170,11 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc,
     int is_origin_contig;
     MPID_IOV iov[MPID_IOV_LIMIT];
     MPID_Request *req = NULL;
+    int count;
+    int *ints = NULL;
+    int *blocklens = NULL;
+    MPI_Aint *displaces = NULL;
+    MPI_Datatype *datatypes = NULL;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_FROM_ORIGIN_BUFFER);
 
@@ -296,10 +279,27 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc,
 
         /* create a new datatype containing the dtype_info, dataloop, and origin data */
 
-        mpi_errno = create_datatype(&rma_op->dtype_info, rma_op->dataloop,
-                                    target_dtp->dataloop_size,
-                                    rma_op->origin_addr, rma_op->origin_count,
-                                    rma_op->origin_datatype, &combined_dtp);
+        count = 3;
+        ints = (int *) MPIU_Malloc(sizeof(int) * (count + 1));
+        blocklens = &ints[1];
+        displaces = (MPI_Aint *) MPIU_Malloc(sizeof(MPI_Aint) * count);
+        datatypes = (MPI_Datatype *) MPIU_Malloc(sizeof(MPI_Datatype) * count);
+
+        ints[0] = count;
+
+        displaces[0] = MPIU_PtrToAint(&(rma_op->dtype_info));
+        blocklens[0] = sizeof(MPIDI_RMA_dtype_info);
+        datatypes[0] = MPI_BYTE;
+
+        displaces[1] = MPIU_PtrToAint(rma_op->dataloop);
+        MPIU_Assign_trunc(blocklens[1], target_dtp->dataloop_size, int);
+        datatypes[1] = MPI_BYTE;
+
+        displaces[2] = MPIU_PtrToAint(rma_op->origin_addr);
+        blocklens[2] = rma_op->origin_count;
+        datatypes[2] = rma_op->origin_datatype;
+
+        mpi_errno = create_datatype(ints, displaces, datatypes, &combined_dtp);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
 
@@ -318,6 +318,10 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc,
         MPIU_THREAD_CS_EXIT(CH3COMM, vc);
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
 
+        MPIU_Free(ints);
+        MPIU_Free(displaces);
+        MPIU_Free(datatypes);
+
         /* we're done with the datatypes */
         if (origin_dtp != NULL)
             MPID_Datatype_release(origin_dtp);

http://git.mpich.org/mpich.git/commitdiff/bb0e602c8ae6c5a4edd660813f7d5c84080b7618

commit bb0e602c8ae6c5a4edd660813f7d5c84080b7618
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Fri Feb 20 22:41:42 2015 -0800

    use MPIDI_CH3U_Request_complete to complete user request.
    
    In the request handler, we should use MPIDI_CH3U_Request_complete
    to complete the user request instead of directly marking it
    as completed. This is because when one operation is cut
    into several packets, we must wait until all packets
    have completed before marking the user request as completed.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_handle_op_req.c b/src/mpid/ch3/src/ch3u_handle_op_req.c
index 9efb9fd..bca34d7 100644
--- a/src/mpid/ch3/src/ch3u_handle_op_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_op_req.c
@@ -27,8 +27,7 @@ int MPIDI_CH3_ReqHandler_ReqOpsComplete(MPIDI_VC_t * vc, MPID_Request * sreq, in
 
     /* Complete user request and release ref of completion handler.
      * Note that ch3 ref is released by later clean_up call. */
-    MPID_Request_set_completed(ureq);
-    MPID_Request_release(ureq);
+    MPIDI_CH3U_Request_complete(ureq);
 
     MPIDI_CH3U_Request_complete(sreq);
     *complete = TRUE;

http://git.mpich.org/mpich.git/commitdiff/ab8386e765337f7c7e67e4411aa42080508f0c63

commit ab8386e765337f7c7e67e4411aa42080508f0c63
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:15:34 2015 -0800

    Use a request array in RMA operation.
    
    Because we may cut one RMA operation into multiple packets,
    and each packet needs a request object to track its completion,
    we use a request array instead of a single request in the
    RMA operation structure.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index 32caade..800615c 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -351,13 +351,13 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     MPIDI_VC_t *vc = NULL;
     MPID_Comm *comm_ptr = win_ptr->comm_ptr;
     MPIDI_CH3_Pkt_put_t *put_pkt = &rma_op->pkt.put;
+    MPID_Request *curr_req = NULL;
+    int i, curr_req_index = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_PUT_OP);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_PUT_OP);
 
-    rma_op->request = NULL;
-
     put_pkt->flags |= flags;
 
     MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
@@ -365,24 +365,37 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     if (rma_op->pkt.type == MPIDI_CH3_PKT_PUT_IMMED) {
         /* All origin data is in packet header, issue the header. */
         MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno = MPIDI_CH3_iStartMsg(vc, put_pkt, sizeof(*put_pkt), &(rma_op->request));
+        mpi_errno = MPIDI_CH3_iStartMsg(vc, put_pkt, sizeof(*put_pkt), &curr_req);
         MPIU_THREAD_CS_EXIT(CH3COMM, vc);
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
-        mpi_errno = issue_from_origin_buffer(rma_op, vc, &(rma_op->request));
+        mpi_errno = issue_from_origin_buffer(rma_op, vc, &curr_req);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
 
-    if (rma_op->request != NULL)
+    if (curr_req != NULL) {
+        rma_op->reqs_size = 1;
+
+        rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
+        for (i = 0; i < rma_op->reqs_size; i++)
+            rma_op->reqs[i] = NULL;
+
+        rma_op->reqs[curr_req_index] = curr_req;
         win_ptr->active_req_cnt++;
+    }
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_PUT_OP);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
   fn_fail:
+    if (rma_op->reqs != NULL) {
+        MPIU_Free(rma_op->reqs);
+    }
+    rma_op->reqs = NULL;
+    rma_op->reqs_size = 0;
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -399,13 +412,13 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     MPIDI_VC_t *vc = NULL;
     MPID_Comm *comm_ptr = win_ptr->comm_ptr;
     MPIDI_CH3_Pkt_accum_t *accum_pkt = &rma_op->pkt.accum;
+    MPID_Request *curr_req = NULL;
+    int i, curr_req_index = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_ACC_OP);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_ACC_OP);
 
-    rma_op->request = NULL;
-
     accum_pkt->flags |= flags;
 
     MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
@@ -413,23 +426,36 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     if (rma_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE_IMMED) {
         /* All origin data is in packet header, issue the header. */
         MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno = MPIDI_CH3_iStartMsg(vc, accum_pkt, sizeof(*accum_pkt), &(rma_op->request));
+        mpi_errno = MPIDI_CH3_iStartMsg(vc, accum_pkt, sizeof(*accum_pkt), &curr_req);
         MPIU_THREAD_CS_EXIT(CH3COMM, vc);
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
-        mpi_errno = issue_from_origin_buffer(rma_op, vc, &(rma_op->request));
+        mpi_errno = issue_from_origin_buffer(rma_op, vc, &curr_req);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
 
-    if (rma_op->request != NULL)
+    if (curr_req != NULL) {
+        rma_op->reqs_size = 1;
+
+        rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
+        for (i = 0; i < rma_op->reqs_size; i++)
+            rma_op->reqs[i] = NULL;
+
+        rma_op->reqs[curr_req_index] = curr_req;
         win_ptr->active_req_cnt++;
+    }
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_ACC_OP);
     return mpi_errno;
   fn_fail:
+    if (rma_op->reqs != NULL) {
+        MPIU_Free(rma_op->reqs);
+    }
+    rma_op->reqs = NULL;
+    rma_op->reqs_size = 0;
     goto fn_exit;
 }
 
@@ -446,12 +472,18 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     MPID_Comm *comm_ptr = win_ptr->comm_ptr;
     MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &rma_op->pkt.get_accum;
     MPID_Request *resp_req = NULL;
+    MPID_Request *curr_req = NULL;
+    int i, curr_req_index = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_GET_ACC_OP);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_GET_ACC_OP);
 
-    rma_op->request = NULL;
+    rma_op->reqs_size = 1;
+
+    rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
+    for (i = 0; i < rma_op->reqs_size; i++)
+        rma_op->reqs[i] = NULL;
 
     /* Create a request for the GACC response.  Store the response buf, count, and
      * datatype in it, and pass the request's handle in the GACC packet. When the
@@ -485,20 +517,19 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     if (rma_op->pkt.type == MPIDI_CH3_PKT_GET_ACCUM_IMMED) {
         /* All origin data is in packet header, issue the header. */
         MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno =
-            MPIDI_CH3_iStartMsg(vc, get_accum_pkt, sizeof(*get_accum_pkt), &(rma_op->request));
+        mpi_errno = MPIDI_CH3_iStartMsg(vc, get_accum_pkt, sizeof(*get_accum_pkt), &curr_req);
         MPIU_THREAD_CS_EXIT(CH3COMM, vc);
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
-        mpi_errno = issue_from_origin_buffer(rma_op, vc, &(rma_op->request));
+        mpi_errno = issue_from_origin_buffer(rma_op, vc, &curr_req);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
 
     /* This operation can generate two requests; one for inbound and one for
      * outbound data. */
-    if (rma_op->request != NULL) {
+    if (curr_req != NULL) {
         /* If we have both inbound and outbound requests (i.e. GACC
          * operation), we need to ensure that the source buffer is
          * available and that the response data has been received before
@@ -514,28 +545,35 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
          * it will be completed by the progress engine.
          */
 
-        MPID_Request_release(rma_op->request);
-        rma_op->request = resp_req;
+        MPID_Request_release(curr_req);
+        curr_req = resp_req;
 
     }
     else {
-        rma_op->request = resp_req;
+        curr_req = resp_req;
     }
 
     /* For error checking */
     resp_req = NULL;
 
-    if (rma_op->request != NULL)
-        win_ptr->active_req_cnt++;
+    rma_op->reqs[curr_req_index] = curr_req;
+    win_ptr->active_req_cnt++;
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_GET_ACC_OP);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
   fn_fail:
-    if (resp_req != NULL) {
-        MPID_Request_release(resp_req);
+    for (i = 0; i < rma_op->reqs_size; i++) {
+        if (rma_op->reqs[i] != NULL) {
+            MPIDI_CH3_Request_destroy(rma_op->reqs[i]);
+        }
     }
+    if (rma_op->reqs != NULL) {
+        MPIU_Free(rma_op->reqs);
+    }
+    rma_op->reqs = NULL;
+    rma_op->reqs_size = 0;
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -555,35 +593,43 @@ static int issue_get_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     MPID_Datatype *dtp;
     MPI_Datatype target_datatype;
     MPID_Request *req = NULL;
+    MPID_Request *curr_req = NULL;
+    int i, curr_req_index = 0;
     MPID_IOV iov[MPID_IOV_LIMIT];
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_GET_OP);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_GET_OP);
 
+    rma_op->reqs_size = 1;
+
+    rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
+    for (i = 0; i < rma_op->reqs_size; i++)
+        rma_op->reqs[i] = NULL;
+
     /* create a request, store the origin buf, cnt, datatype in it,
      * and pass a handle to it in the get packet. When the get
      * response comes from the target, it will contain the request
      * handle. */
-    rma_op->request = MPID_Request_create();
-    if (rma_op->request == NULL) {
+    curr_req = MPID_Request_create();
+    if (curr_req == NULL) {
         MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
     }
 
-    MPIU_Object_set_ref(rma_op->request, 2);
+    MPIU_Object_set_ref(curr_req, 2);
 
-    rma_op->request->dev.user_buf = rma_op->origin_addr;
-    rma_op->request->dev.user_count = rma_op->origin_count;
-    rma_op->request->dev.datatype = rma_op->origin_datatype;
-    rma_op->request->dev.target_win_handle = MPI_WIN_NULL;
-    rma_op->request->dev.source_win_handle = win_ptr->handle;
-    if (!MPIR_DATATYPE_IS_PREDEFINED(rma_op->request->dev.datatype)) {
-        MPID_Datatype_get_ptr(rma_op->request->dev.datatype, dtp);
-        rma_op->request->dev.datatype_ptr = dtp;
+    curr_req->dev.user_buf = rma_op->origin_addr;
+    curr_req->dev.user_count = rma_op->origin_count;
+    curr_req->dev.datatype = rma_op->origin_datatype;
+    curr_req->dev.target_win_handle = MPI_WIN_NULL;
+    curr_req->dev.source_win_handle = win_ptr->handle;
+    if (!MPIR_DATATYPE_IS_PREDEFINED(curr_req->dev.datatype)) {
+        MPID_Datatype_get_ptr(curr_req->dev.datatype, dtp);
+        curr_req->dev.datatype_ptr = dtp;
         /* this will cause the datatype to be freed when the
          * request is freed. */
     }
 
-    get_pkt->request_handle = rma_op->request->handle;
+    get_pkt->request_handle = curr_req->handle;
 
     get_pkt->flags |= flags;
 
@@ -633,14 +679,24 @@ static int issue_get_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         MPID_Request_release(req);
     }
 
-    if (rma_op->request != NULL)
-        win_ptr->active_req_cnt++;
+    rma_op->reqs[curr_req_index] = curr_req;
+    win_ptr->active_req_cnt++;
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_GET_OP);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
   fn_fail:
+    for (i = 0; i < rma_op->reqs_size; i++) {
+        if (rma_op->reqs[i] != NULL) {
+            MPIDI_CH3_Request_destroy(rma_op->reqs[i]);
+        }
+    }
+    if (rma_op->reqs != NULL) {
+        MPIU_Free(rma_op->reqs);
+    }
+    rma_op->reqs = NULL;
+    rma_op->reqs_size = 0;
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -658,28 +714,36 @@ static int issue_cas_op(MPIDI_RMA_Op_t * rma_op,
     MPID_Comm *comm_ptr = win_ptr->comm_ptr;
     MPIDI_CH3_Pkt_cas_t *cas_pkt = &rma_op->pkt.cas;
     MPID_Request *rmw_req = NULL;
+    MPID_Request *curr_req = NULL;
+    int i, curr_req_index = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_CAS_OP);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_CAS_OP);
 
+    rma_op->reqs_size = 1;
+
+    rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
+    for (i = 0; i < rma_op->reqs_size; i++)
+        rma_op->reqs[i] = NULL;
+
     /* Create a request for the RMW response.  Store the origin buf, count, and
      * datatype in it, and pass the request's handle RMW packet. When the
      * response comes from the target, it will contain the request handle. */
-    rma_op->request = MPID_Request_create();
-    MPIU_ERR_CHKANDJUMP(rma_op->request == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+    curr_req = MPID_Request_create();
+    MPIU_ERR_CHKANDJUMP(curr_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
 
     /* Set refs on the request to 2: one for the response message, and one for
      * the partial completion handler */
-    MPIU_Object_set_ref(rma_op->request, 2);
+    MPIU_Object_set_ref(curr_req, 2);
 
-    rma_op->request->dev.user_buf = rma_op->result_addr;
-    rma_op->request->dev.datatype = rma_op->result_datatype;
+    curr_req->dev.user_buf = rma_op->result_addr;
+    curr_req->dev.datatype = rma_op->result_datatype;
 
-    rma_op->request->dev.target_win_handle = MPI_WIN_NULL;
-    rma_op->request->dev.source_win_handle = win_ptr->handle;
+    curr_req->dev.target_win_handle = MPI_WIN_NULL;
+    curr_req->dev.source_win_handle = win_ptr->handle;
 
-    cas_pkt->request_handle = rma_op->request->handle;
+    cas_pkt->request_handle = curr_req->handle;
     cas_pkt->flags |= flags;
 
     MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
@@ -692,21 +756,24 @@ static int issue_cas_op(MPIDI_RMA_Op_t * rma_op,
         MPID_Request_release(rmw_req);
     }
 
-    if (rma_op->request != NULL)
-        win_ptr->active_req_cnt++;
+    rma_op->reqs[curr_req_index] = curr_req;
+    win_ptr->active_req_cnt++;
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_CAS_OP);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
   fn_fail:
-    if (rma_op->request) {
-        MPID_Request_release(rma_op->request);
+    for (i = 0; i < rma_op->reqs_size; i++) {
+        if (rma_op->reqs[i] != NULL) {
+            MPIDI_CH3_Request_destroy(rma_op->reqs[i]);
+        }
     }
-    rma_op->request = NULL;
-    if (rmw_req) {
-        MPID_Request_release(rmw_req);
+    if (rma_op->reqs != NULL) {
+        MPIU_Free(rma_op->reqs);
     }
+    rma_op->reqs = NULL;
+    rma_op->reqs_size = 0;
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -724,12 +791,18 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
     MPID_Comm *comm_ptr = win_ptr->comm_ptr;
     MPIDI_CH3_Pkt_fop_t *fop_pkt = &rma_op->pkt.fop;
     MPID_Request *resp_req = NULL;
+    MPID_Request *curr_req = NULL;
+    int i, curr_req_index = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_FOP_OP);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_ISSUE_FOP_OP);
 
-    rma_op->request = NULL;
+    rma_op->reqs_size = 1;
+
+    rma_op->reqs = (MPID_Request **) MPIU_Malloc(sizeof(MPID_Request *) * rma_op->reqs_size);
+    for (i = 0; i < rma_op->reqs_size; i++)
+        rma_op->reqs[i] = NULL;
 
     /* Create a request for the GACC response.  Store the response buf, count, and
      * datatype in it, and pass the request's handle in the GACC packet. When the
@@ -753,19 +826,19 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
     if (rma_op->pkt.type == MPIDI_CH3_PKT_FOP_IMMED) {
         /* All origin data is in packet header, issue the header. */
         MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_pkt, sizeof(*fop_pkt), &(rma_op->request));
+        mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_pkt, sizeof(*fop_pkt), &curr_req);
         MPIU_THREAD_CS_EXIT(CH3COMM, vc);
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
-        mpi_errno = issue_from_origin_buffer(rma_op, vc, &(rma_op->request));
+        mpi_errno = issue_from_origin_buffer(rma_op, vc, &curr_req);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
 
     /* This operation can generate two requests; one for inbound and one for
      * outbound data. */
-    if (rma_op->request != NULL) {
+    if (curr_req != NULL) {
         /* If we have both inbound and outbound requests (i.e. GACC
          * operation), we need to ensure that the source buffer is
          * available and that the response data has been received before
@@ -781,27 +854,34 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
          * it will be completed by the progress engine.
          */
 
-        MPID_Request_release(rma_op->request);
-        rma_op->request = resp_req;
+        MPID_Request_release(curr_req);
+        curr_req = resp_req;
     }
     else {
-        rma_op->request = resp_req;
+        curr_req = resp_req;
     }
 
     /* For error checking */
     resp_req = NULL;
 
-    if (rma_op->request != NULL)
-        win_ptr->active_req_cnt++;
+    rma_op->reqs[curr_req_index] = curr_req;
+    win_ptr->active_req_cnt++;
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_FOP_OP);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
   fn_fail:
-    if (resp_req != NULL) {
-        MPID_Request_release(resp_req);
+    for (i = 0; i < rma_op->reqs_size; i++) {
+        if (rma_op->reqs[i] != NULL) {
+            MPIDI_CH3_Request_destroy(rma_op->reqs[i]);
+        }
     }
+    if (rma_op->reqs != NULL) {
+        MPIU_Free(rma_op->reqs);
+    }
+    rma_op->reqs = NULL;
+    rma_op->reqs_size = 0;
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -866,6 +946,7 @@ static inline int issue_rma_op(MPIDI_RMA_Op_t * op_ptr, MPID_Win * win_ptr,
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 static inline int set_user_req_after_issuing_op(MPIDI_RMA_Op_t * op)
 {
+    int i, incomplete_req_cnt = 0;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_SET_USER_REQ_AFTER_ISSUING_OP);
 
@@ -874,7 +955,8 @@ static inline int set_user_req_after_issuing_op(MPIDI_RMA_Op_t * op)
     if (op->ureq == NULL)
         goto fn_exit;
 
-    if (!op->request) {
+    if (op->reqs_size == 0) {
+        MPIU_Assert(op->reqs == NULL);
         /* Sending is completed immediately, complete user request
          * and release ch3 ref. */
 
@@ -885,33 +967,50 @@ static inline int set_user_req_after_issuing_op(MPIDI_RMA_Op_t * op)
     else {
         /* Sending is not completed immediately. */
 
-        /* Setup user request info in order to be completed following send request. */
-
-        /* Increase ref for completion handler */
-        MPIU_Object_add_ref(op->ureq);
-        op->request->dev.request_handle = op->ureq->handle;
+        for (i = 0; i < op->reqs_size; i++) {
+            if (op->reqs[i] == NULL || MPID_Request_is_complete(op->reqs[i]))
+                continue;
+
+            /* Setup user request info in order to be completed following send request. */
+            incomplete_req_cnt++;
+            MPID_cc_set(&(op->ureq->cc), incomplete_req_cnt);   /* increment CC counter */
+
+            op->reqs[i]->dev.request_handle = op->ureq->handle;
+
+            /* Setup user request completion handler.
+             *
+             * The handler is triggered when send request is completed at
+             * following places:
+             * - progress engine: complete PUT/ACC req.
+             * - GET/GET_ACC packet handler: complete GET/GET_ACC reqs.
+             *
+             * We always set OnFinal which should be called when sending or
+             * receiving the last segment. However, short put/acc ops are
+             * issued in one packet and the lower layer only check OnDataAvail
+             * so we have to set OnDataAvail as well.
+             *
+             * Note that a noncontig send also uses OnDataAvail to loop all
+             * segments but it must be changed to OnFinal when sending the
+             * last segment, so it is also correct for us.
+             *
+             * TODO: implement stack for overriding functions*/
+            if (op->reqs[i]->dev.OnDataAvail == NULL) {
+                op->reqs[i]->dev.OnDataAvail = MPIDI_CH3_ReqHandler_ReqOpsComplete;
+            }
+            op->reqs[i]->dev.OnFinal = MPIDI_CH3_ReqHandler_ReqOpsComplete;
+        }       /* end of for loop */
 
-        /* Setup user request completion handler.
-         *
-         * The handler is triggered when send request is completed at
-         * following places:
-         * - progress engine: complete PUT/ACC req.
-         * - GET/GET_ACC packet handler: complete GET/GET_ACC reqs.
-         *
-         * We always set OnFinal which should be called when sending or
-         * receiving the last segment. However, short put/acc ops are
-         * issued in one packet and the lower layer only check OnDataAvail
-         * so we have to set OnDataAvail as well.
-         *
-         * Note that a noncontig send also uses OnDataAvail to loop all
-         * segments but it must be changed to OnFinal when sending the
-         * last segment, so it is also correct for us.
-         *
-         * TODO: implement stack for overriding functions*/
-        if (op->request->dev.OnDataAvail == NULL) {
-            op->request->dev.OnDataAvail = MPIDI_CH3_ReqHandler_ReqOpsComplete;
+        if (incomplete_req_cnt) {
+            /* Increase ref for completion handler */
+            MPIU_Object_add_ref(op->ureq);
+        }
+        else {
+            /* all requests are completed */
+            /* Complete user request and release ch3 ref */
+            MPID_Request_set_completed(op->ureq);
+            MPID_Request_release(op->ureq);
+            op->ureq = NULL;
         }
-        op->request->dev.OnFinal = MPIDI_CH3_ReqHandler_ReqOpsComplete;
     }
 
   fn_exit:
diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index d0105d8..d145a92 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -47,7 +47,8 @@ static inline MPIDI_RMA_Op_t *MPIDI_CH3I_Win_op_alloc(MPID_Win * win_ptr)
     }
 
     e->dataloop = NULL;
-    e->request = NULL;
+    e->reqs = NULL;
+    e->reqs_size = 0;
     e->ureq = NULL;
     e->is_dt = 0;
     e->piggyback_lock_candidate = 0;
@@ -334,6 +335,7 @@ static inline int MPIDI_CH3I_RMA_Cleanup_ops_target(MPID_Win * win_ptr, MPIDI_RM
     MPIDI_RMA_Op_t **op_list = NULL, **op_list_tail = NULL;
     int read_flag = 0, write_flag = 0;
     int mpi_errno = MPI_SUCCESS;
+    int i;
 
     (*local_completed) = 0;
     (*remote_completed) = 0;
@@ -375,14 +377,27 @@ static inline int MPIDI_CH3I_RMA_Cleanup_ops_target(MPID_Win * win_ptr, MPIDI_RM
 
     curr_op = *op_list;
     while (curr_op != NULL) {
-        if (MPID_Request_is_complete(curr_op->request)) {
-            /* If there's an error, return it */
-            mpi_errno = curr_op->request->status.MPI_ERROR;
-            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
+        for (i = 0; i < curr_op->reqs_size; i++) {
+            if (curr_op->reqs[i] == NULL)
+                continue;
 
-            /* No errors, free the request */
-            MPID_Request_release(curr_op->request);
+            if (MPID_Request_is_complete(curr_op->reqs[i])) {
+                /* If there's an error, return it */
+                mpi_errno = curr_op->reqs[i]->status.MPI_ERROR;
+                MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
 
+                /* No errors, free the request */
+                MPID_Request_release(curr_op->reqs[i]);
+
+                curr_op->reqs[i] = NULL;
+
+                win_ptr->active_req_cnt--;
+            }
+            else
+                break;
+        }
+
+        if (i == curr_op->reqs_size) {
             /* Release user request */
             if (curr_op->ureq) {
                 /* User request must be completed by progress engine */
@@ -392,10 +407,14 @@ static inline int MPIDI_CH3I_RMA_Cleanup_ops_target(MPID_Win * win_ptr, MPIDI_RM
                 MPID_Request_release(curr_op->ureq);
             }
 
+            /* free request array in op struct */
+            MPIU_Free(curr_op->reqs);
+            curr_op->reqs = NULL;
+            curr_op->reqs_size = 0;
+
             /* dequeue the operation and free it */
             MPL_LL_DELETE(*op_list, *op_list_tail, curr_op);
             MPIDI_CH3I_Win_op_free(win_ptr, curr_op);
-            win_ptr->active_req_cnt--;
 
             if (*op_list == NULL) {
                 if (read_flag == 1) {
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index f36a957..ffdc0de 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -60,7 +60,9 @@ typedef struct MPIDI_RMA_Op {
     int result_count;
     MPI_Datatype result_datatype;
 
-    struct MPID_Request *request;
+    struct MPID_Request **reqs;
+    int reqs_size;
+
     MPIDI_RMA_dtype_info dtype_info;
     void *dataloop;
 
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index cf75f99..f82763e 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -585,6 +585,7 @@ static inline int adjust_op_piggybacked_with_lock(MPID_Win * win_ptr,
     MPIDI_RMA_Target_t *target = NULL;
     MPIDI_RMA_Op_t *op = NULL;
     MPIDI_CH3_Pkt_flags_t op_flags = MPIDI_CH3_PKT_FLAG_NONE;
+    int i;
     int mpi_errno = MPI_SUCCESS;
 
     mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, target_rank, &target);
@@ -607,7 +608,8 @@ static inline int adjust_op_piggybacked_with_lock(MPID_Win * win_ptr,
                     MPIU_ERR_POP(mpi_errno);
             }
 
-            if (!op->request) {
+            if (op->reqs_size == 0) {
+                MPIU_Assert(op->reqs == NULL);
                 MPIDI_CH3I_RMA_Ops_free_elem(win_ptr, &(target->pending_op_list),
                                              &(target->pending_op_list_tail), op);
             }
@@ -636,10 +638,19 @@ static inline int adjust_op_piggybacked_with_lock(MPID_Win * win_ptr,
             /* We need to re-transmit this operation, so we destroy
              * the internal request and erase all flags in current
              * operation. */
-            if (op->request) {
-                MPIDI_CH3_Request_destroy(op->request);
-                op->request = NULL;
-                win_ptr->active_req_cnt--;
+            if (op->reqs_size > 0) {
+                MPIU_Assert(op->reqs != NULL);
+                for (i = 0; i < op->reqs_size; i++) {
+                    if (op->reqs[i] != NULL) {
+                        MPIDI_CH3_Request_destroy(op->reqs[i]);
+                        op->reqs[i] = NULL;
+                        win_ptr->active_req_cnt--;
+                    }
+                }
+                /* free req array in this op */
+                MPIU_Free(op->reqs);
+                op->reqs = NULL;
+                op->reqs_size = 0;
             }
             MPIDI_CH3_PKT_RMA_ERASE_FLAGS(op->pkt, mpi_errno);
 
diff --git a/src/mpid/ch3/src/ch3u_rma_oplist.c b/src/mpid/ch3/src/ch3u_rma_oplist.c
index 9716b10..64a361f 100644
--- a/src/mpid/ch3/src/ch3u_rma_oplist.c
+++ b/src/mpid/ch3/src/ch3u_rma_oplist.c
@@ -393,7 +393,8 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
                 MPIU_ERR_POP(mpi_errno);
         }
 
-        if (!curr_op->request) {
+        if (curr_op->reqs_size == 0) {
+            MPIU_Assert(curr_op->reqs == NULL);
             /* Sending is completed immediately. */
             MPIDI_CH3I_RMA_Ops_free_elem(win_ptr, &(target->pending_op_list),
                                          &(target->pending_op_list_tail), curr_op);
@@ -560,12 +561,24 @@ int MPIDI_CH3I_RMA_Free_ops_before_completion(MPID_Win * win_ptr)
 
     /* free all ops in the list since we do not need to maintain them anymore */
     for (curr_op = *op_list; curr_op != NULL;) {
-        MPID_Request_release(curr_op->request);
+        if (curr_op->reqs_size > 0) {
+            MPIU_Assert(curr_op->reqs != NULL);
+            for (i = 0; i < curr_op->reqs_size; i++) {
+                if (curr_op->reqs[i] != NULL) {
+                    MPID_Request_release(curr_op->reqs[i]);
+                    curr_op->reqs[i] = NULL;
+                    win_ptr->active_req_cnt--;
+                }
+            }
+
+            /* free req array in this op */
+            MPIU_Free(curr_op->reqs);
+            curr_op->reqs = NULL;
+            curr_op->reqs_size = 0;
+        }
         MPL_LL_DELETE(*op_list, *op_list_tail, curr_op);
         MPIDI_CH3I_Win_op_free(win_ptr, curr_op);
 
-        win_ptr->active_req_cnt--;
-
         if (*op_list == NULL) {
             if (read_flag == 1) {
                 op_list = &curr_target->write_op_list;

http://git.mpich.org/mpich.git/commitdiff/1a3e661f611a69cc59ea45435abfca79a2b59029

commit 1a3e661f611a69cc59ea45435abfca79a2b59029
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:14:46 2015 -0800

    Increment active_req_cnt when issuing the packet.
    
    Increment active_req_cnt when actually issuing the packet
    instead of when issuing the operation, since we may cut one
    operation into multiple packets.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index ac82194..32caade 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -375,6 +375,9 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
             MPIU_ERR_POP(mpi_errno);
     }
 
+    if (rma_op->request != NULL)
+        win_ptr->active_req_cnt++;
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_PUT_OP);
     return mpi_errno;
@@ -420,6 +423,9 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
             MPIU_ERR_POP(mpi_errno);
     }
 
+    if (rma_op->request != NULL)
+        win_ptr->active_req_cnt++;
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_ACC_OP);
     return mpi_errno;
@@ -519,6 +525,9 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
     /* For error checking */
     resp_req = NULL;
 
+    if (rma_op->request != NULL)
+        win_ptr->active_req_cnt++;
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_GET_ACC_OP);
     return mpi_errno;
@@ -624,6 +633,9 @@ static int issue_get_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         MPID_Request_release(req);
     }
 
+    if (rma_op->request != NULL)
+        win_ptr->active_req_cnt++;
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_GET_OP);
     return mpi_errno;
@@ -680,6 +692,9 @@ static int issue_cas_op(MPIDI_RMA_Op_t * rma_op,
         MPID_Request_release(rmw_req);
     }
 
+    if (rma_op->request != NULL)
+        win_ptr->active_req_cnt++;
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_CAS_OP);
     return mpi_errno;
@@ -776,6 +791,9 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
     /* For error checking */
     resp_req = NULL;
 
+    if (rma_op->request != NULL)
+        win_ptr->active_req_cnt++;
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_FOP_OP);
     return mpi_errno;
diff --git a/src/mpid/ch3/src/ch3u_rma_oplist.c b/src/mpid/ch3/src/ch3u_rma_oplist.c
index 219aca3..9716b10 100644
--- a/src/mpid/ch3/src/ch3u_rma_oplist.c
+++ b/src/mpid/ch3/src/ch3u_rma_oplist.c
@@ -362,9 +362,6 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
 
         (*made_progress) = 1;
 
-        if (curr_op->request != NULL)
-            win_ptr->active_req_cnt++;
-
         if (curr_op->pkt.type == MPIDI_CH3_PKT_PUT ||
             curr_op->pkt.type == MPIDI_CH3_PKT_PUT_IMMED ||
             curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||

http://git.mpich.org/mpich.git/commitdiff/6c81f6cd6f2a8d5a53fae581a99daaa8824678df

commit 6c81f6cd6f2a8d5a53fae581a99daaa8824678df
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:13:55 2015 -0800

    Return request pointer from issue_from_origin_buffer function.
    
    In the original implementation, issue_from_origin_buffer
    is used to issue out one RMA packet. Since each RMA operation
    only has one packet, it just attaches the returned request
    pointer to the RMA operation structure. Now since we are going
    to cut one RMA operation into multiple stream packets,
    this function will be used to issue each streamed packet,
    and each RMA operation may have multiple requests. Therefore,
    we make this function return the request pointer and let
    the caller store the request in the request array of the op
    structure.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index cb894c9..ac82194 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -183,13 +183,15 @@ static int create_datatype(const MPIDI_RMA_dtype_info * dtype_info,
 #define FUNCNAME issue_from_origin_buffer
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc)
+static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc,
+                                    MPID_Request ** req_ptr)
 {
     MPI_Aint origin_type_size;
     MPI_Datatype target_datatype;
     MPID_Datatype *target_dtp = NULL, *origin_dtp = NULL;
     int is_origin_contig;
     MPID_IOV iov[MPID_IOV_LIMIT];
+    MPID_Request *req = NULL;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_FROM_ORIGIN_BUFFER);
 
@@ -230,49 +232,48 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc)
             iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
 
             MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-            mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, iovcnt, &rma_op->request);
+            mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, iovcnt, &req);
             MPIU_THREAD_CS_EXIT(CH3COMM, vc);
             MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
 
             if (origin_dtp != NULL) {
-                if (rma_op->request == NULL) {
+                if (req == NULL) {
                     MPID_Datatype_release(origin_dtp);
                 }
                 else {
                     /* this will cause the datatype to be freed when the request
                      * is freed. */
-                    rma_op->request->dev.datatype_ptr = origin_dtp;
+                    req->dev.datatype_ptr = origin_dtp;
                 }
             }
         }
         else {
             /* derived datatype on origin */
-            rma_op->request = MPID_Request_create();
-            MPIU_ERR_CHKANDJUMP(rma_op->request == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+            req = MPID_Request_create();
+            MPIU_ERR_CHKANDJUMP(req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
 
-            MPIU_Object_set_ref(rma_op->request, 2);
-            rma_op->request->kind = MPID_REQUEST_SEND;
+            MPIU_Object_set_ref(req, 2);
+            req->kind = MPID_REQUEST_SEND;
 
-            rma_op->request->dev.segment_ptr = MPID_Segment_alloc();
-            MPIU_ERR_CHKANDJUMP1(rma_op->request->dev.segment_ptr == NULL, mpi_errno,
+            req->dev.segment_ptr = MPID_Segment_alloc();
+            MPIU_ERR_CHKANDJUMP1(req->dev.segment_ptr == NULL, mpi_errno,
                                  MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
 
             if (origin_dtp != NULL) {
-                rma_op->request->dev.datatype_ptr = origin_dtp;
+                req->dev.datatype_ptr = origin_dtp;
                 /* this will cause the datatype to be freed when the request
                  * is freed. */
             }
             MPID_Segment_init(rma_op->origin_addr, rma_op->origin_count,
-                              rma_op->origin_datatype, rma_op->request->dev.segment_ptr, 0);
-            rma_op->request->dev.segment_first = 0;
-            rma_op->request->dev.segment_size = rma_op->origin_count * origin_type_size;
+                              rma_op->origin_datatype, req->dev.segment_ptr, 0);
+            req->dev.segment_first = 0;
+            req->dev.segment_size = rma_op->origin_count * origin_type_size;
 
-            rma_op->request->dev.OnFinal = 0;
-            rma_op->request->dev.OnDataAvail = 0;
+            req->dev.OnFinal = 0;
+            req->dev.OnDataAvail = 0;
 
             MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-            mpi_errno = vc->sendNoncontig_fn(vc, rma_op->request,
-                                             iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
+            mpi_errno = vc->sendNoncontig_fn(vc, req, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
             MPIU_THREAD_CS_EXIT(CH3COMM, vc);
             MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
         }
@@ -281,16 +282,16 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc)
         /* derived datatype on target */
         MPID_Datatype *combined_dtp = NULL;
 
-        rma_op->request = MPID_Request_create();
-        if (rma_op->request == NULL) {
+        req = MPID_Request_create();
+        if (req == NULL) {
             MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
         }
 
-        MPIU_Object_set_ref(rma_op->request, 2);
-        rma_op->request->kind = MPID_REQUEST_SEND;
+        MPIU_Object_set_ref(req, 2);
+        req->kind = MPID_REQUEST_SEND;
 
-        rma_op->request->dev.segment_ptr = MPID_Segment_alloc();
-        MPIU_ERR_CHKANDJUMP1(rma_op->request->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER,
+        req->dev.segment_ptr = MPID_Segment_alloc();
+        MPIU_ERR_CHKANDJUMP1(req->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER,
                              "**nomem", "**nomem %s", "MPID_Segment_alloc");
 
         /* create a new datatype containing the dtype_info, dataloop, and origin data */
@@ -302,19 +303,18 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc)
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
 
-        rma_op->request->dev.datatype_ptr = combined_dtp;
+        req->dev.datatype_ptr = combined_dtp;
         /* combined_datatype will be freed when request is freed */
 
-        MPID_Segment_init(MPI_BOTTOM, 1, combined_dtp->handle, rma_op->request->dev.segment_ptr, 0);
-        rma_op->request->dev.segment_first = 0;
-        rma_op->request->dev.segment_size = combined_dtp->size;
+        MPID_Segment_init(MPI_BOTTOM, 1, combined_dtp->handle, req->dev.segment_ptr, 0);
+        req->dev.segment_first = 0;
+        req->dev.segment_size = combined_dtp->size;
 
-        rma_op->request->dev.OnFinal = 0;
-        rma_op->request->dev.OnDataAvail = 0;
+        req->dev.OnFinal = 0;
+        req->dev.OnDataAvail = 0;
 
         MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno = vc->sendNoncontig_fn(vc, rma_op->request,
-                                         iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
+        mpi_errno = vc->sendNoncontig_fn(vc, req, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
         MPIU_THREAD_CS_EXIT(CH3COMM, vc);
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
 
@@ -324,16 +324,18 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc)
         MPID_Datatype_release(target_dtp);
     }
 
+    (*req_ptr) = req;
+
   fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_ISSUE_FROM_ORIGIN_BUFFER);
     return mpi_errno;
   fn_fail:
-    if (rma_op->request) {
-        if (rma_op->request->dev.datatype_ptr)
-            MPID_Datatype_release(rma_op->request->dev.datatype_ptr);
-        MPID_Request_release(rma_op->request);
+    if ((*req_ptr)) {
+        if ((*req_ptr)->dev.datatype_ptr)
+            MPID_Datatype_release((*req_ptr)->dev.datatype_ptr);
+        MPID_Request_release((*req_ptr));
     }
-    rma_op->request = NULL;
+    (*req_ptr) = NULL;
     goto fn_exit;
 }
 
@@ -368,7 +370,7 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
-        mpi_errno = issue_from_origin_buffer(rma_op, vc);
+        mpi_errno = issue_from_origin_buffer(rma_op, vc, &(rma_op->request));
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -413,7 +415,7 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
-        mpi_errno = issue_from_origin_buffer(rma_op, vc);
+        mpi_errno = issue_from_origin_buffer(rma_op, vc, &(rma_op->request));
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -483,7 +485,7 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
-        mpi_errno = issue_from_origin_buffer(rma_op, vc);
+        mpi_errno = issue_from_origin_buffer(rma_op, vc, &(rma_op->request));
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -741,7 +743,7 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
-        mpi_errno = issue_from_origin_buffer(rma_op, vc);
+        mpi_errno = issue_from_origin_buffer(rma_op, vc, &(rma_op->request));
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }

http://git.mpich.org/mpich.git/commitdiff/9fa6582a0887a3242f9cee494dd95949e71abc90

commit 9fa6582a0887a3242f9cee494dd95949e71abc90
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:12:40 2015 -0800

    Code refactoring: set ureq to NULL when creating op.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 5085978..d0105d8 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -48,6 +48,7 @@ static inline MPIDI_RMA_Op_t *MPIDI_CH3I_Win_op_alloc(MPID_Win * win_ptr)
 
     e->dataloop = NULL;
     e->request = NULL;
+    e->ureq = NULL;
     e->is_dt = 0;
     e->piggyback_lock_candidate = 0;
 
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index d723cc1..bbf3d80 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -140,7 +140,6 @@ int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
         new_ptr->origin_count = origin_count;
         new_ptr->origin_datatype = origin_datatype;
         new_ptr->target_rank = target_rank;
-        new_ptr->ureq = NULL;   /* reset user request */
 
         /* Remember user request */
         if (ureq) {
@@ -338,7 +337,6 @@ int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
         new_ptr->origin_count = origin_count;
         new_ptr->origin_datatype = origin_datatype;
         new_ptr->target_rank = target_rank;
-        new_ptr->ureq = NULL;   /* reset user request */
 
         /* Remember user request */
         if (ureq) {
@@ -524,7 +522,6 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
         new_ptr->origin_count = origin_count;
         new_ptr->origin_datatype = origin_datatype;
         new_ptr->target_rank = target_rank;
-        new_ptr->ureq = NULL;   /* reset user request */
 
         /* Remember user request */
         if (ureq) {
@@ -732,7 +729,6 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             new_ptr->origin_count = result_count;
             new_ptr->origin_datatype = result_datatype;
             new_ptr->target_rank = target_rank;
-            new_ptr->ureq = NULL;       /* reset user request */
 
             /* Remember user request */
             if (ureq) {
@@ -804,7 +800,6 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             new_ptr->result_count = result_count;
             new_ptr->result_datatype = result_datatype;
             new_ptr->target_rank = target_rank;
-            new_ptr->ureq = NULL;       /* reset user request */
 
             /* Remember user request */
             if (ureq) {
@@ -1114,7 +1109,6 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
         new_ptr->compare_datatype = datatype;
         new_ptr->target_rank = target_rank;
         new_ptr->piggyback_lock_candidate = 1;  /* CAS is always able to piggyback LOCK */
-        new_ptr->ureq = NULL;   /* reset user request */
 
         /************** Setting packet struct areas in operation ****************/
 
@@ -1261,7 +1255,6 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             new_ptr->origin_datatype = datatype;
             new_ptr->target_rank = target_rank;
             new_ptr->piggyback_lock_candidate = 1;
-            new_ptr->ureq = NULL;       /* reset user request */
 
             /************** Setting packet struct areas in operation ****************/
 
@@ -1307,7 +1300,6 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             new_ptr->result_datatype = datatype;
             new_ptr->target_rank = target_rank;
             new_ptr->piggyback_lock_candidate = 1;
-            new_ptr->ureq = NULL;       /* reset user request */
 
             /************** Setting packet struct areas in operation ****************/
 

http://git.mpich.org/mpich.git/commitdiff/a36fd9dd81c4fae3f4aaf1b8f8c897131381ec24

commit a36fd9dd81c4fae3f4aaf1b8f8c897131381ec24
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:11:31 2015 -0800

    Code refactoring: setting ureq after issuing op in a function.
    
    After (1) issuing an op (no LOCK flag), or (2) issuing an op
    (with LOCK flag) and receiving an ACK that LOCK is granted or
    queued, we should set the user request (ureq) to be completed.
    This patch wraps up the work of setting ureq into a function,
    and calls that function after (1) or (2) happens.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index 62c7868..cb894c9 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -840,4 +840,65 @@ static inline int issue_rma_op(MPIDI_RMA_Op_t * op_ptr, MPID_Win * win_ptr,
     /* --END ERROR HANDLING-- */
 }
 
+#undef FUNCNAME
+#define FUNCNAME set_user_req_after_issuing_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int set_user_req_after_issuing_op(MPIDI_RMA_Op_t * op)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_SET_USER_REQ_AFTER_ISSUING_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_SET_USER_REQ_AFTER_ISSUING_OP);
+
+    if (op->ureq == NULL)
+        goto fn_exit;
+
+    if (!op->request) {
+        /* Sending is completed immediately, complete user request
+         * and release ch3 ref. */
+
+        /* Complete user request and release the ch3 ref */
+        MPID_Request_set_completed(op->ureq);
+        MPID_Request_release(op->ureq);
+    }
+    else {
+        /* Sending is not completed immediately. */
+
+        /* Setup user request info in order to be completed following send request. */
+
+        /* Increase ref for completion handler */
+        MPIU_Object_add_ref(op->ureq);
+        op->request->dev.request_handle = op->ureq->handle;
+
+        /* Setup user request completion handler.
+         *
+         * The handler is triggered when send request is completed at
+         * following places:
+         * - progress engine: complete PUT/ACC req.
+         * - GET/GET_ACC packet handler: complete GET/GET_ACC reqs.
+         *
+         * We always set OnFinal which should be called when sending or
+         * receiving the last segment. However, short put/acc ops are
+         * issued in one packet and the lower layer only check OnDataAvail
+         * so we have to set OnDataAvail as well.
+         *
+         * Note that a noncontig send also uses OnDataAvail to loop all
+         * segments but it must be changed to OnFinal when sending the
+         * last segment, so it is also correct for us.
+         *
+         * TODO: implement stack for overriding functions*/
+        if (op->request->dev.OnDataAvail == NULL) {
+            op->request->dev.OnDataAvail = MPIDI_CH3_ReqHandler_ReqOpsComplete;
+        }
+        op->request->dev.OnFinal = MPIDI_CH3_ReqHandler_ReqOpsComplete;
+    }
+
+  fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_SET_USER_REQ_AFTER_ISSUING_OP);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
 #endif /* MPID_RMA_ISSUE_H_INCLUDED */
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index d5e9343..cf75f99 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -601,13 +601,13 @@ static inline int adjust_op_piggybacked_with_lock(MPID_Win * win_ptr,
         if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED ||
             flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_QUEUED_DATA_QUEUED) {
 
-            if (!op->request) {
-                if (op->ureq) {
-                    /* Complete user request and release the ch3 ref */
-                    MPID_Request_set_completed(op->ureq);
-                    MPID_Request_release(op->ureq);
-                }
+            if (op->ureq != NULL) {
+                mpi_errno = set_user_req_after_issuing_op(op);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
+            }
 
+            if (!op->request) {
                 MPIDI_CH3I_RMA_Ops_free_elem(win_ptr, &(target->pending_op_list),
                                              &(target->pending_op_list_tail), op);
             }
@@ -629,23 +629,6 @@ static inline int adjust_op_piggybacked_with_lock(MPID_Win * win_ptr,
                     MPIDI_CH3I_RMA_Ops_append(&(target->read_op_list),
                                               &(target->read_op_list_tail), op);
                 }
-
-                if (op->ureq) {
-                    if (MPID_Request_is_complete(op->request)) {
-                        /* Complete user request, let cleanup function to release
-                         * ch3 ref */
-                        MPID_Request_set_completed(op->ureq);
-                    }
-                    else {
-                        /* Increase ref for completion handler */
-                        MPIU_Object_add_ref(op->ureq);
-                        op->request->dev.request_handle = op->ureq->handle;
-                        if (op->request->dev.OnDataAvail == NULL) {
-                            op->request->dev.OnDataAvail = MPIDI_CH3_ReqHandler_ReqOpsComplete;
-                        }
-                        op->request->dev.OnFinal = MPIDI_CH3_ReqHandler_ReqOpsComplete;
-                    }
-                }
             }
         }
         else if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_QUEUED_DATA_DISCARDED ||
diff --git a/src/mpid/ch3/src/ch3u_rma_oplist.c b/src/mpid/ch3/src/ch3u_rma_oplist.c
index 0c0f9a0..219aca3 100644
--- a/src/mpid/ch3/src/ch3u_rma_oplist.c
+++ b/src/mpid/ch3/src/ch3u_rma_oplist.c
@@ -390,13 +390,13 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
             break;
         }
 
-        if (!curr_op->request) {
-            if (curr_op->ureq) {
-                /* Complete user request and release the ch3 ref */
-                MPID_Request_set_completed(curr_op->ureq);
-                MPID_Request_release(curr_op->ureq);
-            }
+        if (curr_op->ureq != NULL) {
+            mpi_errno = set_user_req_after_issuing_op(curr_op);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
 
+        if (!curr_op->request) {
             /* Sending is completed immediately. */
             MPIDI_CH3I_RMA_Ops_free_elem(win_ptr, &(target->pending_op_list),
                                          &(target->pending_op_list_tail), curr_op);
@@ -421,35 +421,6 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
                 MPIDI_CH3I_RMA_Ops_append(&(target->read_op_list),
                                           &(target->read_op_list_tail), curr_op);
             }
-
-            /* Setup user request info in order to be completed following send request. */
-            if (curr_op->ureq) {
-                /* Increase ref for completion handler */
-                MPIU_Object_add_ref(curr_op->ureq);
-                curr_op->request->dev.request_handle = curr_op->ureq->handle;
-
-                /* Setup user request completion handler.
-                 *
-                 * The handler is triggered when send request is completed at
-                 * following places:
-                 * - progress engine: complete PUT/ACC req.
-                 * - GET/GET_ACC packet handler: complete GET/GET_ACC reqs.
-                 *
-                 * We always set OnFinal which should be called when sending or
-                 * receiving the last segment. However, short put/acc ops are
-                 * issued in one packet and the lower layer only check OnDataAvail
-                 * so we have to set OnDataAvail as well.
-                 *
-                 * Note that a noncontig send also uses OnDataAvail to loop all
-                 * segments but it must be changed to OnFinal when sending the
-                 * last segment, so it is also correct for us.
-                 *
-                 * TODO: implement stack for overriding functions*/
-                if (curr_op->request->dev.OnDataAvail == NULL) {
-                    curr_op->request->dev.OnDataAvail = MPIDI_CH3_ReqHandler_ReqOpsComplete;
-                }
-                curr_op->request->dev.OnFinal = MPIDI_CH3_ReqHandler_ReqOpsComplete;
-            }
         }
 
         curr_op = target->next_op_to_issue;

http://git.mpich.org/mpich.git/commitdiff/fd92b7bc82dbb1bbfc06a8f43d882067a37f6796

commit fd92b7bc82dbb1bbfc06a8f43d882067a37f6796
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:09:29 2015 -0800

    Modify location of setting next_op_to_issue and sync_flag to NONE
    
    After we issue an op, we set next_op_to_issue to the next op,
    and if the next op is NULL, we set sync_flag to NONE. When we receive
    the lock ACK saying that the lock request is discarded, we set
    next_op_to_issue back to the current op and reset the sync_flag
    from NONE to the corresponding flag, since we need to re-transmit
    the current op.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index fc0b10b..d5e9343 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -600,6 +600,7 @@ static inline int adjust_op_piggybacked_with_lock(MPID_Win * win_ptr,
         op_flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
         if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED ||
             flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_QUEUED_DATA_QUEUED) {
+
             if (!op->request) {
                 if (op->ureq) {
                     /* Complete user request and release the ch3 ref */
@@ -660,6 +661,10 @@ static inline int adjust_op_piggybacked_with_lock(MPID_Win * win_ptr,
             MPIDI_CH3_PKT_RMA_ERASE_FLAGS(op->pkt, mpi_errno);
 
             target->next_op_to_issue = op;
+            if (op_flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH)
+                target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
+            else if (op_flags & MPIDI_RMA_SYNC_UNLOCK)
+                target->sync.sync_flag = MPIDI_RMA_SYNC_UNLOCK;
         }
     }
 
diff --git a/src/mpid/ch3/src/ch3u_rma_oplist.c b/src/mpid/ch3/src/ch3u_rma_oplist.c
index 96deee8..0c0f9a0 100644
--- a/src/mpid/ch3/src/ch3u_rma_oplist.c
+++ b/src/mpid/ch3/src/ch3u_rma_oplist.c
@@ -354,13 +354,8 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
             else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
                 flags |= MPIDI_CH3_PKT_FLAG_RMA_UNLOCK;
             }
-
-            /* We are done with ending sync, unset target's sync_flag. */
-            target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;
         }
 
-        target->next_op_to_issue = curr_op->next;
-
         mpi_errno = issue_rma_op(curr_op, win_ptr, target, flags);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
@@ -378,6 +373,14 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * targ
                                          * PUT/ACC operation. */
         }
 
+        target->next_op_to_issue = curr_op->next;
+        if (target->next_op_to_issue == NULL) {
+            if (flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH || flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
+                /* We are done with ending sync, unset target's sync_flag. */
+                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;
+            }
+        }
+
         if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
             flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
             /* If this operation is piggybacked with LOCK,

http://git.mpich.org/mpich.git/commitdiff/a3af53c3bc848159c5e450419516746279322827

commit a3af53c3bc848159c5e450419516746279322827
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:08:20 2015 -0800

    Code-refactoring: make perform_get_acc_in_lock_queue cleaner.
    
    This patch does not change any functionality but just makes the
    code structure cleaner.
    
    The original code structure of perform_get_acc_in_lock_queue is
    a mess since the code dealing with the IMMED packet type and the
    code dealing with the normal packet type are mixed together.
    This patch separates these two parts and makes the function look
    cleaner.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 32d8f40..0dd1db2 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -1122,22 +1122,26 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
 
     /* Copy data into a temporary buffer */
     MPID_Datatype_get_size_macro(get_accum_pkt->datatype, type_size);
-    if (get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM)
-        sreq->dev.user_buf = (void *) MPIU_Malloc(get_accum_pkt->count * type_size);
-    else {
-        MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED);
-    }
-
-    MPID_Datatype_is_contig(get_accum_pkt->datatype, &is_contig);
 
     /* length of target data */
     MPIU_Assign_trunc(len, get_accum_pkt->count * type_size, size_t);
 
-    /* Perform ACCUMULATE OP */
-    if (win_ptr->shm_allocated == TRUE)
-        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-
     if (get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM_IMMED) {
+        MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED);
+        get_accum_resp_pkt->request_handle = get_accum_pkt->request_handle;
+        get_accum_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
+        if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
+            get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE)
+            get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
+        if ((get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
+            (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
+            get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+        get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
+
+        /* Perform ACCUMULATE OP */
+        if (win_ptr->shm_allocated == TRUE)
+            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
+
         void *src = (void *) (get_accum_pkt->addr), *dest =
             (void *) (get_accum_resp_pkt->info.data);
         mpi_errno = immed_copy(src, dest, len);
@@ -1146,8 +1150,47 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
                 MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
             MPIU_ERR_POP(mpi_errno);
         }
+
+        /* All data fits in packet header */
+        mpi_errno = do_accumulate_op(get_accum_pkt->info.data, get_accum_pkt->addr,
+                                     get_accum_pkt->count, get_accum_pkt->datatype,
+                                     get_accum_pkt->op);
+
+        if (win_ptr->shm_allocated == TRUE)
+            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+
+        /* here we increment the Active Target counter to guarantee the GET-like
+         * operation are completed when counter reaches zero. */
+        win_ptr->at_completion_counter++;
+
+        /* All origin data is in packet header, issue the header. */
+        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
+        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
+        iovcnt = 1;
+
+        mpi_errno = MPIDI_CH3_iSendv(lock_entry->vc, sreq, iov, iovcnt);
+        if (mpi_errno != MPI_SUCCESS) {
+            MPID_Request_release(sreq);
+            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+        }
+
+        goto fn_exit;
     }
-    else if (is_contig) {
+
+    MPIU_Assert(get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM);
+
+    sreq->dev.user_buf = (void *) MPIU_Malloc(get_accum_pkt->count * type_size);
+
+    MPID_Datatype_is_contig(get_accum_pkt->datatype, &is_contig);
+
+    /* Perform ACCUMULATE OP */
+    if (win_ptr->shm_allocated == TRUE)
+        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
+
+    if (is_contig) {
         MPIU_Memcpy(sreq->dev.user_buf, get_accum_pkt->addr, get_accum_pkt->count * type_size);
     }
     else {
@@ -1166,19 +1209,8 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
         MPID_Segment_free(seg);
     }
 
-    if (get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM_IMMED) {
-        /* All data fits in packet header */
-        mpi_errno = do_accumulate_op(get_accum_pkt->info.data, get_accum_pkt->addr,
-                                     get_accum_pkt->count, get_accum_pkt->datatype,
-                                     get_accum_pkt->op);
-    }
-    else {
-        MPIU_Assert(get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM);
-
-        mpi_errno = do_accumulate_op(lock_entry->data, get_accum_pkt->addr,
-                                     get_accum_pkt->count, get_accum_pkt->datatype,
-                                     get_accum_pkt->op);
-    }
+    mpi_errno = do_accumulate_op(lock_entry->data, get_accum_pkt->addr,
+                                 get_accum_pkt->count, get_accum_pkt->datatype, get_accum_pkt->op);
 
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
@@ -1190,9 +1222,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
      * operation are completed when counter reaches zero. */
     win_ptr->at_completion_counter++;
 
-    if (get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM) {
-        MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
-    }
+    MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
     get_accum_resp_pkt->request_handle = get_accum_pkt->request_handle;
     get_accum_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
     if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
@@ -1203,19 +1233,11 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
         get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
     get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
 
-    if (get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM_IMMED) {
-        /* All origin data is in packet header, issue the header. */
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
-        iovcnt = 1;
-    }
-    else {
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) sreq->dev.user_buf);
-        iov[1].MPID_IOV_LEN = get_accum_pkt->count * type_size;
-        iovcnt = 2;
-    }
+    iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
+    iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
+    iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) sreq->dev.user_buf);
+    iov[1].MPID_IOV_LEN = get_accum_pkt->count * type_size;
+    iovcnt = 2;
 
     mpi_errno = MPIDI_CH3_iSendv(lock_entry->vc, sreq, iov, iovcnt);
     if (mpi_errno != MPI_SUCCESS) {

http://git.mpich.org/mpich.git/commitdiff/45cdb28256eaf9dfd561c8088e4e87dbd9c7ec16

commit 45cdb28256eaf9dfd561c8088e4e87dbd9c7ec16
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:07:21 2015 -0800

    Change name from data_size to buf_size.
    
    When the lock is not satisfied, we queue up
    the lock request and op data in a lock entry
    queue. In the lock entry struct, we use 'data_size'
    to remember the size of the buffer for storing the
    data. Since the size of the buffer is not type_size*count
    but might be type_extent*count, here we change
    its name from 'data_size' to 'buf_size'.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_lockqueue.h b/src/mpid/ch3/include/mpid_rma_lockqueue.h
index dcc2d62..3fab2e4 100644
--- a/src/mpid/ch3/include/mpid_rma_lockqueue.h
+++ b/src/mpid/ch3/include/mpid_rma_lockqueue.h
@@ -34,7 +34,7 @@ static inline MPIDI_RMA_Lock_entry_t *MPIDI_CH3I_Win_lock_entry_alloc(MPID_Win *
         MPIU_Memcpy(&(new_ptr->pkt), pkt, sizeof(*pkt));
         new_ptr->vc = NULL;
         new_ptr->data = NULL;
-        new_ptr->data_size = 0;
+        new_ptr->buf_size = 0;
         new_ptr->all_data_recved = 0;
     }
 
@@ -53,7 +53,7 @@ static inline int MPIDI_CH3I_Win_lock_entry_free(MPID_Win * win_ptr,
     int mpi_errno = MPI_SUCCESS;
 
     if (lock_entry->data != NULL) {
-        win_ptr->current_lock_data_bytes -= lock_entry->data_size;
+        win_ptr->current_lock_data_bytes -= lock_entry->buf_size;
         MPIU_Free(lock_entry->data);
     }
 
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index 73772e4..f36a957 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -133,7 +133,7 @@ typedef struct MPIDI_RMA_Lock_entry {
     MPIDI_CH3_Pkt_t pkt;        /* all information for this request packet */
     MPIDI_VC_t *vc;
     void *data;                 /* for queued PUTs / ACCs / GACCs, data is copied here */
-    int data_size;
+    int buf_size;
     int all_data_recved;        /* indicate if all data has been received */
 } MPIDI_RMA_Lock_entry_t;
 
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index f42cd64..fc0b10b 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -418,7 +418,7 @@ static inline int enqueue_lock_origin(MPID_Win * win_ptr, MPIDI_VC_t * vc,
             }
             else {
                 win_ptr->current_lock_data_bytes += buf_size;
-                new_ptr->data_size = buf_size;
+                new_ptr->buf_size = buf_size;
             }
         }
 

http://git.mpich.org/mpich.git/commitdiff/ce8bc3105907988579ed548cd02c306a9dd345e7

commit ce8bc3105907988579ed548cd02c306a9dd345e7
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Mar 3 19:10:40 2015 -0600

    Bug-fix: make RMA work correctly with pair basic types.
    
    The original implementation of RMA does not consider pair basic
    types (e.g. MPI_FLOAT_INT, MPI_DOUBLE_INT). It only
    works correctly with builtin datatypes (e.g. MPI_INT, MPI_FLOAT).
    This patch makes the RMA work correctly with pair basic types.
    
    The bug is twofold: (1) when performing the ACC computation, the original
    implementation uses 'eltype' in the datatype structure, which is set
    when all basic elements in this datatype have the same builtin
    datatype. When basic elements have different builtin datatypes, as in
    pair datatypes, 'eltype' is set to MPI_DATATYPE_NULL. This makes
    the ACC computation unable to work with pair types; (2) for all
    data of basic type, the original implementation assumes that
    the data is contiguous and issues it in an unpacked manner
    with a length equal to the data size (count*type_size). This is
    incorrect for pair datatypes, because most pair datatypes are
    non-contiguous (type_extent != type_size).
    
    In the previous patch, we already made 'eltype' store the basic
    type instead of the builtin type. In this patch, we fix this
    bug by (1) modifying the ACC computation to treat 'eltype' as a basic
    type; (2) using the noncontig API for non-contiguous basic type
    data, so that it is issued in a packed manner.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index 2ba4274..62c7868 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -188,6 +188,7 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc)
     MPI_Aint origin_type_size;
     MPI_Datatype target_datatype;
     MPID_Datatype *target_dtp = NULL, *origin_dtp = NULL;
+    int is_origin_contig;
     MPID_IOV iov[MPID_IOV_LIMIT];
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_FROM_ORIGIN_BUFFER);
@@ -214,13 +215,14 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc)
     }
 
     MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
+    MPID_Datatype_is_contig(rma_op->origin_datatype, &is_origin_contig);
 
     iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) & (rma_op->pkt);
     iov[0].MPID_IOV_LEN = sizeof(rma_op->pkt);
 
     if (target_dtp == NULL) {
         /* basic datatype on target */
-        if (origin_dtp == NULL) {
+        if (is_origin_contig) {
             /* basic datatype on origin */
             int iovcnt = 2;
 
@@ -231,6 +233,17 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc)
             mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, iovcnt, &rma_op->request);
             MPIU_THREAD_CS_EXIT(CH3COMM, vc);
             MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
+            if (origin_dtp != NULL) {
+                if (rma_op->request == NULL) {
+                    MPID_Datatype_release(origin_dtp);
+                }
+                else {
+                    /* this will cause the datatype to be freed when the request
+                     * is freed. */
+                    rma_op->request->dev.datatype_ptr = origin_dtp;
+                }
+            }
         }
         else {
             /* derived datatype on origin */
@@ -244,9 +257,11 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc)
             MPIU_ERR_CHKANDJUMP1(rma_op->request->dev.segment_ptr == NULL, mpi_errno,
                                  MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
 
-            rma_op->request->dev.datatype_ptr = origin_dtp;
-            /* this will cause the datatype to be freed when the request
-             * is freed. */
+            if (origin_dtp != NULL) {
+                rma_op->request->dev.datatype_ptr = origin_dtp;
+                /* this will cause the datatype to be freed when the request
+                 * is freed. */
+            }
             MPID_Segment_init(rma_op->origin_addr, rma_op->origin_count,
                               rma_op->origin_datatype, rma_op->request->dev.segment_ptr, 0);
             rma_op->request->dev.segment_first = 0;
diff --git a/src/mpid/ch3/include/mpid_rma_shm.h b/src/mpid/ch3/include/mpid_rma_shm.h
index 4355cc7..f1ef81f 100644
--- a/src/mpid/ch3/include/mpid_rma_shm.h
+++ b/src/mpid/ch3/include/mpid_rma_shm.h
@@ -355,10 +355,13 @@ static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_coun
         DLOOP_VECTOR *dloop_vec;
         MPI_Aint first, last;
         int vec_len, i, type_size, count;
+        MPI_Aint type_extent;
         MPI_Datatype type;
         MPI_Aint true_lb, true_extent, extent;
         void *tmp_buf = NULL, *target_buf;
         const void *source_buf;
+        MPI_Aint curr_len;
+        void *curr_loc;
 
         if (origin_datatype != target_datatype) {
             /* first copy the data into a temporary buffer with
@@ -411,16 +414,44 @@ static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_coun
             source_buf = (tmp_buf != NULL) ? (const void *) tmp_buf : origin_addr;
             target_buf = (char *) base + disp_unit * target_disp;
             type = dtp->eltype;
-            type_size = MPID_Datatype_get_basic_size(type);
+
+            MPIU_Assert(type != MPI_DATATYPE_NULL);
+
+            MPID_Datatype_get_size_macro(type, type_size);
+            MPID_Datatype_get_extent_macro(type, type_extent);
+
             if (shm_op)
                 MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-            for (i = 0; i < vec_len; i++) {
-                MPIU_Assign_trunc(count, (dloop_vec[i].DLOOP_VECTOR_LEN) / type_size, int);
 
-                (*uop) ((char *) source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
-                        (char *) target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
-                        &count, &type);
+            i = 0;
+            curr_loc = dloop_vec[0].DLOOP_VECTOR_BUF;
+            curr_len = dloop_vec[0].DLOOP_VECTOR_LEN;
+            while (i != vec_len) {
+                if (curr_len < type_size) {
+                    MPIU_Assert(i != vec_len);
+                    i++;
+                    curr_len += dloop_vec[i].DLOOP_VECTOR_LEN;
+                    continue;
+                }
+
+                MPIU_Assign_trunc(count, curr_len/type_size, int);
+                (*uop)((char *)source_buf + MPIU_PtrToAint(curr_loc),
+                       (char *)target_buf + MPIU_PtrToAint(curr_loc),
+                       &count, &type);
+
+                if (curr_len % type_size == 0) {
+                    i++;
+                    if (i != vec_len) {
+                        curr_loc = dloop_vec[i].DLOOP_VECTOR_BUF;
+                        curr_len = dloop_vec[i].DLOOP_VECTOR_LEN;
+                    }
+                }
+                else {
+                    curr_loc = (void *)((char *)curr_loc + type_extent * count);
+                    curr_len -= type_size * count;
+                }
             }
+
             if (shm_op)
                 MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
 
@@ -533,6 +564,9 @@ static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_
         MPI_Aint true_lb, true_extent, extent;
         void *tmp_buf = NULL, *target_buf;
         const void *source_buf;
+        MPI_Aint type_extent;
+        MPI_Aint curr_len;
+        void *curr_loc;
 
         if (origin_datatype != target_datatype) {
             /* first copy the data into a temporary buffer with
@@ -581,13 +615,39 @@ static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_
             source_buf = (tmp_buf != NULL) ? (const void *) tmp_buf : origin_addr;
             target_buf = (char *) base + disp_unit * target_disp;
             type = dtp->eltype;
-            type_size = MPID_Datatype_get_basic_size(type);
 
-            for (i = 0; i < vec_len; i++) {
-                MPIU_Assign_trunc(count, (dloop_vec[i].DLOOP_VECTOR_LEN) / type_size, int);
-                (*uop) ((char *) source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
-                        (char *) target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
-                        &count, &type);
+            MPIU_Assert(type != MPI_DATATYPE_NULL);
+
+            MPID_Datatype_get_size_macro(type, type_size);
+            MPID_Datatype_get_extent_macro(type, type_extent);
+
+            i = 0;
+            curr_loc = dloop_vec[0].DLOOP_VECTOR_BUF;
+            curr_len = dloop_vec[0].DLOOP_VECTOR_LEN;
+            while (i != vec_len) {
+                if (curr_len < type_size) {
+                    MPIU_Assert(i != vec_len);
+                    i++;
+                    curr_len += dloop_vec[i].DLOOP_VECTOR_LEN;
+                    continue;
+                }
+
+                MPIU_Assign_trunc(count, curr_len/type_size, int);
+                (*uop)((char *)source_buf + MPIU_PtrToAint(curr_loc),
+                       (char *)target_buf + MPIU_PtrToAint(curr_loc),
+                       &count, &type);
+
+                if (curr_len % type_size == 0) {
+                    i++;
+                    if (i != vec_len) {
+                        curr_loc = dloop_vec[i].DLOOP_VECTOR_BUF;
+                        curr_len = dloop_vec[i].DLOOP_VECTOR_LEN;
+                    }
+                }
+                else {
+                    curr_loc = (void *)((char *)curr_loc + type_extent * count);
+                    curr_len -= type_size * count;
+                }
             }
 
             MPID_Segment_free(segp);
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index e689751..f42cd64 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -355,7 +355,9 @@ static inline int enqueue_lock_origin(MPID_Win * win_ptr, MPIDI_VC_t * vc,
     }
     else {
         MPI_Aint type_size = 0;
+        MPI_Aint type_extent;
         MPIDI_msg_sz_t recv_data_sz = 0;
+        MPIDI_msg_sz_t buf_size;
         MPID_Request *req = NULL;
         MPI_Datatype target_dtp;
         int target_count;
@@ -368,18 +370,20 @@ static inline int enqueue_lock_origin(MPID_Win * win_ptr, MPIDI_VC_t * vc,
         MPIDI_CH3_PKT_RMA_GET_TARGET_DATATYPE((*pkt), target_dtp, mpi_errno);
         MPIDI_CH3_PKT_RMA_GET_TARGET_COUNT((*pkt), target_count, mpi_errno);
 
+        MPID_Datatype_get_extent_macro(target_dtp, type_extent);
         MPID_Datatype_get_size_macro(target_dtp, type_size);
         recv_data_sz = type_size * target_count;
+        buf_size = type_extent * target_count;
 
         if (new_ptr != NULL) {
-            if (win_ptr->current_lock_data_bytes + recv_data_sz < MPIR_CVAR_CH3_RMA_LOCK_DATA_BYTES) {
-                new_ptr->data = MPIU_Malloc(recv_data_sz);
+            if (win_ptr->current_lock_data_bytes + buf_size < MPIR_CVAR_CH3_RMA_LOCK_DATA_BYTES) {
+                new_ptr->data = MPIU_Malloc(buf_size);
             }
 
             if (new_ptr->data == NULL) {
                 /* Note that there are two possible reasons to make new_ptr->data to be NULL:
-                 * (1) win_ptr->current_lock_data_bytes + recv_data_sz >= MPIR_CVAR_CH3_RMA_LOCK_DATA_BYTES;
-                 * (2) MPIU_Malloc(recv_data_sz) failed.
+                 * (1) win_ptr->current_lock_data_bytes + buf_size >= MPIR_CVAR_CH3_RMA_LOCK_DATA_BYTES;
+                 * (2) MPIU_Malloc(buf_size) failed.
                  * In such cases, we cannot allocate memory for lock data, so we give up
                  * buffering lock data, however, we still buffer lock request.
                  */
@@ -413,8 +417,8 @@ static inline int enqueue_lock_origin(MPID_Win * win_ptr, MPIDI_VC_t * vc,
                 data_discarded = 1;
             }
             else {
-                win_ptr->current_lock_data_bytes += recv_data_sz;
-                new_ptr->data_size = recv_data_sz;
+                win_ptr->current_lock_data_bytes += buf_size;
+                new_ptr->data_size = buf_size;
             }
         }
 
@@ -795,9 +799,11 @@ static inline int do_accumulate_op(void *source_buf, void *target_buf,
         DLOOP_VECTOR *dloop_vec;
         MPI_Aint first, last;
         int vec_len, i, count;
-        MPI_Aint type_size;
+        MPI_Aint type_extent, type_size;
         MPI_Datatype type;
         MPID_Datatype *dtp;
+        MPI_Aint curr_len;
+        void *curr_loc;
 
         segp = MPID_Segment_alloc();
         /* --BEGIN ERROR HANDLING-- */
@@ -831,12 +837,37 @@ static inline int do_accumulate_op(void *source_buf, void *target_buf,
         MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);
 
         type = dtp->eltype;
+        MPIU_Assert(type != MPI_DATATYPE_NULL);
+
         MPID_Datatype_get_size_macro(type, type_size);
-        for (i = 0; i < vec_len; i++) {
-            MPIU_Assign_trunc(count, (dloop_vec[i].DLOOP_VECTOR_LEN) / type_size, int);
-            (*uop) ((char *) source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
-                    (char *) target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
-                    &count, &type);
+        MPID_Datatype_get_extent_macro(type, type_extent);
+
+        i = 0;
+        curr_loc = dloop_vec[0].DLOOP_VECTOR_BUF;
+        curr_len = dloop_vec[0].DLOOP_VECTOR_LEN;
+        while (i != vec_len) {
+            if (curr_len < type_size) {
+                MPIU_Assert(i != vec_len);
+                i++;
+                curr_len += dloop_vec[i].DLOOP_VECTOR_LEN;
+                continue;
+            }
+
+            MPIU_Assign_trunc(count, curr_len / type_size, int);
+            (*uop) ((char *) source_buf + MPIU_PtrToAint(curr_loc),
+                    (char *) target_buf + MPIU_PtrToAint(curr_loc), &count, &type);
+
+            if (curr_len % type_size == 0) {
+                i++;
+                if (i != vec_len) {
+                    curr_loc = dloop_vec[i].DLOOP_VECTOR_BUF;
+                    curr_len = dloop_vec[i].DLOOP_VECTOR_LEN;
+                }
+            }
+            else {
+                curr_loc = (void *) ((char *) curr_loc + type_extent * count);
+                curr_len -= type_size * count;
+            }
         }
 
         MPID_Segment_free(segp);
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 8ad48fd..32d8f40 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -219,6 +219,7 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
     MPID_IOV iov[MPID_IOV_LIMIT];
     MPI_Aint true_lb, true_extent;
     int iovcnt;
+    int is_contig;
     MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMRECVCOMPLETE);
 
@@ -239,6 +240,8 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
 
     MPID_Datatype_get_size_macro(rreq->dev.datatype, type_size);
 
+    MPID_Datatype_is_contig(rreq->dev.datatype, &is_contig);
+
     /* Copy data into a temporary buffer */
     resp_req = MPID_Request_create();
     MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
@@ -250,7 +253,7 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 
-    if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype)) {
+    if (is_contig) {
         MPIU_Memcpy(resp_req->dev.user_buf, rreq->dev.real_user_buf,
                     rreq->dev.user_count * type_size);
     }
@@ -340,6 +343,7 @@ int MPIDI_CH3_ReqHandler_FOPRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, i
     int iovcnt;
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_fop_resp_t *fop_resp_pkt = &upkt.fop_resp;
+    int is_contig;
     MPIU_CHKPMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_FOPRECVCOMPLETE);
 
@@ -349,6 +353,8 @@ int MPIDI_CH3_ReqHandler_FOPRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, i
 
     MPID_Datatype_get_size_macro(rreq->dev.datatype, type_size);
 
+    MPID_Datatype_is_contig(rreq->dev.datatype, &is_contig);
+
     /* Create response request */
     resp_req = MPID_Request_create();
     MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
@@ -368,7 +374,23 @@ int MPIDI_CH3_ReqHandler_FOPRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, i
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 
     /* Copy data into a temporary buffer in response request */
-    MPIU_Memcpy(resp_req->dev.user_buf, rreq->dev.real_user_buf, type_size);
+    if (is_contig) {
+        MPIU_Memcpy(resp_req->dev.user_buf, rreq->dev.real_user_buf, type_size);
+    }
+    else {
+        MPID_Segment *seg = MPID_Segment_alloc();
+        MPI_Aint last = type_size;
+
+        if (seg == NULL) {
+            if (win_ptr->shm_allocated == TRUE)
+                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+        }
+        MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
+                             "MPID_Segment");
+        MPID_Segment_init(rreq->dev.real_user_buf, 1, rreq->dev.datatype, seg, 0);
+        MPID_Segment_pack(seg, 0, &last, resp_req->dev.user_buf);
+        MPID_Segment_free(seg);
+    }
 
     /* Perform accumulate computation */
     if (rreq->dev.op != MPI_NO_OP) {
@@ -917,6 +939,7 @@ static inline int perform_get_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     size_t len;
     int iovcnt;
     MPID_IOV iov[MPID_IOV_LIMIT];
+    int is_contig;
     int mpi_errno = MPI_SUCCESS;
 
     /* Piggyback candidate should have basic datatype for target datatype. */
@@ -963,31 +986,54 @@ static inline int perform_get_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     MPID_Datatype_get_size_macro(get_pkt->datatype, type_size);
     MPIU_Assign_trunc(len, get_pkt->count * type_size, size_t);
 
+    MPID_Datatype_is_contig(get_pkt->datatype, &is_contig);
+
     if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
         void *src = (void *) (get_pkt->addr), *dest = (void *) (get_resp_pkt->info.data);
         mpi_errno = immed_copy(src, dest, len);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
-    }
 
-    if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
         /* All origin data is in packet header, issue the header. */
         iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
         iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
         iovcnt = 1;
+
+        mpi_errno = MPIDI_CH3_iSendv(lock_entry->vc, sreq, iov, iovcnt);
+        if (mpi_errno != MPI_SUCCESS) {
+            MPID_Request_release(sreq);
+            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+        }
     }
-    else {
+    else if (is_contig) {
         iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
         iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
         iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) (get_pkt->addr);
         iov[1].MPID_IOV_LEN = get_pkt->count * type_size;
         iovcnt = 2;
+
+        mpi_errno = MPIDI_CH3_iSendv(lock_entry->vc, sreq, iov, iovcnt);
+        if (mpi_errno != MPI_SUCCESS) {
+            MPID_Request_release(sreq);
+            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+        }
     }
+    else {
+        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
+        iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
 
-    mpi_errno = MPIDI_CH3_iSendv(lock_entry->vc, sreq, iov, iovcnt);
-    if (mpi_errno != MPI_SUCCESS) {
-        MPID_Request_release(sreq);
-        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+        sreq->dev.segment_ptr = MPID_Segment_alloc();
+        MPIU_ERR_CHKANDJUMP1(sreq->dev.segment_ptr == NULL, mpi_errno,
+                             MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
+
+        MPID_Segment_init(get_pkt->addr, get_pkt->count,
+                          get_pkt->datatype, sreq->dev.segment_ptr, 0);
+        sreq->dev.segment_first = 0;
+        sreq->dev.segment_size = get_pkt->count * type_size;
+
+        mpi_errno = lock_entry->vc->sendNoncontig_fn(lock_entry->vc, sreq,
+                                                     iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
+        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
 
   fn_exit:
@@ -1051,6 +1097,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
     size_t len;
     int iovcnt;
     MPID_IOV iov[MPID_IOV_LIMIT];
+    int is_contig;
     int mpi_errno = MPI_SUCCESS;
 
     /* Piggyback candidate should have basic datatype for target datatype. */
@@ -1081,6 +1128,8 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
         MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED);
     }
 
+    MPID_Datatype_is_contig(get_accum_pkt->datatype, &is_contig);
+
     /* length of target data */
     MPIU_Assign_trunc(len, get_accum_pkt->count * type_size, size_t);
 
@@ -1098,9 +1147,24 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
             MPIU_ERR_POP(mpi_errno);
         }
     }
-    else {
+    else if (is_contig) {
         MPIU_Memcpy(sreq->dev.user_buf, get_accum_pkt->addr, get_accum_pkt->count * type_size);
     }
+    else {
+        MPID_Segment *seg = MPID_Segment_alloc();
+        MPI_Aint last = type_size * get_accum_pkt->count;
+
+        if (seg == NULL) {
+            if (win_ptr->shm_allocated == TRUE)
+                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+        }
+        MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
+                             "MPID_Segment");
+        MPID_Segment_init(get_accum_pkt->addr, get_accum_pkt->count, get_accum_pkt->datatype, seg,
+                          0);
+        MPID_Segment_pack(seg, 0, &last, sreq->dev.user_buf);
+        MPID_Segment_free(seg);
+    }
 
     if (get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM_IMMED) {
         /* All data fits in packet header */
@@ -1175,6 +1239,7 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     MPI_Aint type_size;
     MPID_IOV iov[MPID_IOV_LIMIT];
     int iovcnt;
+    int is_contig;
     int mpi_errno = MPI_SUCCESS;
 
     /* Piggyback candidate should have basic datatype for target datatype. */
@@ -1188,6 +1253,8 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
 
     MPID_Datatype_get_size_macro(fop_pkt->datatype, type_size);
 
+    MPID_Datatype_is_contig(fop_pkt->datatype, &is_contig);
+
     if (fop_pkt->flags & MPIDI_CH3_PKT_FOP_IMMED) {
         MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP_IMMED);
     }
@@ -1238,9 +1305,23 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
             MPIU_ERR_POP(mpi_errno);
         }
     }
-    else {
+    else if (is_contig) {
         MPIU_Memcpy(resp_req->dev.user_buf, fop_pkt->addr, type_size);
     }
+    else {
+        MPID_Segment *seg = MPID_Segment_alloc();
+        MPI_Aint last = type_size;
+
+        if (seg == NULL) {
+            if (win_ptr->shm_allocated == TRUE)
+                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+        }
+        MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
+                             "MPID_Segment");
+        MPID_Segment_init(fop_pkt->addr, 1, fop_pkt->datatype, seg, 0);
+        MPID_Segment_pack(seg, 0, &last, resp_req->dev.user_buf);
+        MPID_Segment_free(seg);
+    }
 
     /* Apply the op */
     if (fop_pkt->op != MPI_NO_OP) {
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 03787af..d723cc1 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -124,6 +124,7 @@ int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
         MPI_Aint origin_type_size;
         size_t immed_len, len;
         int use_immed_pkt = FALSE;
+        int is_origin_contig, is_target_contig;
 
         /* queue it up */
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
@@ -159,11 +160,14 @@ int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
             new_ptr->is_dt = 1;
         }
 
+        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
+        MPID_Datatype_is_contig(target_datatype, &is_target_contig);
+
         MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
         MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);
 
         /* Judge if we can use IMMED data packet */
-        if (!new_ptr->is_dt) {
+        if (!new_ptr->is_dt && is_origin_contig && is_target_contig) {
             MPIU_Assign_trunc(immed_len,
                               (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
                               size_t);
@@ -318,6 +322,7 @@ int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
         MPI_Aint target_type_size;
         size_t immed_len, len;
         int use_immed_resp_pkt = FALSE;
+        int is_origin_contig, is_target_contig;
 
         /* queue it up */
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
@@ -353,11 +358,14 @@ int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
             new_ptr->is_dt = 1;
         }
 
+        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
+        MPID_Datatype_is_contig(target_datatype, &is_target_contig);
+
         MPID_Datatype_get_size_macro(target_datatype, target_type_size);
         MPIU_Assign_trunc(len, target_count * target_type_size, size_t);
 
         /* Judge if we can use IMMED data response packet */
-        if (!new_ptr->is_dt) {
+        if (!new_ptr->is_dt && is_origin_contig && is_target_contig) {
             MPIU_Assign_trunc(immed_len,
                               (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
                               size_t);
@@ -501,6 +509,7 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
         MPI_Aint origin_type_size;
         size_t immed_len, len;
         int use_immed_pkt = FALSE;
+        int is_origin_contig, is_target_contig;
 
         /* queue it up */
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
@@ -538,8 +547,11 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
         MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
         MPIU_Assign_trunc(len, origin_count * origin_type_size, size_t);
 
+        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
+        MPID_Datatype_is_contig(target_datatype, &is_target_contig);
+
         /* Judge if we can use IMMED data packet */
-        if (!new_ptr->is_dt) {
+        if (!new_ptr->is_dt && is_origin_contig && is_target_contig) {
             MPIU_Assign_trunc(immed_len,
                               (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
                               size_t);
@@ -712,6 +724,7 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             MPI_Aint target_type_size;
             size_t len, immed_len;
             int use_immed_resp_pkt = FALSE;
+            int is_result_contig, is_target_contig;
 
             /******************** Setting operation struct areas ***********************/
 
@@ -740,8 +753,11 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             MPID_Datatype_get_size_macro(target_datatype, target_type_size);
             MPIU_Assign_trunc(len, target_count * target_type_size, size_t);
 
+            MPID_Datatype_is_contig(result_datatype, &is_result_contig);
+            MPID_Datatype_is_contig(target_datatype, &is_target_contig);
+
             /* Judge if we can use IMMED data response packet */
-            if (!new_ptr->is_dt) {
+            if (!new_ptr->is_dt && is_result_contig && is_target_contig) {
                 MPIU_Assign_trunc(immed_len,
                                   (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
                                   size_t);
@@ -777,6 +793,7 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             MPI_Aint origin_type_size;
             size_t immed_len, orig_len;
             int use_immed_pkt = FALSE;
+            int is_origin_contig, is_target_contig, is_result_contig;
 
             /******************** Setting operation struct areas ***********************/
 
@@ -815,8 +832,12 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
             MPIU_Assign_trunc(orig_len, origin_count * origin_type_size, size_t);
 
+            MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
+            MPID_Datatype_is_contig(target_datatype, &is_target_contig);
+            MPID_Datatype_is_contig(result_datatype, &is_result_contig);
+
             /* Judge if we can use IMMED data packet */
-            if (!new_ptr->is_dt) {
+            if (!new_ptr->is_dt && is_origin_contig && is_target_contig && is_result_contig) {
                 MPIU_Assign_trunc(immed_len,
                                   (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
                                   size_t);
@@ -1231,6 +1252,7 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             MPI_Aint target_type_size;
             size_t immed_len;
             int use_immed_resp_pkt = FALSE;
+            int is_contig;
 
             /******************** Setting operation struct areas ***********************/
 
@@ -1246,12 +1268,16 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             MPID_Datatype_get_size_macro(datatype, target_type_size);
             MPIU_Assert(target_type_size <= sizeof(MPIDI_CH3_FOP_Immed_u));
 
-            /* Judege if we can use IMMED data for response packet */
-            MPIU_Assign_trunc(immed_len,
-                              (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
-                              size_t);
-            if (target_type_size <= immed_len)
-                use_immed_resp_pkt = TRUE;
+            MPID_Datatype_is_contig(datatype, &is_contig);
+
+            if (is_contig) {
+                /* Judege if we can use IMMED data for response packet */
+                MPIU_Assign_trunc(immed_len,
+                                  (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
+                                  size_t);
+                if (target_type_size <= immed_len)
+                    use_immed_resp_pkt = TRUE;
+            }
 
             get_pkt = &(new_ptr->pkt.get);
             MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
@@ -1270,6 +1296,7 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             MPI_Aint type_size;
             size_t immed_len;
             int use_immed_pkt = FALSE;
+            int is_contig;
 
             /******************** Setting operation struct areas ***********************/
 
@@ -1287,10 +1314,15 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             MPID_Datatype_get_size_macro(datatype, type_size);
             MPIU_Assert(type_size <= sizeof(MPIDI_CH3_FOP_Immed_u));
 
-            /* Judge if we can use IMMED data packet */
-            MPIU_Assign_trunc(immed_len, (MPIDI_RMA_IMMED_BYTES / type_size) * type_size, size_t);
-            if (type_size <= immed_len) {
-                use_immed_pkt = TRUE;
+            MPID_Datatype_is_contig(datatype, &is_contig);
+
+            if (is_contig) {
+                /* Judge if we can use IMMED data packet */
+                MPIU_Assign_trunc(immed_len,
+                                  (MPIDI_RMA_IMMED_BYTES / type_size) * type_size, size_t);
+                if (type_size <= immed_len) {
+                    use_immed_pkt = TRUE;
+                }
             }
 
             fop_pkt = &(new_ptr->pkt.fop);
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index fe6af1e..9061bb9 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -407,6 +407,7 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         MPIDI_CH3_Pkt_get_resp_t *get_resp_pkt = &upkt.get_resp;
         size_t len;
         int iovcnt;
+        int is_contig;
 
         MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_RESP);
         req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetSendComplete;
@@ -432,6 +433,8 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         /* length of target data */
         MPID_Datatype_get_size_macro(get_pkt->datatype, type_size);
 
+        MPID_Datatype_is_contig(get_pkt->datatype, &is_contig);
+
         if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
             MPIU_Assign_trunc(len, get_pkt->count * type_size, size_t);
             void *src = (void *) (get_pkt->addr), *dest = (void *) (get_resp_pkt->info.data);
@@ -442,25 +445,54 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
             iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
             iovcnt = 1;
+
+            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+            mpi_errno = MPIDI_CH3_iSendv(vc, req, iov, iovcnt);
+            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+            /* --BEGIN ERROR HANDLING-- */
+            if (mpi_errno != MPI_SUCCESS) {
+                MPIU_Object_set_ref(req, 0);
+                MPIDI_CH3_Request_destroy(req);
+                MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+            }
+            /* --END ERROR HANDLING-- */
         }
-        else {
+        else if (is_contig) {
             iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
             iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
             iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) get_pkt->addr);
             iov[1].MPID_IOV_LEN = get_pkt->count * type_size;
             iovcnt = 2;
+
+            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+            mpi_errno = MPIDI_CH3_iSendv(vc, req, iov, iovcnt);
+            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+            /* --BEGIN ERROR HANDLING-- */
+            if (mpi_errno != MPI_SUCCESS) {
+                MPIU_Object_set_ref(req, 0);
+                MPIDI_CH3_Request_destroy(req);
+                MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+            }
+            /* --END ERROR HANDLING-- */
         }
+        else {
+            iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
+            iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
 
-        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno = MPIDI_CH3_iSendv(vc, req, iov, iovcnt);
-        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-        /* --BEGIN ERROR HANDLING-- */
-        if (mpi_errno != MPI_SUCCESS) {
-            MPIU_Object_set_ref(req, 0);
-            MPIDI_CH3_Request_destroy(req);
-            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+            req->dev.segment_ptr = MPID_Segment_alloc();
+            MPIU_ERR_CHKANDJUMP1(req->dev.segment_ptr == NULL, mpi_errno,
+                                 MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
+
+            MPID_Segment_init(get_pkt->addr, get_pkt->count,
+                              get_pkt->datatype, req->dev.segment_ptr, 0);
+            req->dev.segment_first = 0;
+            req->dev.segment_size = get_pkt->count * type_size;
+
+            MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+            mpi_errno = vc->sendNoncontig_fn(vc, req, iov[0].MPID_IOV_BUF, iov[0].MPID_IOV_LEN);
+            MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+            MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
         }
-        /* --END ERROR HANDLING-- */
 
         *buflen = sizeof(MPIDI_CH3_Pkt_t);
         *rreqp = NULL;

http://git.mpich.org/mpich.git/commitdiff/67b69b2a199e8d7f81e6bf63d169441f22a80c16

commit 67b69b2a199e8d7f81e6bf63d169441f22a80c16
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Mar 3 19:09:41 2015 -0600

    Make 'eltype' in datatype struct store basic type.
    
    'eltype' in the datatype struct was originally used to store the
    builtin datatype. However, this is not correct when working
    with RMA ACC-like operations, since ACC-like operations need
    to work with the basic type.
    
    In this patch we make 'eltype' store the basic type.
    Note that (1) whenever we need the builtin type,
    we should call the macro MPID_Datatype_get_basic_type instead
    of directly accessing 'eltype'; (2) 'element_size' and
    'n_elements' still represent the builtin type, whereas 'eltype'
    represents the basic type.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/binding/fortran/use_mpi/create_f90_util.c b/src/binding/fortran/use_mpi/create_f90_util.c
index 58985c5..f81c285 100644
--- a/src/binding/fortran/use_mpi/create_f90_util.c
+++ b/src/binding/fortran/use_mpi/create_f90_util.c
@@ -110,11 +110,13 @@ int MPIR_Create_unnamed_predefined( MPI_Datatype old, int combiner,
 #ifndef NDEBUG
         {
             MPI_Datatype old_basic = MPI_DATATYPE_NULL;
+            MPI_Datatype new_basic = MPI_DATATYPE_NULL;
             /* we used MPID_Type_contiguous and then stomped it's contents
              * information, so make sure that the eltype is usable by
              * MPID_Type_commit */
             MPID_Datatype_get_basic_type(old, old_basic);
-            MPIU_Assert(new_dtp->eltype == old_basic);
+            MPID_Datatype_get_basic_type(new_dtp->handle, new_basic);
+            MPIU_Assert(new_basic == old_basic);
         }
 #endif
 
diff --git a/src/mpi/datatype/get_elements_x.c b/src/mpi/datatype/get_elements_x.c
index 092c1af..9c09702 100644
--- a/src/mpi/datatype/get_elements_x.c
+++ b/src/mpi/datatype/get_elements_x.c
@@ -176,9 +176,11 @@ PMPI_LOCAL MPI_Count MPIR_Type_get_elements(MPI_Count *bytes_p,
         return MPIR_Type_get_basic_type_elements(bytes_p, count, datatype);
     }
     else if (datatype_ptr->element_size >= 0) {
+        MPI_Datatype basic_type = MPI_DATATYPE_NULL;
+        MPID_Datatype_get_basic_type(datatype_ptr->eltype, basic_type);
         return MPIR_Type_get_basic_type_elements(bytes_p,
                                                  count * datatype_ptr->n_elements,
-                                                 datatype_ptr->eltype);
+                                                 basic_type);
     }
     else {
         /* we have bytes left and still don't have a single element size; must
@@ -304,9 +306,11 @@ int MPIR_Get_elements_x_impl(const MPI_Status *status, MPI_Datatype datatype, MP
          * be in bytes
          */
         if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
+            MPI_Datatype basic_type = MPI_DATATYPE_NULL;
+            MPID_Datatype_get_basic_type(datatype_ptr->eltype, basic_type);
             *elements = MPIR_Type_get_basic_type_elements(&byte_count,
                                                           -1,
-                                                          datatype_ptr->eltype);
+                                                          basic_type);
         }
         else {
             /* Behaves just like MPI_Get_Count in the predefined case */
diff --git a/src/mpid/common/datatype/mpid_datatype.h b/src/mpid/common/datatype/mpid_datatype.h
index 9f1e51a..85887d1 100644
--- a/src/mpid/common/datatype/mpid_datatype.h
+++ b/src/mpid/common/datatype/mpid_datatype.h
@@ -45,6 +45,12 @@
 	    break;							\
  									\
     }									\
+    /* This macro returns the builtin type, if 'basic_type' is not      \
+     * a builtin type, it must be a pair type composed of different     \
+     * builtin types, so we return MPI_DATATYPE_NULL here.              \
+     */                                                                 \
+    if (HANDLE_GET_KIND(eltype_) != HANDLE_KIND_BUILTIN)                \
+        eltype_ = MPI_DATATYPE_NULL;                                    \
  } while(0)
 
 /* MPID_Datatype_release decrements the reference count on the MPID_Datatype
@@ -372,6 +378,8 @@ typedef struct MPID_Datatype {
      * if type is composed of more than one element type, then
      * eltype == MPI_DATATYPE_NULL and element_size == -1
      */
+    /* Note that here eltype refers to predefined type, not the builtin
+       type, whereas n_elements and element_size refers to builtin type. */
     int      eltype;
     MPI_Aint n_elements;
     MPI_Aint element_size;
diff --git a/src/mpid/common/datatype/mpid_type_create_pairtype.c b/src/mpid/common/datatype/mpid_type_create_pairtype.c
index 20882a7..28ea16f 100644
--- a/src/mpid/common/datatype/mpid_type_create_pairtype.c
+++ b/src/mpid/common/datatype/mpid_type_create_pairtype.c
@@ -121,7 +121,7 @@ int MPID_Type_create_pairtype(MPI_Datatype type,
 
     new_dtp->n_elements      = 2;
     new_dtp->element_size    = el_size;
-    new_dtp->eltype          = MPI_DATATYPE_NULL;
+    new_dtp->eltype          = type;
 
     new_dtp->has_sticky_lb   = 0;
     new_dtp->true_lb         = 0;

http://git.mpich.org/mpich.git/commitdiff/49dd90f4565e0c97397c25cf19444f4aefbdeab1

commit 49dd90f4565e0c97397c25cf19444f4aefbdeab1
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Mar 3 20:59:14 2015 -0600

    Modify macro PAIRTYPE_SIZE_EXTENT to accept correct arguments.
    
    The original implementation of PAIRTYPE_SIZE_EXTENT is not
    correct because it directly modifies variables internally
    without letting the user pass them. This patch adds those
    variables in the argument list.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/common/datatype/mpid_type_create_pairtype.c b/src/mpid/common/datatype/mpid_type_create_pairtype.c
index d491800..20882a7 100644
--- a/src/mpid/common/datatype/mpid_type_create_pairtype.c
+++ b/src/mpid/common/datatype/mpid_type_create_pairtype.c
@@ -13,16 +13,17 @@
 /* PAIRTYPE_SIZE_EXTENT - calculates size, extent, etc. for pairtype by
  * defining the appropriate C type.
  */
-#define PAIRTYPE_SIZE_EXTENT(mt1_,ut1_,mt2_,ut2_)			\
+#define PAIRTYPE_SIZE_EXTENT(mt1_,ut1_,mt2_,ut2_, type_size_, type_extent_, \
+                             el_size_, true_ub_, alignsize_)            \
     {									\
 	struct { ut1_ a; ut2_ b; } foo;					\
-	type_size   = sizeof(foo.a) + sizeof(foo.b);			\
-	type_extent = (MPI_Aint) sizeof(foo);				\
-	el_size = (sizeof(foo.a) == sizeof(foo.b)) ? (int) sizeof(foo.a) : -1; \
-	true_ub = (MPI_VOID_PTR_CAST_TO_MPI_AINT ((char *) &foo.b -     \
+	type_size_   = sizeof(foo.a) + sizeof(foo.b);			\
+	type_extent_ = (MPI_Aint) sizeof(foo);				\
+	el_size_ = (sizeof(foo.a) == sizeof(foo.b)) ? (int) sizeof(foo.a) : -1; \
+	true_ub_ = (MPI_VOID_PTR_CAST_TO_MPI_AINT ((char *) &foo.b -     \
                                                   (char *) &foo.a)) +   \
                   (MPI_Aint) sizeof(foo.b);                             \
-	alignsize = MPIR_MAX(MPID_Datatype_get_basic_size(mt1_),	\
+	alignsize_ = MPIR_MAX(MPID_Datatype_get_basic_size(mt1_),	\
                              MPID_Datatype_get_basic_size(mt2_));	\
     }
 
@@ -87,19 +88,24 @@ int MPID_Type_create_pairtype(MPI_Datatype type,
 
     switch(type) {
 	case MPI_FLOAT_INT:
-	    PAIRTYPE_SIZE_EXTENT(MPI_FLOAT, float, MPI_INT, int);
+            PAIRTYPE_SIZE_EXTENT(MPI_FLOAT, float, MPI_INT, int,
+                                 type_size, type_extent, el_size, true_ub, alignsize);
 	    break;
 	case MPI_DOUBLE_INT:
-	    PAIRTYPE_SIZE_EXTENT(MPI_DOUBLE, double, MPI_INT, int);
+            PAIRTYPE_SIZE_EXTENT(MPI_DOUBLE, double, MPI_INT, int,
+                                 type_size, type_extent, el_size, true_ub, alignsize);
 	    break;
 	case MPI_LONG_INT:
-	    PAIRTYPE_SIZE_EXTENT(MPI_LONG, long, MPI_INT, int);
+            PAIRTYPE_SIZE_EXTENT(MPI_LONG, long, MPI_INT, int,
+                                 type_size, type_extent, el_size, true_ub, alignsize);
 	    break;
 	case MPI_SHORT_INT:
-	    PAIRTYPE_SIZE_EXTENT(MPI_SHORT, short, MPI_INT, int);
+            PAIRTYPE_SIZE_EXTENT(MPI_SHORT, short, MPI_INT, int,
+                                 type_size, type_extent, el_size, true_ub, alignsize);
 	    break;
 	case MPI_LONG_DOUBLE_INT:
-	    PAIRTYPE_SIZE_EXTENT(MPI_LONG_DOUBLE, long double, MPI_INT, int);
+            PAIRTYPE_SIZE_EXTENT(MPI_LONG_DOUBLE, long double, MPI_INT, int,
+                                 type_size, type_extent, el_size, true_ub, alignsize);
 	    break;
 	default:
 	    /* --BEGIN ERROR HANDLING-- */

http://git.mpich.org/mpich.git/commitdiff/7899a60219495a8c6e45299facdf15121ba167ae

commit 7899a60219495a8c6e45299facdf15121ba167ae
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:05:30 2015 -0800

    Code-refactoring: set iov in issue_from_origin_buffer.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index eda6617..2ba4274 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -183,11 +183,12 @@ static int create_datatype(const MPIDI_RMA_dtype_info * dtype_info,
 #define FUNCNAME issue_from_origin_buffer
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPID_IOV * iov, MPIDI_VC_t * vc)
+static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPIDI_VC_t * vc)
 {
     MPI_Aint origin_type_size;
     MPI_Datatype target_datatype;
     MPID_Datatype *target_dtp = NULL, *origin_dtp = NULL;
+    MPID_IOV iov[MPID_IOV_LIMIT];
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_FROM_ORIGIN_BUFFER);
 
@@ -214,11 +215,18 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPID_IOV * iov, MPI
 
     MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
 
+    iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) & (rma_op->pkt);
+    iov[0].MPID_IOV_LEN = sizeof(rma_op->pkt);
+
     if (target_dtp == NULL) {
         /* basic datatype on target */
         if (origin_dtp == NULL) {
             /* basic datatype on origin */
             int iovcnt = 2;
+
+            iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) rma_op->origin_addr);
+            iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
+
             MPIU_THREAD_CS_ENTER(CH3COMM, vc);
             mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, iovcnt, &rma_op->request);
             MPIU_THREAD_CS_EXIT(CH3COMM, vc);
@@ -324,10 +332,8 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
                         MPIDI_RMA_Target_t * target_ptr, MPIDI_CH3_Pkt_flags_t flags)
 {
     MPIDI_VC_t *vc = NULL;
-    MPI_Aint origin_type_size;
     MPID_Comm *comm_ptr = win_ptr->comm_ptr;
     MPIDI_CH3_Pkt_put_t *put_pkt = &rma_op->pkt.put;
-    MPID_IOV iov[MPID_IOV_LIMIT];
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_PUT_OP);
 
@@ -337,8 +343,6 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
 
     put_pkt->flags |= flags;
 
-    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
-
     MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
 
     if (rma_op->pkt.type == MPIDI_CH3_PKT_PUT_IMMED) {
@@ -349,13 +353,7 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
-        /* We still need to issue from origin buffer. */
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) put_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*put_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) rma_op->origin_addr);
-        iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
-
-        mpi_errno = issue_from_origin_buffer(rma_op, iov, vc);
+        mpi_errno = issue_from_origin_buffer(rma_op, vc);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -379,10 +377,8 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
                         MPIDI_RMA_Target_t * target_ptr, MPIDI_CH3_Pkt_flags_t flags)
 {
     MPIDI_VC_t *vc = NULL;
-    MPI_Aint origin_type_size;
     MPID_Comm *comm_ptr = win_ptr->comm_ptr;
     MPIDI_CH3_Pkt_accum_t *accum_pkt = &rma_op->pkt.accum;
-    MPID_IOV iov[MPID_IOV_LIMIT];
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_ACC_OP);
 
@@ -392,8 +388,6 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
 
     accum_pkt->flags |= flags;
 
-    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
-
     MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
 
     if (rma_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE_IMMED) {
@@ -404,13 +398,7 @@ static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
-        /* We still need to issue from origin buffer. */
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) accum_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*accum_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) rma_op->origin_addr);
-        iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
-
-        mpi_errno = issue_from_origin_buffer(rma_op, iov, vc);
+        mpi_errno = issue_from_origin_buffer(rma_op, vc);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -432,10 +420,8 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
                             MPIDI_RMA_Target_t * target_ptr, MPIDI_CH3_Pkt_flags_t flags)
 {
     MPIDI_VC_t *vc = NULL;
-    MPI_Aint origin_type_size;
     MPID_Comm *comm_ptr = win_ptr->comm_ptr;
     MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt = &rma_op->pkt.get_accum;
-    MPID_IOV iov[MPID_IOV_LIMIT];
     MPID_Request *resp_req = NULL;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_GET_ACC_OP);
@@ -471,8 +457,6 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
 
     get_accum_pkt->flags |= flags;
 
-    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
-
     MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
 
     if (rma_op->pkt.type == MPIDI_CH3_PKT_GET_ACCUM_IMMED) {
@@ -484,13 +468,7 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
-        /* We still need to issue from origin buffer. */
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*get_accum_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) rma_op->origin_addr);
-        iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
-
-        mpi_errno = issue_from_origin_buffer(rma_op, iov, vc);
+        mpi_errno = issue_from_origin_buffer(rma_op, vc);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
@@ -714,8 +692,6 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
     MPID_Comm *comm_ptr = win_ptr->comm_ptr;
     MPIDI_CH3_Pkt_fop_t *fop_pkt = &rma_op->pkt.fop;
     MPID_Request *resp_req = NULL;
-    MPI_Aint origin_type_size;
-    MPID_IOV iov[MPID_IOV_LIMIT];
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_FOP_OP);
 
@@ -742,8 +718,6 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
 
     MPIDI_Comm_get_vc_set_active(comm_ptr, rma_op->target_rank, &vc);
 
-    MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
-
     if (rma_op->pkt.type == MPIDI_CH3_PKT_FOP_IMMED) {
         /* All origin data is in packet header, issue the header. */
         MPIU_THREAD_CS_ENTER(CH3COMM, vc);
@@ -752,13 +726,7 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
-        /* We still need to issue from origin buffer. */
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) fop_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*fop_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) rma_op->origin_addr);
-        iov[1].MPID_IOV_LEN = origin_type_size;
-
-        mpi_errno = issue_from_origin_buffer(rma_op, iov, vc);
+        mpi_errno = issue_from_origin_buffer(rma_op, vc);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }

http://git.mpich.org/mpich.git/commitdiff/fa7fe99923c40f216d0709f55ad841ab537e5155

commit fa7fe99923c40f216d0709f55ad841ab537e5155
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Mar 3 05:30:21 2015 -0600

    Add FIXME about extending piggybacking LOCK to op with derived DT.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index c8eda8a..03787af 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -173,6 +173,9 @@ int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
 
         /* Judge if this operation is an piggyback candidate */
         if (!new_ptr->is_dt) {
+            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
+             * for both origin and target data. We should extend this optimization to derived
+             * datatypes as well. */
             if (len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                 new_ptr->piggyback_lock_candidate = 1;
         }
@@ -364,6 +367,9 @@ int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
 
         /* Judge if this operation is an piggyback candidate. */
         if (!new_ptr->is_dt) {
+            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
+             * for both origin and target data. We should extend this optimization to derived
+             * datatypes as well. */
             new_ptr->piggyback_lock_candidate = 1;
         }
 
@@ -543,6 +549,9 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
 
         /* Judge if this operation is an piggyback candidate. */
         if (!new_ptr->is_dt) {
+            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
+             * for both origin and target data. We should extend this optimization to derived
+             * datatypes as well. */
             if (len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                 new_ptr->piggyback_lock_candidate = 1;
         }
@@ -742,6 +751,9 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
 
             /* Judge if this operation is a piggyback candidate */
             if (!new_ptr->is_dt) {
+                /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
+                 * for both origin and target data. We should extend this optimization to derived
+                 * datatypes as well. */
                 new_ptr->piggyback_lock_candidate = 1;
             }
 
@@ -814,6 +826,9 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
 
             /* Judge if this operation is a piggyback candidate */
             if (!new_ptr->is_dt) {
+                /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
+                 * for origin, target and result data. We should extend this optimization to derived
+                 * datatypes as well. */
                 if (orig_len <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                     new_ptr->piggyback_lock_candidate = 1;
             }

http://git.mpich.org/mpich.git/commitdiff/2317b31de63148e3248eecac923dd5c4fa87b001

commit 2317b31de63148e3248eecac923dd5c4fa87b001
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:04:20 2015 -0800

    Simplify code: deleting derived DT code for op piggybacked with LOCK.
    
    We piggyback the LOCK flag with operations that do not use
    derived datatypes. Therefore, here we delete the unnecessary
    code that deals with derived datatypes in the piggyback LOCK code.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 337a003..8ad48fd 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -1099,24 +1099,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
         }
     }
     else {
-        if (MPIR_DATATYPE_IS_PREDEFINED(get_accum_pkt->datatype)) {
-            MPIU_Memcpy(sreq->dev.user_buf, get_accum_pkt->addr, get_accum_pkt->count * type_size);
-        }
-        else {
-            MPID_Segment *seg = MPID_Segment_alloc();
-            MPI_Aint last = type_size * get_accum_pkt->count;
-
-            if (seg == NULL) {
-                if (win_ptr->shm_allocated == TRUE)
-                    MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-            }
-            MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                 "MPID_Segment");
-            MPID_Segment_init(get_accum_pkt->addr, get_accum_pkt->count, get_accum_pkt->datatype,
-                              seg, 0);
-            MPID_Segment_pack(seg, 0, &last, sreq->dev.user_buf);
-            MPID_Segment_free(seg);
-        }
+        MPIU_Memcpy(sreq->dev.user_buf, get_accum_pkt->addr, get_accum_pkt->count * type_size);
     }
 
     if (get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM_IMMED) {

http://git.mpich.org/mpich.git/commitdiff/344bf9589fe22aedfa6f270d2f7054be0eff5cc4

commit 344bf9589fe22aedfa6f270d2f7054be0eff5cc4
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 19:03:23 2015 -0800

    Simplify code: not using flag MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP for GACC/FOP.
    
    Flag MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP is used to tell the target
    if the response packet of current GET, GACC and FOP should use
    IMMED packet type. We use IMMED packet type only when
    origin/target/result datatypes are all basic types.
    Since the target does not know origin/result datatypes, origin
    process needs to set a flag to inform the target.
    
    However, this usage is redundant for GACC and FOP packets. The
    reason is that, when we use the IMMED packet type for GACC/FOP
    packets, origin/target/result datatypes must be basic types.
    In such a case, we must use the IMMED packet type for response
    packets as well, so the usage of MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP
    and related code is not necessary. In short, flag
    MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP is useful only for the GET operation.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 8b04b1b..337a003 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -226,12 +226,7 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
 
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
-    if ((rreq->dev.flags) & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
-        MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED);
-    }
-    else {
-        MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
-    }
+    MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
     get_accum_resp_pkt->request_handle = rreq->dev.resp_request_handle;
     get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
     get_accum_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
@@ -249,44 +244,30 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
     MPIU_ERR_CHKANDJUMP(resp_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
     MPIU_Object_set_ref(resp_req, 1);
 
-    if (!((rreq->dev.flags) & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP)) {
-        MPIU_CHKPMEM_MALLOC(resp_req->dev.user_buf, void *, rreq->dev.user_count * type_size,
-                            mpi_errno, "GACC resp. buffer");
-    }
+    MPIU_CHKPMEM_MALLOC(resp_req->dev.user_buf, void *, rreq->dev.user_count * type_size,
+                        mpi_errno, "GACC resp. buffer");
 
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 
-    if ((rreq->dev.flags) & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
-        void *src = (void *) (rreq->dev.real_user_buf), *dest =
-            (void *) (get_accum_resp_pkt->info.data);
-        mpi_errno = immed_copy(src, dest, rreq->dev.user_count * type_size);
-        if (mpi_errno != MPI_SUCCESS) {
-            if (win_ptr->shm_allocated == TRUE)
-                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-            MPIU_ERR_POP(mpi_errno);
-        }
+    if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype)) {
+        MPIU_Memcpy(resp_req->dev.user_buf, rreq->dev.real_user_buf,
+                    rreq->dev.user_count * type_size);
     }
     else {
-        if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype)) {
-            MPIU_Memcpy(resp_req->dev.user_buf, rreq->dev.real_user_buf,
-                        rreq->dev.user_count * type_size);
-        }
-        else {
-            MPID_Segment *seg = MPID_Segment_alloc();
-            MPI_Aint last = type_size * rreq->dev.user_count;
+        MPID_Segment *seg = MPID_Segment_alloc();
+        MPI_Aint last = type_size * rreq->dev.user_count;
 
-            if (seg == NULL) {
-                if (win_ptr->shm_allocated == TRUE)
-                    MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-            }
-            MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                 "MPID_Segment");
-            MPID_Segment_init(rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype,
-                              seg, 0);
-            MPID_Segment_pack(seg, 0, &last, resp_req->dev.user_buf);
-            MPID_Segment_free(seg);
+        if (seg == NULL) {
+            if (win_ptr->shm_allocated == TRUE)
+                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
         }
+        MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
+                             "MPID_Segment");
+        MPID_Segment_init(rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype, seg,
+                          0);
+        MPID_Segment_pack(seg, 0, &last, resp_req->dev.user_buf);
+        MPID_Segment_free(seg);
     }
 
     /* accumulate data from tmp_buf into user_buf */
@@ -308,19 +289,11 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq
      * operation are completed when counter reaches zero. */
     win_ptr->at_completion_counter++;
 
-    if ((rreq->dev.flags) & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
-        /* All origin data is in packet header, issue the header. */
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
-        iovcnt = 1;
-    }
-    else {
-        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
-        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) resp_req->dev.user_buf);
-        iov[1].MPID_IOV_LEN = rreq->dev.user_count * type_size;
-        iovcnt = 2;
-    }
+    iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
+    iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
+    iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) resp_req->dev.user_buf);
+    iov[1].MPID_IOV_LEN = rreq->dev.user_count * type_size;
+    iovcnt = 2;
 
     MPIU_THREAD_CS_ENTER(CH3COMM, vc);
     mpi_errno = MPIDI_CH3_iSendv(vc, resp_req, iov, iovcnt);
@@ -372,8 +345,6 @@ int MPIDI_CH3_ReqHandler_FOPRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, i
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_FOPRECVCOMPLETE);
 
-    MPIU_Assert(!(rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP));
-
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
     MPID_Datatype_get_size_macro(rreq->dev.datatype, type_size);
@@ -583,8 +554,6 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMDERIVEDDTRECVCOMPLETE);
 
-    MPIU_Assert(!(rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP));
-
     /* create derived datatype */
     create_derived_datatype(rreq, &new_dtp);
 
@@ -1106,7 +1075,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
 
     /* Copy data into a temporary buffer */
     MPID_Datatype_get_size_macro(get_accum_pkt->datatype, type_size);
-    if (!(get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP))
+    if (get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM)
         sreq->dev.user_buf = (void *) MPIU_Malloc(get_accum_pkt->count * type_size);
     else {
         MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED);
@@ -1119,7 +1088,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 
-    if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
+    if (get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM_IMMED) {
         void *src = (void *) (get_accum_pkt->addr), *dest =
             (void *) (get_accum_resp_pkt->info.data);
         mpi_errno = immed_copy(src, dest, len);
@@ -1174,7 +1143,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
      * operation are completed when counter reaches zero. */
     win_ptr->at_completion_counter++;
 
-    if (!(get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP)) {
+    if (get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM) {
         MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
     }
     get_accum_resp_pkt->request_handle = get_accum_pkt->request_handle;
@@ -1187,7 +1156,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
         get_accum_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
     get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
 
-    if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
+    if (get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM_IMMED) {
         /* All origin data is in packet header, issue the header. */
         iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
         iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
@@ -1236,7 +1205,7 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
 
     MPID_Datatype_get_size_macro(fop_pkt->datatype, type_size);
 
-    if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
+    if (fop_pkt->flags & MPIDI_CH3_PKT_FOP_IMMED) {
         MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP_IMMED);
     }
     else {
@@ -1253,7 +1222,7 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
         (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
         fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
 
-    if (!(fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP)) {
+    if (fop_pkt->type == MPIDI_CH3_PKT_FOP) {
         resp_req = MPID_Request_create();
         if (resp_req == NULL) {
             MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomemreq");
@@ -1276,7 +1245,7 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 
-    if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
+    if (fop_pkt->type == MPIDI_CH3_PKT_FOP_IMMED) {
         /* copy data to resp pkt header */
         void *src = fop_pkt->addr, *dest = fop_resp_pkt->info.data;
         mpi_errno = immed_copy(src, dest, type_size);
@@ -1308,7 +1277,7 @@ static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_e
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
-    if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
+    if (fop_pkt->type == MPIDI_CH3_PKT_FOP_IMMED) {
         /* send back the original data */
         MPIU_THREAD_CS_ENTER(CH3COMM, lock_entry->vc);
         mpi_errno =
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 4e96661..c8eda8a 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -762,9 +762,9 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
 
         else {
             MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
-            MPI_Aint origin_type_size, target_type_size;
-            size_t immed_len, orig_len, tar_len;
-            int use_immed_pkt = FALSE, use_immed_resp_pkt = FALSE;
+            MPI_Aint origin_type_size;
+            size_t immed_len, orig_len;
+            int use_immed_pkt = FALSE;
 
             /******************** Setting operation struct areas ***********************/
 
@@ -803,9 +803,6 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
             MPIU_Assign_trunc(orig_len, origin_count * origin_type_size, size_t);
 
-            MPID_Datatype_get_size_macro(target_datatype, target_type_size);
-            MPIU_Assign_trunc(tar_len, target_count * target_type_size, size_t);
-
             /* Judge if we can use IMMED data packet */
             if (!new_ptr->is_dt) {
                 MPIU_Assign_trunc(immed_len,
@@ -813,12 +810,6 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                                   size_t);
                 if (orig_len <= immed_len)
                     use_immed_pkt = TRUE;
-
-                MPIU_Assign_trunc(immed_len,
-                                  (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
-                                  size_t);
-                if (tar_len <= immed_len)
-                    use_immed_resp_pkt = TRUE;
             }
 
             /* Judge if this operation is a piggyback candidate */
@@ -852,8 +843,6 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                 if (mpi_errno != MPI_SUCCESS)
                     MPIU_ERR_POP(mpi_errno);
             }
-            if (use_immed_resp_pkt)
-                get_accum_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
         }
 
         MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
@@ -1265,7 +1254,7 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             MPIDI_CH3_Pkt_fop_t *fop_pkt;
             MPI_Aint type_size;
             size_t immed_len;
-            int use_immed_pkt = FALSE, use_immed_resp_pkt = FALSE;
+            int use_immed_pkt = FALSE;
 
             /******************** Setting operation struct areas ***********************/
 
@@ -1287,7 +1276,6 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             MPIU_Assign_trunc(immed_len, (MPIDI_RMA_IMMED_BYTES / type_size) * type_size, size_t);
             if (type_size <= immed_len) {
                 use_immed_pkt = TRUE;
-                use_immed_resp_pkt = TRUE;
             }
 
             fop_pkt = &(new_ptr->pkt.fop);
@@ -1310,8 +1298,6 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
                 if (mpi_errno != MPI_SUCCESS)
                     MPIU_ERR_POP(mpi_errno);
             }
-            if (use_immed_resp_pkt)
-                fop_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
         }
 
         MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index 227f153..fe6af1e 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -776,14 +776,6 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumSendComplete;
         resp_req->dev.OnFinal = MPIDI_CH3_ReqHandler_GaccumSendComplete;
         resp_req->kind = MPID_REQUEST_SEND;
-        if (!(get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP)) {
-            tmp_buf = MPIU_Malloc(get_accum_pkt->count * type_size);
-            if (!tmp_buf) {
-                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                     get_accum_pkt->count * type_size);
-            }
-            resp_req->dev.user_buf = tmp_buf;
-        }
 
         /* here we increment the Active Target counter to guarantee the GET-like
          * operation are completed when counter reaches zero. */
@@ -792,13 +784,7 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         /* Calculate the length of reponse data, ensure that it fits into immed packet. */
         MPIU_Assign_trunc(len, get_accum_pkt->count * type_size, size_t);
 
-        if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
-            MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED);
-        }
-        else {
-            MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
-        }
-
+        MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED);
         get_accum_resp_pkt->request_handle = get_accum_pkt->request_handle;
         get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
         get_accum_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
@@ -812,19 +798,13 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         if (win_ptr->shm_allocated == TRUE)
             MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 
-        if (get_accum_resp_pkt->type == MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED) {
-            /* copy data from target buffer to response packet header */
-            src = (void *) (get_accum_pkt->addr), dest = (void *) (get_accum_resp_pkt->info.data);
-            mpi_errno = immed_copy(src, dest, len);
-            if (mpi_errno != MPI_SUCCESS) {
-                if (win_ptr->shm_allocated == TRUE)
-                    MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-                MPIU_ERR_POP(mpi_errno);
-            }
-        }
-        else {
-            MPIU_Memcpy(resp_req->dev.user_buf, get_accum_pkt->addr,
-                        get_accum_pkt->count * type_size);
+        /* copy data from target buffer to response packet header */
+        src = (void *) (get_accum_pkt->addr), dest = (void *) (get_accum_resp_pkt->info.data);
+        mpi_errno = immed_copy(src, dest, len);
+        if (mpi_errno != MPI_SUCCESS) {
+            if (win_ptr->shm_allocated == TRUE)
+                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+            MPIU_ERR_POP(mpi_errno);
         }
 
         /* perform accumulate operation. */
@@ -838,18 +818,9 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
 
-        if (get_accum_resp_pkt->type == MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED) {
-            iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
-            iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
-            iovcnt = 1;
-        }
-        else {
-            iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
-            iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
-            iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) resp_req->dev.user_buf);
-            iov[1].MPID_IOV_LEN = get_accum_pkt->count * type_size;
-            iovcnt = 2;
-        }
+        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
+        iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
+        iovcnt = 1;
 
         MPIU_THREAD_CS_ENTER(CH3COMM, vc);
         mpi_errno = MPIDI_CH3_iSendv(vc, resp_req, iov, iovcnt);
@@ -1198,8 +1169,6 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     if (pkt->type == MPIDI_CH3_PKT_FOP_IMMED) {
 
-        MPIU_Assert(fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP);
-
         MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP_IMMED);
         fop_resp_pkt->request_handle = fop_pkt->request_handle;
         fop_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
@@ -1269,8 +1238,6 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     else {
         MPIU_Assert(pkt->type == MPIDI_CH3_PKT_FOP);
 
-        MPIU_Assert(!(fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP));
-
         MPID_Request *req = NULL;
         char *data_buf = NULL;
         MPIDI_msg_sz_t data_len;

http://git.mpich.org/mpich.git/commitdiff/42b5fcf179fb3ad0b4fac74b8417d8ec2ca30e5e

commit 42b5fcf179fb3ad0b4fac74b8417d8ec2ca30e5e
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Mar 3 03:38:13 2015 -0600

    Use function hook instead of function pointer for win_free.
    
    The original implementation of win_free is not correct. The
    problem is described as follows:
    
    It uses a function pointer which is initially set to the CH3
    implementation, and can be overridden by the channel layer if
    the channel provides a specific implementation.  In the CH3
    win_free implementation, it first checks if all RMA
    communication is finished and epoch states are reset, then
    performs a global barrier, then frees the window resources
    that are allocated in CH3, and finally returns. In the Nemesis
    win_free implementation, it directly frees the window resources
    that are allocated in Nemesis, and calls the CH3 win_free last.
    This makes no sense because we free the window resources before
    checking if the RMA communication is completed.
    
    To fix this issue, we add a function hook for the channel layer
    to free its own resources, and the function hook is called from
    the CH3 win_free.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c b/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
index 9b49ac4..a71ab58 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
@@ -70,20 +70,14 @@ int MPIDI_CH3_SHM_Win_shared_query(MPID_Win * win_ptr, int target_rank, MPI_Aint
 int MPIDI_CH3_SHM_Win_free(MPID_Win ** win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
-    mpir_errflag_t errflag = MPIR_ERR_NONE;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_SHM_WIN_FREE);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3_SHM_WIN_FREE);
 
     if ((*win_ptr)->comm_ptr->node_comm == NULL) {
-        mpi_errno = MPIDI_Win_free(win_ptr);
         goto fn_exit;
     }
 
-    mpi_errno = MPIR_Barrier_impl((*win_ptr)->comm_ptr, &errflag);
-    if (mpi_errno)
-        MPIU_ERR_POP(mpi_errno);
-
     /* Free shared memory region */
     if ((*win_ptr)->shm_allocated) {
         /* free shm_base_addrs that's only used for shared memory windows */
@@ -153,11 +147,6 @@ int MPIDI_CH3_SHM_Win_free(MPID_Win ** win_ptr)
         MPIDI_CH3I_SHM_Wins_unlink(&shm_wins_list, (*win_ptr));
     }
 
-    mpi_errno = MPIDI_Win_free(win_ptr);
-    if (mpi_errno != MPI_SUCCESS) {
-        MPIU_ERR_POP(mpi_errno);
-    }
-
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3_SHM_WIN_FREE);
     return mpi_errno;
diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
index 6f184d7..8a8e40d 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
@@ -64,6 +64,7 @@ int MPIDI_CH3_Win_hooks_init(MPIDI_CH3U_Win_hooks_t * win_hooks)
 
     if (MPIDI_CH3I_Shm_supported()) {
         win_hooks->win_init = MPIDI_CH3I_Win_init;
+        win_hooks->win_free = MPIDI_CH3_SHM_Win_free;
     }
 
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3_WIN_HOOKS_INIT);
@@ -294,7 +295,6 @@ static int MPIDI_CH3I_Win_detect_shm(MPID_Win ** win_ptr)
     /* TODO: should we use the same mutex or create a new one ?
      * It causes unnecessary synchronization.*/
     (*win_ptr)->shm_mutex = shm_win_ptr->shm_mutex;
-    (*win_ptr)->RMAFns.Win_free = MPIDI_CH3_SHM_Win_free;
 
   fn_exit:
     MPIU_CHKLMEM_FREEALL();
@@ -441,8 +441,6 @@ static int MPIDI_CH3I_Win_gather_info(void *base, MPI_Aint size, int disp_unit,
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
-    (*win_ptr)->RMAFns.Win_free = MPIDI_CH3_SHM_Win_free;
-
   fn_exit:
     MPIU_CHKLMEM_FREEALL();
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);
@@ -718,7 +716,6 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
 
     /* Provide operation overrides for this window flavor */
     (*win_ptr)->RMAFns.Win_shared_query = MPIDI_CH3_SHM_Win_shared_query;
-    (*win_ptr)->RMAFns.Win_free = MPIDI_CH3_SHM_Win_free;
 
     /* Cache SHM windows */
     MPIDI_CH3I_SHM_Wins_append(&shm_wins_list, (*win_ptr));
diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index 1bf6abe..684835a 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -1132,6 +1132,7 @@ extern MPIDI_CH3U_Win_fns_t MPIDI_CH3U_Win_fns;
 
 typedef struct {
     int (*win_init)(MPI_Aint, int, int, int, MPID_Info *, MPID_Comm *, MPID_Win **);
+    int (*win_free)(MPID_Win **);
 } MPIDI_CH3U_Win_hooks_t;
 
 extern MPIDI_CH3U_Win_hooks_t MPIDI_CH3U_Win_hooks;
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index 964f224..58a55a6 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -213,11 +213,14 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
             MPIU_ERR_POP(mpi_errno);
     }
 
-    if (!(*win_ptr)->shm_allocated) {
-        /* when SHM is allocated, we already did a global barrier in
-         * MPIDI_CH3_SHM_Win_free, so we do not need to do it again here. */
-        mpi_errno = MPIR_Barrier_impl((*win_ptr)->comm_ptr, &errflag);
-        if (mpi_errno)
+    mpi_errno = MPIR_Barrier_impl((*win_ptr)->comm_ptr, &errflag);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
+
+    /* Free window resources in lower layer. */
+    if (MPIDI_CH3U_Win_hooks.win_free != NULL) {
+        mpi_errno = MPIDI_CH3U_Win_hooks.win_free(win_ptr);
+        if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
     }
 

http://git.mpich.org/mpich.git/commitdiff/9dbcae0c71c803d75fe00854ad4a265b91c1902d

commit 9dbcae0c71c803d75fe00854ad4a265b91c1902d
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Mar 3 03:31:52 2015 -0600

    Allow the channel layer to implement Win_gather_info function.
    
    In this patch, we first add a function pointer of Win_gather_info
    in CH3 to allow different channel layers to implement their own
    version of Win_gather_info function. The function pointer is
    initially set to the default implementation in CH3 layer. If the
    channel layer provides an implementation of Win_gather_info, it
    will override the function pointer.
    
    Secondly, we provide an implementation of Win_gather_info in the
    Nemesis layer. In this implementation, we allocate basic_info_table[]
    in the SHM region, so that processes on the same node can share the
    same basic_info_table[].
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/include/mpid_nem_pre.h b/src/mpid/ch3/channels/nemesis/include/mpid_nem_pre.h
index b954b44..ca0a4b2 100644
--- a/src/mpid/ch3/channels/nemesis/include/mpid_nem_pre.h
+++ b/src/mpid/ch3/channels/nemesis/include/mpid_nem_pre.h
@@ -25,6 +25,9 @@ typedef pthread_mutex_t MPIDI_CH3I_SHM_MUTEX;
                                            accumulate/atomic operations */              \
     MPIU_SHMW_Hnd_t shm_mutex_segment_handle; /* handle to interprocess mutex memory    \
                                                  region */                              \
-
+                                                                                        \
+    void *info_shm_base_addr; /* base address of shared memory region for window info */          \
+    MPI_Aint info_shm_segment_len; /* size of shared memory region for window info */             \
+    MPIU_SHMW_Hnd_t info_shm_segment_handle; /* handle to shared memory region for window info */ \
 
 #endif /* MPID_NEM_PRE_H */
diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c b/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
index 863fdc2..9b49ac4 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
@@ -134,6 +134,19 @@ int MPIDI_CH3_SHM_Win_free(MPID_Win ** win_ptr)
         MPIU_SHMW_Hnd_finalize(&(*win_ptr)->shm_mutex_segment_handle);
     }
 
+    /* Free shared memory region for window info */
+    if ((*win_ptr)->info_shm_base_addr != NULL) {
+        mpi_errno = MPIU_SHMW_Seg_detach((*win_ptr)->info_shm_segment_handle,
+                                         (char **) &(*win_ptr)->info_shm_base_addr,
+                                         (*win_ptr)->info_shm_segment_len);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+
+        MPIU_SHMW_Hnd_finalize(&(*win_ptr)->info_shm_segment_handle);
+
+        (*win_ptr)->basic_info_table = NULL;
+    }
+
     /* Unlink from global SHM window list if it is original shared window */
     if ((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_SHARED ||
         (*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE) {
diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
index adc6526..6f184d7 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
@@ -26,6 +26,9 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
 
 static int MPIDI_CH3I_Win_detect_shm(MPID_Win ** win_ptr);
 
+static int MPIDI_CH3I_Win_gather_info(void *base, MPI_Aint size, int disp_unit, MPID_Info * info,
+                                      MPID_Comm * comm_ptr, MPID_Win ** win_ptr);
+
 #undef FUNCNAME
 #define FUNCNAME MPIDI_CH3_Win_fns_init
 #undef FCNAME
@@ -40,6 +43,7 @@ int MPIDI_CH3_Win_fns_init(MPIDI_CH3U_Win_fns_t * win_fns)
     if (MPIDI_CH3I_Shm_supported()) {
         win_fns->allocate_shm = MPIDI_CH3I_Win_allocate_shm;
         win_fns->detect_shm = MPIDI_CH3I_Win_detect_shm;
+        win_fns->gather_info = MPIDI_CH3I_Win_gather_info;
     }
 
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3_WIN_FNS_INIT);
@@ -86,6 +90,10 @@ static int MPIDI_CH3I_Win_init(MPI_Aint size, int disp_unit, int create_flavor,
     (*win_ptr)->shm_mutex = NULL;
     (*win_ptr)->shm_mutex_segment_handle = 0;
 
+    (*win_ptr)->info_shm_base_addr = NULL;
+    (*win_ptr)->info_shm_segment_len = 0;
+    (*win_ptr)->info_shm_segment_handle = 0;
+
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_WIN_INIT);
     return mpi_errno;
@@ -300,6 +308,152 @@ static int MPIDI_CH3I_Win_detect_shm(MPID_Win ** win_ptr)
 }
 
 #undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Win_gather_info
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int MPIDI_CH3I_Win_gather_info(void *base, MPI_Aint size, int disp_unit, MPID_Info * info,
+                                      MPID_Comm * comm_ptr, MPID_Win ** win_ptr)
+{
+    MPID_Comm *node_comm_ptr = NULL;
+    int node_rank, node_size;
+    int comm_rank, comm_size;
+    MPI_Aint *tmp_buf = NULL;
+    int i, k;
+    mpir_errflag_t errflag = MPIR_ERR_NONE;
+    int mpi_errno = MPI_SUCCESS;
+    MPIU_CHKLMEM_DECL(1);
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);
+
+    if ((*win_ptr)->comm_ptr->node_comm == NULL) {
+        mpi_errno = MPIDI_CH3U_Win_gather_info(base, size, disp_unit, info, comm_ptr, win_ptr);
+        goto fn_exit;
+    }
+
+    comm_size = (*win_ptr)->comm_ptr->local_size;
+    comm_rank = (*win_ptr)->comm_ptr->rank;
+
+    node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
+    MPIU_Assert(node_comm_ptr != NULL);
+    node_size = node_comm_ptr->local_size;
+    node_rank = node_comm_ptr->rank;
+
+    (*win_ptr)->info_shm_segment_len = node_size * sizeof(MPIDI_Win_basic_info_t);
+
+    mpi_errno = MPIU_SHMW_Hnd_init(&(*win_ptr)->info_shm_segment_handle);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+
+    if (node_rank == 0) {
+        char *serialized_hnd_ptr = NULL;
+
+        /* create shared memory region for all processes in win and map. */
+        mpi_errno = MPIU_SHMW_Seg_create_and_attach((*win_ptr)->info_shm_segment_handle,
+                                                    (*win_ptr)->info_shm_segment_len,
+                                                    (char **) &(*win_ptr)->info_shm_base_addr, 0);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+
+        /* serialize handle and broadcast it to the other processes in win */
+        mpi_errno =
+            MPIU_SHMW_Hnd_get_serialized_by_ref((*win_ptr)->info_shm_segment_handle,
+                                                &serialized_hnd_ptr);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+
+        mpi_errno =
+            MPIR_Bcast_impl(serialized_hnd_ptr, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
+                            &errflag);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+
+        /* wait for other processes to attach to win */
+        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+
+        /* unlink shared memory region so it gets deleted when all processes exit */
+        mpi_errno = MPIU_SHMW_Seg_remove((*win_ptr)->info_shm_segment_handle);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+    }
+    else {
+        char serialized_hnd[MPIU_SHMW_GHND_SZ] = { 0 };
+
+        /* get serialized handle from rank 0 and deserialize it */
+        mpi_errno =
+            MPIR_Bcast_impl(serialized_hnd, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
+                            &errflag);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+
+        mpi_errno = MPIU_SHMW_Hnd_deserialize((*win_ptr)->info_shm_segment_handle, serialized_hnd,
+                                              strlen(serialized_hnd));
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+
+        /* attach to shared memory region created by rank 0 */
+        mpi_errno =
+            MPIU_SHMW_Seg_attach((*win_ptr)->info_shm_segment_handle,
+                                 (*win_ptr)->info_shm_segment_len,
+                                 (char **) &(*win_ptr)->info_shm_base_addr, 0);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+
+        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+    }
+
+    (*win_ptr)->basic_info_table = (MPIDI_Win_basic_info_t *) ((*win_ptr)->info_shm_base_addr);
+
+    MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * comm_size * sizeof(MPI_Aint),
+                        mpi_errno, "tmp_buf");
+
+    tmp_buf[4 * comm_rank] = MPIU_PtrToAint(base);
+    tmp_buf[4 * comm_rank + 1] = size;
+    tmp_buf[4 * comm_rank + 2] = (MPI_Aint) disp_unit;
+    tmp_buf[4 * comm_rank + 3] = (MPI_Aint) (*win_ptr)->handle;
+
+    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, tmp_buf, 4, MPI_AINT,
+                                    (*win_ptr)->comm_ptr, &errflag);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+
+    if (node_rank == 0) {
+        /* only node_rank == 0 writes results to basic_info_table on shared memory region. */
+        k = 0;
+        for (i = 0; i < comm_size; i++) {
+            (*win_ptr)->basic_info_table[i].base_addr = MPIU_AintToPtr(tmp_buf[k++]);
+            (*win_ptr)->basic_info_table[i].size = tmp_buf[k++];
+            (*win_ptr)->basic_info_table[i].disp_unit = (int) tmp_buf[k++];
+            (*win_ptr)->basic_info_table[i].win_handle = (MPI_Win) tmp_buf[k++];
+        }
+    }
+
+    /* Make sure that all local processes see the results written by node_rank == 0 */
+    mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+
+    (*win_ptr)->RMAFns.Win_free = MPIDI_CH3_SHM_Win_free;
+
+  fn_exit:
+    MPIU_CHKLMEM_FREEALL();
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+  fn_fail:
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+#undef FUNCNAME
 #define FUNCNAME MPIDI_CH3I_Win_allocate_shm
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
@@ -308,15 +462,13 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
 {
     int mpi_errno = MPI_SUCCESS;
     void **base_pp = (void **) base_ptr;
-    int i, k, comm_size, rank;
-    int node_size, node_rank;
+    int i, node_size, node_rank;
     MPID_Comm *node_comm_ptr;
     MPI_Aint *node_sizes;
-    MPI_Aint *tmp_buf;
     mpir_errflag_t errflag = MPIR_ERR_NONE;
     int noncontig = FALSE;
-    MPIU_CHKPMEM_DECL(2);
-    MPIU_CHKLMEM_DECL(2);
+    MPIU_CHKPMEM_DECL(1);
+    MPIU_CHKLMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
@@ -332,9 +484,6 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
 
     (*win_ptr)->shm_allocated = TRUE;
 
-    comm_size = (*win_ptr)->comm_ptr->local_size;
-    rank = (*win_ptr)->comm_ptr->rank;
-
     /* When allocating shared memory region segment, we need comm of processes
      * that are on the same node as this process (node_comm).
      * If node_comm == NULL, this process is the only one on this node, therefore
@@ -350,10 +499,6 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
     MPIU_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
                         node_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");
 
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->basic_info_table, MPIDI_Win_basic_info_t *,
-                        comm_size * sizeof(MPIDI_Win_basic_info_t),
-                        mpi_errno, "(*win_ptr)->base_info_table");
-
     /* get the sizes of the windows and window objectsof
      * all processes.  allocate temp. buffer for communication */
     MPIU_CHKLMEM_MALLOC(node_sizes, MPI_Aint *, node_size * sizeof(MPI_Aint), mpi_errno,
@@ -564,31 +709,12 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
         (*win_ptr)->base = (*win_ptr)->shm_base_addrs[node_rank];
     }
 
-    MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * comm_size * sizeof(MPI_Aint),
-                        mpi_errno, "tmp_buf");
-
-    /* get the base addresses of the windows.  Note we reuse tmp_buf from above
-     * since it's at least as large as we need it for this allgather. */
-    tmp_buf[4 * rank] = MPIU_PtrToAint((*win_ptr)->base);
-    tmp_buf[4 * rank + 1] = size;
-    tmp_buf[4 * rank + 2] = (MPI_Aint) disp_unit;
-    tmp_buf[4 * rank + 3] = (MPI_Aint) (*win_ptr)->handle;
+    *base_pp = (*win_ptr)->base;
 
-    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
-                                    tmp_buf, 4, MPI_AINT, (*win_ptr)->comm_ptr, &errflag);
-    if (mpi_errno)
+    /* gather window information among processes via shared memory region. */
+    mpi_errno = MPIDI_CH3I_Win_gather_info((*base_pp), size, disp_unit, info, comm_ptr, win_ptr);
+    if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
-    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
-
-    k = 0;
-    for (i = 0; i < comm_size; ++i) {
-        (*win_ptr)->basic_info_table[i].base_addr = MPIU_AintToPtr(tmp_buf[k++]);
-        (*win_ptr)->basic_info_table[i].size = tmp_buf[k++];
-        (*win_ptr)->basic_info_table[i].disp_unit = (int) tmp_buf[k++];
-        (*win_ptr)->basic_info_table[i].win_handle = (MPI_Win) tmp_buf[k++];
-    }
-
-    *base_pp = (*win_ptr)->base;
 
     /* Provide operation overrides for this window flavor */
     (*win_ptr)->RMAFns.Win_shared_query = MPIDI_CH3_SHM_Win_shared_query;
diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index 3f85e0e..1bf6abe 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -1125,6 +1125,7 @@ typedef struct {
     int (*allocate_shm)(MPI_Aint, int, MPID_Info *, MPID_Comm *, void *, MPID_Win **);
     int (*create_dynamic)(MPID_Info *, MPID_Comm *, MPID_Win **);
     int (*detect_shm)(MPID_Win **);
+    int (*gather_info)(void *, MPI_Aint, int, MPID_Info *, MPID_Comm *, MPID_Win **);
 } MPIDI_CH3U_Win_fns_t;
 
 extern MPIDI_CH3U_Win_fns_t MPIDI_CH3U_Win_fns;
diff --git a/src/mpid/ch3/src/ch3u_win_fns.c b/src/mpid/ch3/src/ch3u_win_fns.c
index 575d0eb..974ef66 100644
--- a/src/mpid/ch3/src/ch3u_win_fns.c
+++ b/src/mpid/ch3/src/ch3u_win_fns.c
@@ -25,6 +25,7 @@ int MPIDI_Win_fns_init(MPIDI_CH3U_Win_fns_t * win_fns)
     win_fns->allocate = MPIDI_CH3U_Win_allocate;
     win_fns->allocate_shared = MPIDI_CH3U_Win_allocate;
     win_fns->create_dynamic = MPIDI_CH3U_Win_create_dynamic;
+    win_fns->gather_info = MPIDI_CH3U_Win_gather_info;
 
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FNS_INIT);
 
@@ -111,10 +112,9 @@ int MPIDI_CH3U_Win_create(void *base, MPI_Aint size, int disp_unit, MPID_Info *
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE);
 
-    mpi_errno = MPIDI_CH3U_Win_gather_info(base, size, disp_unit, info, comm_ptr, win_ptr);
-    if (mpi_errno != MPI_SUCCESS) {
+    mpi_errno = MPIDI_CH3U_Win_fns.gather_info(base, size, disp_unit, info, comm_ptr, win_ptr);
+    if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
-    }
 
     if ((*win_ptr)->info_args.alloc_shm == TRUE && MPIDI_CH3U_Win_fns.detect_shm != NULL) {
         /* Detect if shared buffers are specified for the processes in the
@@ -146,10 +146,9 @@ int MPIDI_CH3U_Win_create_dynamic(MPID_Info * info, MPID_Comm * comm_ptr, MPID_W
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE_DYNAMIC);
 
-    mpi_errno = MPIDI_CH3U_Win_gather_info(MPI_BOTTOM, 0, 1, info, comm_ptr, win_ptr);
-    if (mpi_errno != MPI_SUCCESS) {
+    mpi_errno = MPIDI_CH3U_Win_fns.gather_info(MPI_BOTTOM, 0, 1, info, comm_ptr, win_ptr);
+    if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
-    }
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_CREATE_DYNAMIC);
@@ -267,7 +266,7 @@ int MPIDI_CH3U_Win_allocate_no_shm(MPI_Aint size, int disp_unit, MPID_Info * inf
 
     (*win_ptr)->base = *base_pp;
 
-    mpi_errno = MPIDI_CH3U_Win_gather_info(*base_pp, size, disp_unit, info, comm_ptr, win_ptr);
+    mpi_errno = MPIDI_CH3U_Win_fns.gather_info(*base_pp, size, disp_unit, info, comm_ptr, win_ptr);
     if (mpi_errno != MPI_SUCCESS) {
         MPIU_ERR_POP(mpi_errno);
     }
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index 6ef5991..964f224 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -233,7 +233,8 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
     if (mpi_errno)
         MPIU_ERR_POP(mpi_errno);
 
-    MPIU_Free((*win_ptr)->basic_info_table);
+    if ((*win_ptr)->basic_info_table != NULL)
+        MPIU_Free((*win_ptr)->basic_info_table);
     MPIU_Free((*win_ptr)->op_pool_start);
     MPIU_Free((*win_ptr)->target_pool_start);
     MPIU_Free((*win_ptr)->slots);

http://git.mpich.org/mpich.git/commitdiff/7c1a8fb119c8c903d1ecd6f45b02cf3d2688f9dc

commit 7c1a8fb119c8c903d1ecd6f45b02cf3d2688f9dc
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Tue Mar 3 02:58:52 2015 -0600

    Add a function hook to initialize window attributes in channel layer.
    
    There are some window attributes in the channel layer that
    need to be initialized during window creation. In this
    patch, we first add a win_hooks table that contains pointers
    to the channel's implementation of the function hooks. Secondly,
    we add a function hook 'win_init' to allow the channel layer to
    initialize its own attributes. The hook is called from the
    CH3 win_init function.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
index 4820e34..adc6526 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
@@ -18,6 +18,9 @@ MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_wincreate_allgather);
 
 MPIDI_SHM_Wins_list_t shm_wins_list;
 
+static int MPIDI_CH3I_Win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
+                               MPID_Info * info, MPID_Comm * comm_ptr, MPID_Win ** win_ptr);
+
 static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info * info,
                                        MPID_Comm * comm_ptr, void *base_ptr, MPID_Win ** win_ptr);
 
@@ -45,6 +48,54 @@ int MPIDI_CH3_Win_fns_init(MPIDI_CH3U_Win_fns_t * win_fns)
 }
 
 #undef FUNCNAME
+#define FUNCNAME MPIDI_CH3_Win_hooks_init
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPIDI_CH3_Win_hooks_init(MPIDI_CH3U_Win_hooks_t * win_hooks)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_WIN_HOOKS_INIT);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3_WIN_HOOKS_INIT);
+
+    if (MPIDI_CH3I_Shm_supported()) {
+        win_hooks->win_init = MPIDI_CH3I_Win_init;
+    }
+
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3_WIN_HOOKS_INIT);
+
+    return mpi_errno;
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3_Win_init
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static int MPIDI_CH3I_Win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
+                               MPID_Info * info, MPID_Comm * comm_ptr, MPID_Win ** win_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_INIT);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_WIN_INIT);
+
+    (*win_ptr)->shm_base_addr = NULL;
+    (*win_ptr)->shm_segment_len = 0;
+    (*win_ptr)->shm_segment_handle = 0;
+    (*win_ptr)->shm_mutex = NULL;
+    (*win_ptr)->shm_mutex_segment_handle = 0;
+
+  fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_WIN_INIT);
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+
+}
+
+
+#undef FUNCNAME
 #define FUNCNAME MPIDI_CH3I_SHM_Wins_match
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
diff --git a/src/mpid/ch3/channels/sock/src/ch3_win_fns.c b/src/mpid/ch3/channels/sock/src/ch3_win_fns.c
index 6524e91..92b34b3 100644
--- a/src/mpid/ch3/channels/sock/src/ch3_win_fns.c
+++ b/src/mpid/ch3/channels/sock/src/ch3_win_fns.c
@@ -25,3 +25,21 @@ int MPIDI_CH3_Win_fns_init(MPIDI_CH3U_Win_fns_t *win_fns)
 
     return mpi_errno;
 }
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3_Win_hooks_init
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+int MPIDI_CH3_Win_hooks_init(MPIDI_CH3U_Win_hooks_t *win_hooks)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_WIN_HOOKS_INIT);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3_WIN_HOOKS_INIT);
+
+    /* Sock doesn't implement any of the Window hooks */
+
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3_WIN_HOOKS_INIT);
+
+    return mpi_errno;
+}
diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index 88f5f1e..3f85e0e 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -1129,10 +1129,19 @@ typedef struct {
 
 extern MPIDI_CH3U_Win_fns_t MPIDI_CH3U_Win_fns;
 
+typedef struct {
+    int (*win_init)(MPI_Aint, int, int, int, MPID_Info *, MPID_Comm *, MPID_Win **);
+} MPIDI_CH3U_Win_hooks_t;
+
+extern MPIDI_CH3U_Win_hooks_t MPIDI_CH3U_Win_hooks;
+
 /* CH3 and Channel window functions initializers */
 int MPIDI_Win_fns_init(MPIDI_CH3U_Win_fns_t *win_fns);
 int MPIDI_CH3_Win_fns_init(MPIDI_CH3U_Win_fns_t *win_fns);
 
+/* Channel window hooks initializer */
+int MPIDI_CH3_Win_hooks_init(MPIDI_CH3U_Win_hooks_t *win_hooks);
+
 /* Default window creation functions provided by CH3 */
 int MPIDI_CH3U_Win_create(void *, MPI_Aint, int, MPID_Info *, MPID_Comm *,
                          MPID_Win **);
diff --git a/src/mpid/ch3/src/mpid_init.c b/src/mpid/ch3/src/mpid_init.c
index 6344f3b..e541f71 100644
--- a/src/mpid/ch3/src/mpid_init.c
+++ b/src/mpid/ch3/src/mpid_init.c
@@ -40,6 +40,7 @@ static int set_eager_threshold(MPID_Comm *comm_ptr, MPID_Info *info, void *state
 MPIDI_Process_t MPIDI_Process = { NULL };
 MPIDI_CH3U_SRBuf_element_t * MPIDI_CH3U_SRBuf_pool = NULL;
 MPIDI_CH3U_Win_fns_t MPIDI_CH3U_Win_fns = { NULL };
+MPIDI_CH3U_Win_hooks_t MPIDI_CH3U_Win_hooks = { NULL };
 
 
 #undef FUNCNAME
@@ -175,6 +176,7 @@ int MPID_Init(int *argc, char ***argv, int requested, int *provided,
        init function. */
     MPIDI_Win_fns_init(&MPIDI_CH3U_Win_fns);
     MPIDI_CH3_Win_fns_init(&MPIDI_CH3U_Win_fns);
+    MPIDI_CH3_Win_hooks_init(&MPIDI_CH3U_Win_hooks);
 
     /*
      * Let the channel perform any necessary initialization
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index fba1746..5e86d3a 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -394,6 +394,14 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     win_elem->win_ptr = *win_ptr;
     MPL_LL_APPEND(MPIDI_RMA_Win_list, MPIDI_RMA_Win_list_tail, win_elem);
 
+    if (MPIDI_CH3U_Win_hooks.win_init != NULL) {
+        mpi_errno =
+            MPIDI_CH3U_Win_hooks.win_init(size, disp_unit, create_flavor, model, info, comm_ptr,
+                                          win_ptr);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+    }
+
   fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_WIN_INIT);
     return mpi_errno;

http://git.mpich.org/mpich.git/commitdiff/eddd8b9145e4c5411065784edad0fa6780a38742

commit eddd8b9145e4c5411065784edad0fa6780a38742
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 18:57:54 2015 -0800

    Reduce size of shm_base_addrs[] from comm_size to node_size.
    
    Given one process, shm_base_addrs[] is used to store the base
    addresses (in the address space of this process) of SHM window
    on other processes. The original size of it is comm_size. However,
    the maximum number of SHM windows that this process can access
    is node_size instead of comm_size, which results in a waste
    of memory since most slots in the array are NULL. In this patch
    we reduce the size of shm_base_addrs[] from comm_size to node_size.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c b/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
index 730117f..863fdc2 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
@@ -35,18 +35,23 @@ int MPIDI_CH3_SHM_Win_shared_query(MPID_Win * win_ptr, int target_rank, MPI_Aint
 
         for (i = 0; i < comm_size; i++) {
             if (win_ptr->basic_info_table[i].size > 0) {
+                int local_i = win_ptr->comm_ptr->intranode_table[i];
+                MPIU_Assert(local_i >= 0 && local_i < win_ptr->comm_ptr->node_comm->local_size);
                 *size = win_ptr->basic_info_table[i].size;
                 *disp_unit = win_ptr->basic_info_table[i].disp_unit;
-                *((void **) baseptr) = win_ptr->shm_base_addrs[i];
+                *((void **) baseptr) = win_ptr->shm_base_addrs[local_i];
                 break;
             }
         }
 
     }
     else {
+        int local_target_rank = win_ptr->comm_ptr->intranode_table[target_rank];
+        MPIU_Assert(local_target_rank >= 0 &&
+                    local_target_rank < win_ptr->comm_ptr->node_comm->local_size);
         *size = win_ptr->basic_info_table[target_rank].size;
         *disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
-        *((void **) baseptr) = win_ptr->shm_base_addrs[target_rank];
+        *((void **) baseptr) = win_ptr->shm_base_addrs[local_target_rank];
     }
 
   fn_exit:
diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
index aaa8f6d..4820e34 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
@@ -194,7 +194,7 @@ static int MPIDI_CH3I_Win_detect_shm(MPID_Win ** win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Win *shm_win_ptr = NULL;
-    int i, comm_size, node_size;
+    int i, node_size;
     MPI_Aint *base_shm_offs;
 
     MPIU_CHKPMEM_DECL(1);
@@ -207,7 +207,6 @@ static int MPIDI_CH3I_Win_detect_shm(MPID_Win ** win_ptr)
     }
 
     node_size = (*win_ptr)->comm_ptr->node_comm->local_size;
-    comm_size = (*win_ptr)->comm_ptr->local_size;
 
     MPIU_CHKLMEM_MALLOC(base_shm_offs, MPI_Aint *, node_size * sizeof(MPI_Aint),
                         mpi_errno, "base_shm_offs");
@@ -224,22 +223,13 @@ static int MPIDI_CH3I_Win_detect_shm(MPID_Win ** win_ptr)
 
     (*win_ptr)->shm_allocated = TRUE;
     MPIU_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
-                        comm_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");
+                        node_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");
 
     /* Compute the base address of shm buffer on each process.
      * shm_base_addrs[i] = my_shm_base_addr + off[i] */
-    for (i = 0; i < comm_size; i++) {
-        int i_node_rank;
-        i_node_rank = (*win_ptr)->comm_ptr->intranode_table[i];
-        if (i_node_rank >= 0) {
-            MPIU_Assert(i_node_rank < node_size);
-
-            (*win_ptr)->shm_base_addrs[i] =
-                (void *) ((MPI_Aint) shm_win_ptr->shm_base_addr + base_shm_offs[i_node_rank]);
-        }
-        else {
-            (*win_ptr)->shm_base_addrs[i] = NULL;
-        }
+    for (i = 0; i < node_size; i++) {
+        (*win_ptr)->shm_base_addrs[i] =
+            (void *) ((MPI_Aint) shm_win_ptr->shm_base_addr + base_shm_offs[i]);
     }
 
     /* TODO: should we use the same mutex or create a new one ?
@@ -271,12 +261,11 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
     int node_size, node_rank;
     MPID_Comm *node_comm_ptr;
     MPI_Aint *node_sizes;
-    void **node_shm_base_addrs;
     MPI_Aint *tmp_buf;
     mpir_errflag_t errflag = MPIR_ERR_NONE;
     int noncontig = FALSE;
     MPIU_CHKPMEM_DECL(2);
-    MPIU_CHKLMEM_DECL(3);
+    MPIU_CHKLMEM_DECL(2);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
@@ -308,7 +297,7 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
     /* allocate memory for the base addresses, disp_units, and
      * completion counters of all processes */
     MPIU_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
-                        comm_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");
+                        node_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");
 
     MPIU_CHKPMEM_MALLOC((*win_ptr)->basic_info_table, MPIDI_Win_basic_info_t *,
                         comm_size * sizeof(MPIDI_Win_basic_info_t),
@@ -494,22 +483,10 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
         {
             char *cur_base;
             int cur_rank;
-            if ((*win_ptr)->create_flavor != MPI_WIN_FLAVOR_SHARED) {
-                /* If create flavor is not MPI_WIN_FLAVOR_SHARED, all processes on this
-                 * window may not be on the same node. Because we only need to calculate
-                 * local processes' shm_base_addrs using local processes's sizes,
-                 * we allocate a temporary array to place results and copy results
-                 * back to shm_base_addrs on the window at last. */
-                MPIU_CHKLMEM_MALLOC(node_shm_base_addrs, void **, node_size * sizeof(void *),
-                                    mpi_errno, "node_shm_base_addrs");
-            }
-            else {
-                node_shm_base_addrs = (*win_ptr)->shm_base_addrs;
-            }
 
             cur_base = (*win_ptr)->shm_base_addr;
             cur_rank = 0;
-            node_shm_base_addrs[0] = (*win_ptr)->shm_base_addr;
+            ((*win_ptr)->shm_base_addrs)[0] = (*win_ptr)->shm_base_addr;
             for (i = 1; i < node_size; ++i) {
                 if (node_sizes[i]) {
                     /* For the base addresses, we track the previous
@@ -518,36 +495,22 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
                      * previous process because rank "i-1" might not have
                      * allocated any memory. */
                     if (noncontig) {
-                        node_shm_base_addrs[i] =
+                        ((*win_ptr)->shm_base_addrs)[i] =
                             cur_base + MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[cur_rank]);
                     }
                     else {
-                        node_shm_base_addrs[i] = cur_base + node_sizes[cur_rank];
+                        ((*win_ptr)->shm_base_addrs)[i] = cur_base + node_sizes[cur_rank];
                     }
-                    cur_base = node_shm_base_addrs[i];
+                    cur_base = ((*win_ptr)->shm_base_addrs)[i];
                     cur_rank = i;
                 }
                 else {
-                    node_shm_base_addrs[i] = NULL;
-                }
-            }
-
-            if ((*win_ptr)->create_flavor != MPI_WIN_FLAVOR_SHARED) {
-                /* if MPI_WIN_FLAVOR_SHARED is not set, copy from node_shm_base_addrs to
-                 * (*win_ptr)->shm_base_addrs */
-                for (i = 0; i < comm_size; i++) {
-                    if ((*win_ptr)->comm_ptr->intranode_table[i] >= 0) {
-                        MPIU_Assert((*win_ptr)->comm_ptr->intranode_table[i] < node_size);
-                        (*win_ptr)->shm_base_addrs[i] =
-                            node_shm_base_addrs[(*win_ptr)->comm_ptr->intranode_table[i]];
-                    }
-                    else
-                        (*win_ptr)->shm_base_addrs[i] = NULL;
+                    ((*win_ptr)->shm_base_addrs)[i] = NULL;
                 }
             }
         }
 
-        (*win_ptr)->base = (*win_ptr)->shm_base_addrs[rank];
+        (*win_ptr)->base = (*win_ptr)->shm_base_addrs[node_rank];
     }
 
     MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * comm_size * sizeof(MPI_Aint),
diff --git a/src/mpid/ch3/include/mpid_rma_shm.h b/src/mpid/ch3/include/mpid_rma_shm.h
index 10e7007..4355cc7 100644
--- a/src/mpid/ch3/include/mpid_rma_shm.h
+++ b/src/mpid/ch3/include/mpid_rma_shm.h
@@ -260,7 +260,9 @@ static inline int MPIDI_CH3I_Shm_put_op(const void *origin_addr, int origin_coun
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_PUT_OP);
 
     if (win_ptr->shm_allocated == TRUE) {
-        base = win_ptr->shm_base_addrs[target_rank];
+        int local_target_rank = win_ptr->comm_ptr->intranode_table[target_rank];
+        MPIU_Assert(local_target_rank >= 0);
+        base = win_ptr->shm_base_addrs[local_target_rank];
         disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
     }
     else {
@@ -304,8 +306,10 @@ static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_coun
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_ACC_OP);
 
     if (win_ptr->shm_allocated == TRUE) {
+        int local_target_rank = win_ptr->comm_ptr->intranode_table[target_rank];
+        MPIU_Assert(local_target_rank >= 0);
         shm_op = 1;
-        base = win_ptr->shm_base_addrs[target_rank];
+        base = win_ptr->shm_base_addrs[local_target_rank];
         disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
     }
     else {
@@ -459,7 +463,9 @@ static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_GET_ACC_OP);
 
     if (win_ptr->shm_allocated == TRUE) {
-        base = win_ptr->shm_base_addrs[target_rank];
+        int local_target_rank = win_ptr->comm_ptr->intranode_table[target_rank];
+        MPIU_Assert(local_target_rank >= 0);
+        base = win_ptr->shm_base_addrs[local_target_rank];
         disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
         shm_locked = 1;
@@ -624,7 +630,9 @@ static inline int MPIDI_CH3I_Shm_get_op(void *origin_addr, int origin_count,
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_GET_OP);
 
     if (win_ptr->shm_allocated == TRUE) {
-        base = win_ptr->shm_base_addrs[target_rank];
+        int local_target_rank = win_ptr->comm_ptr->intranode_table[target_rank];
+        MPIU_Assert(local_target_rank >= 0);
+        base = win_ptr->shm_base_addrs[local_target_rank];
         disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
     }
     else {
@@ -666,7 +674,9 @@ static inline int MPIDI_CH3I_Shm_cas_op(const void *origin_addr, const void *com
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_CAS_OP);
 
     if (win_ptr->shm_allocated == TRUE) {
-        base = win_ptr->shm_base_addrs[target_rank];
+        int local_target_rank = win_ptr->comm_ptr->intranode_table[target_rank];
+        MPIU_Assert(local_target_rank >= 0);
+        base = win_ptr->shm_base_addrs[local_target_rank];
         disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
 
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
@@ -723,7 +733,9 @@ static inline int MPIDI_CH3I_Shm_fop_op(const void *origin_addr, void *result_ad
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_FOP_OP);
 
     if (win_ptr->shm_allocated == TRUE) {
-        base = win_ptr->shm_base_addrs[target_rank];
+        int local_target_rank = win_ptr->comm_ptr->intranode_table[target_rank];
+        MPIU_Assert(local_target_rank >= 0);
+        base = win_ptr->shm_base_addrs[local_target_rank];
         disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
 
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index 2e54aeb..fba1746 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -305,6 +305,7 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->comm_ptr = win_comm_ptr;
 
     (*win_ptr)->at_completion_counter = 0;
+    (*win_ptr)->shm_base_addrs = NULL;
     /* (*win_ptr)->basic_info_table[] is set by caller; */
     (*win_ptr)->current_lock_type = MPID_LOCK_NONE;
     (*win_ptr)->shared_lock_ref_cnt = 0;

http://git.mpich.org/mpich.git/commitdiff/9404e953b68ede106f2253e91301226adcbb5aa6

commit 9404e953b68ede106f2253e91301226adcbb5aa6
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 18:51:06 2015 -0800

    Store window basic attributes into a struct on window.
    
    In this patch, we gather the basic window attributes of other
    processes (base_addr, size, disp_unit, win_handle) into an array
    of structs called "basic_info_table". By doing this, we can use
    one contiguous memory region to store them.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c b/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
index 8a640d7..730117f 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
@@ -34,9 +34,9 @@ int MPIDI_CH3_SHM_Win_shared_query(MPID_Win * win_ptr, int target_rank, MPI_Aint
         *((void **) baseptr) = NULL;
 
         for (i = 0; i < comm_size; i++) {
-            if (win_ptr->sizes[i] > 0) {
-                *size = win_ptr->sizes[i];
-                *disp_unit = win_ptr->disp_units[i];
+            if (win_ptr->basic_info_table[i].size > 0) {
+                *size = win_ptr->basic_info_table[i].size;
+                *disp_unit = win_ptr->basic_info_table[i].disp_unit;
                 *((void **) baseptr) = win_ptr->shm_base_addrs[i];
                 break;
             }
@@ -44,8 +44,8 @@ int MPIDI_CH3_SHM_Win_shared_query(MPID_Win * win_ptr, int target_rank, MPI_Aint
 
     }
     else {
-        *size = win_ptr->sizes[target_rank];
-        *disp_unit = win_ptr->disp_units[target_rank];
+        *size = win_ptr->basic_info_table[target_rank].size;
+        *disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
         *((void **) baseptr) = win_ptr->shm_base_addrs[target_rank];
     }
 
diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
index b6cfee9..aaa8f6d 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
@@ -154,7 +154,8 @@ static int MPIDI_CH3I_SHM_Wins_match(MPID_Win ** win_ptr, MPID_Win ** matched_wi
                 MPIU_Assert(i_node_rank < node_size);
 
                 if (base_shm_offs[i_node_rank] < 0 ||
-                    base_shm_offs[i_node_rank] + (*win_ptr)->sizes[i] > shm_win->shm_segment_len) {
+                    base_shm_offs[i_node_rank] + (*win_ptr)->basic_info_table[i].size >
+                    shm_win->shm_segment_len) {
                     base_diff = 1;
                     break;
                 }
@@ -274,7 +275,7 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
     MPI_Aint *tmp_buf;
     mpir_errflag_t errflag = MPIR_ERR_NONE;
     int noncontig = FALSE;
-    MPIU_CHKPMEM_DECL(6);
+    MPIU_CHKPMEM_DECL(2);
     MPIU_CHKLMEM_DECL(3);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
 
@@ -306,20 +307,12 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
     MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
     /* allocate memory for the base addresses, disp_units, and
      * completion counters of all processes */
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->base_addrs, void **,
-                        comm_size * sizeof(void *), mpi_errno, "(*win_ptr)->base_addrs");
-
     MPIU_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
                         comm_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");
 
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->sizes, MPI_Aint *, comm_size * sizeof(MPI_Aint),
-                        mpi_errno, "(*win_ptr)->sizes");
-
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->disp_units, int *, comm_size * sizeof(int),
-                        mpi_errno, "(*win_ptr)->disp_units");
-
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->all_win_handles, MPI_Win *,
-                        comm_size * sizeof(MPI_Win), mpi_errno, "(*win_ptr)->all_win_handles");
+    MPIU_CHKPMEM_MALLOC((*win_ptr)->basic_info_table, MPIDI_Win_basic_info_t *,
+                        comm_size * sizeof(MPIDI_Win_basic_info_t),
+                        mpi_errno, "(*win_ptr)->base_info_table");
 
     /* get the sizes of the windows and window objectsof
      * all processes.  allocate temp. buffer for communication */
@@ -575,10 +568,10 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
 
     k = 0;
     for (i = 0; i < comm_size; ++i) {
-        (*win_ptr)->base_addrs[i] = MPIU_AintToPtr(tmp_buf[k++]);
-        (*win_ptr)->sizes[i] = tmp_buf[k++];
-        (*win_ptr)->disp_units[i] = (int) tmp_buf[k++];
-        (*win_ptr)->all_win_handles[i] = (MPI_Win) tmp_buf[k++];
+        (*win_ptr)->basic_info_table[i].base_addr = MPIU_AintToPtr(tmp_buf[k++]);
+        (*win_ptr)->basic_info_table[i].size = tmp_buf[k++];
+        (*win_ptr)->basic_info_table[i].disp_unit = (int) tmp_buf[k++];
+        (*win_ptr)->basic_info_table[i].win_handle = (MPI_Win) tmp_buf[k++];
     }
 
     *base_pp = (*win_ptr)->base;
diff --git a/src/mpid/ch3/include/mpid_rma_shm.h b/src/mpid/ch3/include/mpid_rma_shm.h
index 7be3520..10e7007 100644
--- a/src/mpid/ch3/include/mpid_rma_shm.h
+++ b/src/mpid/ch3/include/mpid_rma_shm.h
@@ -261,7 +261,7 @@ static inline int MPIDI_CH3I_Shm_put_op(const void *origin_addr, int origin_coun
 
     if (win_ptr->shm_allocated == TRUE) {
         base = win_ptr->shm_base_addrs[target_rank];
-        disp_unit = win_ptr->disp_units[target_rank];
+        disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
     }
     else {
         base = win_ptr->base;
@@ -306,7 +306,7 @@ static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_coun
     if (win_ptr->shm_allocated == TRUE) {
         shm_op = 1;
         base = win_ptr->shm_base_addrs[target_rank];
-        disp_unit = win_ptr->disp_units[target_rank];
+        disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
     }
     else {
         base = win_ptr->base;
@@ -460,7 +460,7 @@ static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_
 
     if (win_ptr->shm_allocated == TRUE) {
         base = win_ptr->shm_base_addrs[target_rank];
-        disp_unit = win_ptr->disp_units[target_rank];
+        disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
         shm_locked = 1;
     }
@@ -625,7 +625,7 @@ static inline int MPIDI_CH3I_Shm_get_op(void *origin_addr, int origin_count,
 
     if (win_ptr->shm_allocated == TRUE) {
         base = win_ptr->shm_base_addrs[target_rank];
-        disp_unit = win_ptr->disp_units[target_rank];
+        disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
     }
     else {
         base = win_ptr->base;
@@ -667,7 +667,7 @@ static inline int MPIDI_CH3I_Shm_cas_op(const void *origin_addr, const void *com
 
     if (win_ptr->shm_allocated == TRUE) {
         base = win_ptr->shm_base_addrs[target_rank];
-        disp_unit = win_ptr->disp_units[target_rank];
+        disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
 
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
         shm_locked = 1;
@@ -724,7 +724,7 @@ static inline int MPIDI_CH3I_Shm_fop_op(const void *origin_addr, void *result_ad
 
     if (win_ptr->shm_allocated == TRUE) {
         base = win_ptr->shm_base_addrs[target_rank];
-        disp_unit = win_ptr->disp_units[target_rank];
+        disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
 
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
         shm_locked = 1;
diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index 9c2618d..bd5e274 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -291,6 +291,13 @@ struct MPIDI_Win_info_args {
 
 struct MPIDI_RMA_op;            /* forward decl from mpidrma.h */
 
+typedef struct MPIDI_Win_basic_info {
+    void *base_addr;
+    MPI_Aint size;
+    int disp_unit;
+    MPI_Win win_handle;
+} MPIDI_Win_basic_info_t;
+
 typedef struct MPIDI_RMA_Pkt_orderings {
     int flush_remote; /* ordered FLUSH, for remote completion */
     /* FIXME: in future we should also add local completin
@@ -302,21 +309,16 @@ extern MPIDI_RMA_Pkt_orderings_t *MPIDI_RMA_Pkt_orderings;
 #define MPIDI_DEV_WIN_DECL                                               \
     volatile int at_completion_counter;  /* completion counter for operations \
                                  targeting this window */                \
-    void **base_addrs;     /* array of base addresses of the windows of  \
-                              all processes */                           \
     void **shm_base_addrs; /* shared memory windows -- array of base     \
                               addresses of the windows of all processes  \
                               in this process's address space */         \
-    int *disp_units;      /* array of displacement units of all windows */\
-    MPI_Win *all_win_handles;    /* array of handles to the window objects\
-                                          of all processes */            \
+    MPIDI_Win_basic_info_t *basic_info_table;                            \
     volatile int current_lock_type;   /* current lock type on this window (as target)   \
                               * (none, shared, exclusive) */             \
     volatile int shared_lock_ref_cnt;                                    \
     struct MPIDI_RMA_Lock_entry volatile *lock_queue;  /* list of unsatisfied locks */  \
     struct MPIDI_RMA_Lock_entry volatile *lock_queue_tail; /* tail of unstaisfied locks. */ \
                                                                          \
-    MPI_Aint *sizes;      /* array of sizes of all windows */            \
     struct MPIDI_Win_info_args info_args;                                \
     int shm_allocated; /* flag: TRUE iff this window has a shared memory \
                           region associated with it */                   \
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index f7b3c4d..e689751 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -30,7 +30,7 @@ static inline int send_lock_msg(int dest, int lock_type, MPID_Win * win_ptr)
     MPIDI_Comm_get_vc_set_active(win_ptr->comm_ptr, dest, &vc);
 
     MPIDI_Pkt_init(lock_pkt, MPIDI_CH3_PKT_LOCK);
-    lock_pkt->target_win_handle = win_ptr->all_win_handles[dest];
+    lock_pkt->target_win_handle = win_ptr->basic_info_table[dest].win_handle;
     lock_pkt->source_win_handle = win_ptr->handle;
     lock_pkt->request_handle = MPI_REQUEST_NULL;
     lock_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
@@ -80,7 +80,7 @@ static inline int send_unlock_msg(int dest, MPID_Win * win_ptr, MPIDI_CH3_Pkt_fl
      * reply. Then do all the RMA ops. */
 
     MPIDI_Pkt_init(unlock_pkt, MPIDI_CH3_PKT_UNLOCK);
-    unlock_pkt->target_win_handle = win_ptr->all_win_handles[dest];
+    unlock_pkt->target_win_handle = win_ptr->basic_info_table[dest].win_handle;
     unlock_pkt->source_win_handle = win_ptr->handle;
     unlock_pkt->flags = flags;
 
@@ -250,7 +250,7 @@ static inline int send_decr_at_cnt_msg(int dst, MPID_Win * win_ptr)
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_SEND_DECR_AT_CNT_MSG);
 
     MPIDI_Pkt_init(decr_at_cnt_pkt, MPIDI_CH3_PKT_DECR_AT_COUNTER);
-    decr_at_cnt_pkt->target_win_handle = win_ptr->all_win_handles[dst];
+    decr_at_cnt_pkt->target_win_handle = win_ptr->basic_info_table[dst].win_handle;
 
     MPIDI_Comm_get_vc_set_active(win_ptr->comm_ptr, dst, &vc);
 
@@ -292,7 +292,7 @@ static inline int send_flush_msg(int dest, MPID_Win * win_ptr)
     MPIDI_Comm_get_vc_set_active(win_ptr->comm_ptr, dest, &vc);
 
     MPIDI_Pkt_init(flush_pkt, MPIDI_CH3_PKT_FLUSH);
-    flush_pkt->target_win_handle = win_ptr->all_win_handles[dest];
+    flush_pkt->target_win_handle = win_ptr->basic_info_table[dest].win_handle;
     flush_pkt->source_win_handle = win_ptr->handle;
 
     MPIU_THREAD_CS_ENTER(CH3COMM, vc);
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 962553b..4e96661 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -188,12 +188,12 @@ int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
             MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
         }
 
-        put_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
-            win_ptr->disp_units[target_rank] * target_disp;
+        put_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
+            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
         put_pkt->count = target_count;
         put_pkt->datatype = target_datatype;
         put_pkt->info.dataloop_size = 0;
-        put_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
+        put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
         put_pkt->source_win_handle = win_ptr->handle;
         put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
         if (use_immed_pkt) {
@@ -371,12 +371,12 @@ int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
 
         get_pkt = &(new_ptr->pkt.get);
         MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
-        get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
-            win_ptr->disp_units[target_rank] * target_disp;
+        get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
+            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
         get_pkt->count = target_count;
         get_pkt->datatype = target_datatype;
         get_pkt->info.dataloop_size = 0;
-        get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
+        get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
         get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
         if (use_immed_resp_pkt)
             get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
@@ -558,13 +558,13 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
             MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
         }
 
-        accum_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
-            win_ptr->disp_units[target_rank] * target_disp;
+        accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
+            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
         accum_pkt->count = target_count;
         accum_pkt->datatype = target_datatype;
         accum_pkt->info.dataloop_size = 0;
         accum_pkt->op = op;
-        accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
+        accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
         accum_pkt->source_win_handle = win_ptr->handle;
         accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
         if (use_immed_pkt) {
@@ -749,12 +749,12 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
 
             get_pkt = &(new_ptr->pkt.get);
             MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
-            get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
-                win_ptr->disp_units[target_rank] * target_disp;
+            get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
+                win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
             get_pkt->count = target_count;
             get_pkt->datatype = target_datatype;
             get_pkt->info.dataloop_size = 0;
-            get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
+            get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
             get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
             if (use_immed_resp_pkt == TRUE)
                 get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
@@ -838,13 +838,13 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                 MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
             }
 
-            get_accum_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
-                win_ptr->disp_units[target_rank] * target_disp;
+            get_accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
+                win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
             get_accum_pkt->count = target_count;
             get_accum_pkt->datatype = target_datatype;
             get_accum_pkt->info.dataloop_size = 0;
             get_accum_pkt->op = op;
-            get_accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
+            get_accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
             get_accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
             if (use_immed_pkt) {
                 void *src = (void *) origin_addr, *dest = (void *) (get_accum_pkt->info.data);
@@ -1095,10 +1095,10 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
 
         cas_pkt = &(new_ptr->pkt.cas);
         MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS_IMMED);
-        cas_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
-            win_ptr->disp_units[target_rank] * target_disp;
+        cas_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
+            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
         cas_pkt->datatype = datatype;
-        cas_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
+        cas_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
         cas_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
 
         /* REQUIRE: All datatype arguments must be of the same, builtin
@@ -1251,12 +1251,12 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
 
             get_pkt = &(new_ptr->pkt.get);
             MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
-            get_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
-                win_ptr->disp_units[target_rank] * target_disp;
+            get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
+                win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
             get_pkt->count = 1;
             get_pkt->datatype = datatype;
             get_pkt->info.dataloop_size = 0;
-            get_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
+            get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
             get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
             if (use_immed_resp_pkt == TRUE)
                 get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
@@ -1298,11 +1298,11 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             else {
                 MPIDI_Pkt_init(fop_pkt, MPIDI_CH3_PKT_FOP);
             }
-            fop_pkt->addr = (char *) win_ptr->base_addrs[target_rank] +
-                win_ptr->disp_units[target_rank] * target_disp;
+            fop_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
+                win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
             fop_pkt->datatype = datatype;
             fop_pkt->op = op;
-            fop_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
+            fop_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
             fop_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
             if (use_immed_pkt) {
                 void *src = (void *) origin_addr, *dest = (void *) (fop_pkt->info.data);
diff --git a/src/mpid/ch3/src/ch3u_win_fns.c b/src/mpid/ch3/src/ch3u_win_fns.c
index 2f62809..575d0eb 100644
--- a/src/mpid/ch3/src/ch3u_win_fns.c
+++ b/src/mpid/ch3/src/ch3u_win_fns.c
@@ -42,7 +42,7 @@ int MPIDI_CH3U_Win_gather_info(void *base, MPI_Aint size, int disp_unit,
     int mpi_errno = MPI_SUCCESS, i, k, comm_size, rank;
     MPI_Aint *tmp_buf;
     mpir_errflag_t errflag = MPIR_ERR_NONE;
-    MPIU_CHKPMEM_DECL(5);
+    MPIU_CHKPMEM_DECL(1);
     MPIU_CHKLMEM_DECL(1);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);
 
@@ -54,17 +54,9 @@ int MPIDI_CH3U_Win_gather_info(void *base, MPI_Aint size, int disp_unit,
     MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
     /* allocate memory for the base addresses, disp_units, and
      * completion counters of all processes */
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->base_addrs, void **,
-                        comm_size * sizeof(void *), mpi_errno, "(*win_ptr)->base_addrs");
-
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->sizes, MPI_Aint *, comm_size * sizeof(MPI_Aint),
-                        mpi_errno, "(*win_ptr)->sizes");
-
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->disp_units, int *, comm_size * sizeof(int),
-                        mpi_errno, "(*win_ptr)->disp_units");
-
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->all_win_handles, MPI_Win *,
-                        comm_size * sizeof(MPI_Win), mpi_errno, "(*win_ptr)->all_win_handles");
+    MPIU_CHKPMEM_MALLOC((*win_ptr)->basic_info_table, MPIDI_Win_basic_info_t *,
+                        comm_size * sizeof(MPIDI_Win_basic_info_t),
+                        mpi_errno, "(*win_ptr)->basic_info_table");
 
     /* get the addresses of the windows, window objects, and completion
      * counters of all processes.  allocate temp. buffer for communication */
@@ -89,10 +81,10 @@ int MPIDI_CH3U_Win_gather_info(void *base, MPI_Aint size, int disp_unit,
 
     k = 0;
     for (i = 0; i < comm_size; i++) {
-        (*win_ptr)->base_addrs[i] = MPIU_AintToPtr(tmp_buf[k++]);
-        (*win_ptr)->sizes[i] = tmp_buf[k++];
-        (*win_ptr)->disp_units[i] = (int) tmp_buf[k++];
-        (*win_ptr)->all_win_handles[i] = (MPI_Win) tmp_buf[k++];
+        (*win_ptr)->basic_info_table[i].base_addr = MPIU_AintToPtr(tmp_buf[k++]);
+        (*win_ptr)->basic_info_table[i].size = tmp_buf[k++];
+        (*win_ptr)->basic_info_table[i].disp_unit = (int) tmp_buf[k++];
+        (*win_ptr)->basic_info_table[i].win_handle = (MPI_Win) tmp_buf[k++];
     }
 
   fn_exit:
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index 28db10d..2e54aeb 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -305,10 +305,7 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->comm_ptr = win_comm_ptr;
 
     (*win_ptr)->at_completion_counter = 0;
-    /* (*win_ptr)->base_addrs[] is set by caller; */
-    /* (*win_ptr)->sizes[] is set by caller; */
-    /* (*win_ptr)->disp_units[] is set by caller; */
-    /* (*win_ptr)->all_win_handles[] is set by caller; */
+    /* (*win_ptr)->basic_info_table[] is set by caller; */
     (*win_ptr)->current_lock_type = MPID_LOCK_NONE;
     (*win_ptr)->shared_lock_ref_cnt = 0;
     (*win_ptr)->lock_queue = NULL;
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index 3828990..6ef5991 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -233,10 +233,7 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
     if (mpi_errno)
         MPIU_ERR_POP(mpi_errno);
 
-    MPIU_Free((*win_ptr)->base_addrs);
-    MPIU_Free((*win_ptr)->sizes);
-    MPIU_Free((*win_ptr)->disp_units);
-    MPIU_Free((*win_ptr)->all_win_handles);
+    MPIU_Free((*win_ptr)->basic_info_table);
     MPIU_Free((*win_ptr)->op_pool_start);
     MPIU_Free((*win_ptr)->target_pool_start);
     MPIU_Free((*win_ptr)->slots);

http://git.mpich.org/mpich.git/commitdiff/131e06ef98bee8437ad47570381ac3e5f5205f73

commit 131e06ef98bee8437ad47570381ac3e5f5205f73
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 18:45:02 2015 -0800

    Change name of MPIDI_CH3U_Win_create_gather to MPIDI_CH3U_Win_gather_info.
    
    Function MPIDI_CH3U_Win_create_gather exchanges the window
    information among processes. It does not create a new window.
    Here we change the function name to a more suitable one.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index f9cb86e..88f5f1e 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -1145,7 +1145,7 @@ int MPIDI_CH3U_Win_create_dynamic(MPID_Info *info, MPID_Comm *comm, MPID_Win **w
 
 /* MPI RMA Utility functions */
 
-int MPIDI_CH3U_Win_create_gather(void *, MPI_Aint, int, MPID_Info *, MPID_Comm *,
+int MPIDI_CH3U_Win_gather_info(void *, MPI_Aint, int, MPID_Info *, MPID_Comm *,
                                  MPID_Win **);
 
 
diff --git a/src/mpid/ch3/src/ch3u_win_fns.c b/src/mpid/ch3/src/ch3u_win_fns.c
index a2aa76a..2f62809 100644
--- a/src/mpid/ch3/src/ch3u_win_fns.c
+++ b/src/mpid/ch3/src/ch3u_win_fns.c
@@ -33,20 +33,20 @@ int MPIDI_Win_fns_init(MPIDI_CH3U_Win_fns_t * win_fns)
 
 
 #undef FUNCNAME
-#define FUNCNAME MPIDI_CH3U_Win_create_gather
+#define FUNCNAME MPIDI_CH3U_Win_gather_info
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3U_Win_create_gather(void *base, MPI_Aint size, int disp_unit,
-                                 MPID_Info * info, MPID_Comm * comm_ptr, MPID_Win ** win_ptr)
+int MPIDI_CH3U_Win_gather_info(void *base, MPI_Aint size, int disp_unit,
+                               MPID_Info * info, MPID_Comm * comm_ptr, MPID_Win ** win_ptr)
 {
     int mpi_errno = MPI_SUCCESS, i, k, comm_size, rank;
     MPI_Aint *tmp_buf;
     mpir_errflag_t errflag = MPIR_ERR_NONE;
     MPIU_CHKPMEM_DECL(5);
     MPIU_CHKLMEM_DECL(1);
-    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_CREATE_GATHER);
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);
 
-    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE_GATHER);
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);
 
     comm_size = (*win_ptr)->comm_ptr->local_size;
     rank = (*win_ptr)->comm_ptr->rank;
@@ -97,7 +97,7 @@ int MPIDI_CH3U_Win_create_gather(void *base, MPI_Aint size, int disp_unit,
 
   fn_exit:
     MPIU_CHKLMEM_FREEALL();
-    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_CREATE_GATHER);
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
   fn_fail:
@@ -119,7 +119,7 @@ int MPIDI_CH3U_Win_create(void *base, MPI_Aint size, int disp_unit, MPID_Info *
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE);
 
-    mpi_errno = MPIDI_CH3U_Win_create_gather(base, size, disp_unit, info, comm_ptr, win_ptr);
+    mpi_errno = MPIDI_CH3U_Win_gather_info(base, size, disp_unit, info, comm_ptr, win_ptr);
     if (mpi_errno != MPI_SUCCESS) {
         MPIU_ERR_POP(mpi_errno);
     }
@@ -154,7 +154,7 @@ int MPIDI_CH3U_Win_create_dynamic(MPID_Info * info, MPID_Comm * comm_ptr, MPID_W
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE_DYNAMIC);
 
-    mpi_errno = MPIDI_CH3U_Win_create_gather(MPI_BOTTOM, 0, 1, info, comm_ptr, win_ptr);
+    mpi_errno = MPIDI_CH3U_Win_gather_info(MPI_BOTTOM, 0, 1, info, comm_ptr, win_ptr);
     if (mpi_errno != MPI_SUCCESS) {
         MPIU_ERR_POP(mpi_errno);
     }
@@ -275,7 +275,7 @@ int MPIDI_CH3U_Win_allocate_no_shm(MPI_Aint size, int disp_unit, MPID_Info * inf
 
     (*win_ptr)->base = *base_pp;
 
-    mpi_errno = MPIDI_CH3U_Win_create_gather(*base_pp, size, disp_unit, info, comm_ptr, win_ptr);
+    mpi_errno = MPIDI_CH3U_Win_gather_info(*base_pp, size, disp_unit, info, comm_ptr, win_ptr);
     if (mpi_errno != MPI_SUCCESS) {
         MPIU_ERR_POP(mpi_errno);
     }

http://git.mpich.org/mpich.git/commitdiff/03d4c77b2148769436b8ce0b682b9b7a662baeef

commit 03d4c77b2148769436b8ce0b682b9b7a662baeef
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 14:01:44 2015 -0800

    Add CH3 APIs and macros to allow channel to implement Alloc_mem/Free_mem.
    
    Originally, MPIDI_Alloc_mem(size, info) and MPIDI_Free_mem(base_ptr)
    in the CH3 layer were implemented by calling MPIU_Malloc(size) and
    MPIU_Free(base_ptr) internally. This makes it impossible to provide
    a hardware-specific implementation of Alloc_mem and Free_mem,
    which is necessary when registering memory for RDMA operations.
    
    This patch defines new APIs, MPIDI_CH3I_Alloc_mem(size, info)
    and MPIDI_CH3I_Free_mem(base_ptr), to allow channels to implement
    their own memory allocators. If the channel does not have its own
    implementation, MPICH will fall back to the default implementation
    in the CH3 layer, which uses MPIU_Malloc and MPIU_Free.
    
    Thanks to Steffen Christgau <christgau at cs.uni-potsdam.de> for
    this contribution.
    
    Signed-off-by: Pavan Balaji <balaji at anl.gov>

diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h
index 6b02880..f9cb86e 100644
--- a/src/mpid/ch3/include/mpidimpl.h
+++ b/src/mpid/ch3/include/mpidimpl.h
@@ -1226,6 +1226,19 @@ int MPIDI_Win_sync(MPID_Win *win);
 void *MPIDI_Alloc_mem(size_t size, MPID_Info *info_ptr);
 int MPIDI_Free_mem(void *ptr);
 
+#ifdef MPIDI_CH3I_HAS_ALLOC_MEM
+void* MPIDI_CH3I_Alloc_mem(size_t size, MPID_Info *info_ptr);
+/* fallback to MPIU_Malloc if channel does not have its own RMA memory allocator */
+#else
+#define MPIDI_CH3I_Alloc_mem(size, info_ptr)    MPIU_Malloc(size)
+#endif
+
+#ifdef MPIDI_CH3I_HAS_FREE_MEM
+int MPIDI_CH3I_Free_mem(void *ptr);
+#else
+#define MPIDI_CH3I_Free_mem(ptr)    MPIU_Free(ptr);
+#endif
+
 /* Pvars */
 void MPIDI_CH3_RMA_Init_sync_pvars(void);
 void MPIDI_CH3_RMA_Init_pkthandler_pvars(void);
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index d55b7a9..3828990 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -304,7 +304,7 @@ void *MPIDI_Alloc_mem(size_t size, MPID_Info * info_ptr)
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_ALLOC_MEM);
 
-    ap = MPIU_Malloc(size);
+    ap = MPIDI_CH3I_Alloc_mem(size, info_ptr);
 
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_ALLOC_MEM);
     return ap;
@@ -322,7 +322,7 @@ int MPIDI_Free_mem(void *ptr)
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_FREE_MEM);
 
-    MPIU_Free(ptr);
+    MPIDI_CH3I_Free_mem(ptr);
 
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_FREE_MEM);
     return mpi_errno;

http://git.mpich.org/mpich.git/commitdiff/ee446c5c2972d7886ead5ac6e746c5d0a3a6f22d

commit ee446c5c2972d7886ead5ac6e746c5d0a3a6f22d
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Sun Mar 1 18:26:29 2015 -0800

    Clean up white-space and code format in RMA code.
    
    No reviewer.

diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c b/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
index 63c85f3..8a640d7 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
@@ -12,7 +12,8 @@
 #define FUNCNAME MPIDI_CH3_Win_shared_query
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_SHM_Win_shared_query(MPID_Win *win_ptr, int target_rank, MPI_Aint *size, int *disp_unit, void *baseptr)
+int MPIDI_CH3_SHM_Win_shared_query(MPID_Win * win_ptr, int target_rank, MPI_Aint * size,
+                                   int *disp_unit, void *baseptr)
 {
     int comm_size;
     int mpi_errno = MPI_SUCCESS;
@@ -28,30 +29,31 @@ int MPIDI_CH3_SHM_Win_shared_query(MPID_Win *win_ptr, int target_rank, MPI_Aint
         int i;
 
         /* Default, if no processes have size > 0. */
-        *size               = 0;
-        *disp_unit          = 0;
-        *((void**) baseptr) = NULL;
+        *size = 0;
+        *disp_unit = 0;
+        *((void **) baseptr) = NULL;
 
         for (i = 0; i < comm_size; i++) {
             if (win_ptr->sizes[i] > 0) {
-                *size               = win_ptr->sizes[i];
-                *disp_unit          = win_ptr->disp_units[i];
-                *((void**) baseptr) = win_ptr->shm_base_addrs[i];
+                *size = win_ptr->sizes[i];
+                *disp_unit = win_ptr->disp_units[i];
+                *((void **) baseptr) = win_ptr->shm_base_addrs[i];
                 break;
             }
         }
 
-    } else {
-        *size               = win_ptr->sizes[target_rank];
-        *disp_unit          = win_ptr->disp_units[target_rank];
-        *((void**) baseptr) = win_ptr->shm_base_addrs[target_rank];
+    }
+    else {
+        *size = win_ptr->sizes[target_rank];
+        *disp_unit = win_ptr->disp_units[target_rank];
+        *((void **) baseptr) = win_ptr->shm_base_addrs[target_rank];
     }
 
-fn_exit:
+  fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3_WIN_SHARED_QUERY);
     return mpi_errno;
 
-fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -60,7 +62,7 @@ fn_fail:
 #define FUNCNAME MPIDI_CH3_SHM_Win_free
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_SHM_Win_free(MPID_Win **win_ptr)
+int MPIDI_CH3_SHM_Win_free(MPID_Win ** win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
     mpir_errflag_t errflag = MPIR_ERR_NONE;
@@ -87,9 +89,12 @@ int MPIDI_CH3_SHM_Win_free(MPID_Win **win_ptr)
              (*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE) &&
             (*win_ptr)->shm_segment_len > 0) {
             /* detach from shared memory segment */
-            mpi_errno = MPIU_SHMW_Seg_detach((*win_ptr)->shm_segment_handle, (char **)&(*win_ptr)->shm_base_addr,
-                                         (*win_ptr)->shm_segment_len);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            mpi_errno =
+                MPIU_SHMW_Seg_detach((*win_ptr)->shm_segment_handle,
+                                     (char **) &(*win_ptr)->shm_base_addr,
+                                     (*win_ptr)->shm_segment_len);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
 
             MPIU_SHMW_Hnd_finalize(&(*win_ptr)->shm_segment_handle);
         }
@@ -104,9 +109,9 @@ int MPIDI_CH3_SHM_Win_free(MPID_Win **win_ptr)
         MPID_Comm *node_comm_ptr = NULL;
 
         /* When allocating shared memory region segment, we need comm of processes
-           that are on the same node as this process (node_comm).
-           If node_comm == NULL, this process is the only one on this node, therefore
-           we use comm_self as node comm. */
+         * that are on the same node as this process (node_comm).
+         * If node_comm == NULL, this process is the only one on this node, therefore
+         * we use comm_self as node comm. */
         node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
         MPIU_Assert(node_comm_ptr != NULL);
 
@@ -115,9 +120,11 @@ int MPIDI_CH3_SHM_Win_free(MPID_Win **win_ptr)
         }
 
         /* detach from shared memory segment */
-        mpi_errno = MPIU_SHMW_Seg_detach((*win_ptr)->shm_mutex_segment_handle, (char **)&(*win_ptr)->shm_mutex,
-                                         sizeof(MPIDI_CH3I_SHM_MUTEX));
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        mpi_errno =
+            MPIU_SHMW_Seg_detach((*win_ptr)->shm_mutex_segment_handle,
+                                 (char **) &(*win_ptr)->shm_mutex, sizeof(MPIDI_CH3I_SHM_MUTEX));
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
 
         MPIU_SHMW_Hnd_finalize(&(*win_ptr)->shm_mutex_segment_handle);
     }
@@ -129,12 +136,14 @@ int MPIDI_CH3_SHM_Win_free(MPID_Win **win_ptr)
     }
 
     mpi_errno = MPIDI_Win_free(win_ptr);
-    if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIU_ERR_POP(mpi_errno);
+    }
 
-fn_exit:
+  fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3_SHM_WIN_FREE);
     return mpi_errno;
 
-fn_fail:
+  fn_fail:
     goto fn_exit;
 }
diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
index 288213f..b6cfee9 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
@@ -18,8 +18,8 @@ MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_wincreate_allgather);
 
 MPIDI_SHM_Wins_list_t shm_wins_list;
 
-static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *info, MPID_Comm *comm_ptr,
-                                       void *base_ptr, MPID_Win **win_ptr);
+static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info * info,
+                                       MPID_Comm * comm_ptr, void *base_ptr, MPID_Win ** win_ptr);
 
 static int MPIDI_CH3I_Win_detect_shm(MPID_Win ** win_ptr);
 
@@ -27,7 +27,7 @@ static int MPIDI_CH3I_Win_detect_shm(MPID_Win ** win_ptr);
 #define FUNCNAME MPIDI_CH3_Win_fns_init
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_Win_fns_init(MPIDI_CH3U_Win_fns_t *win_fns)
+int MPIDI_CH3_Win_fns_init(MPIDI_CH3U_Win_fns_t * win_fns)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_WIN_FNS_INIT);
@@ -87,7 +87,8 @@ static int MPIDI_CH3I_SHM_Wins_match(MPID_Win ** win_ptr, MPID_Win ** matched_wi
     }
 
     mpi_errno = MPIR_Comm_group_impl(node_comm_ptr, &node_group_ptr);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
 
     while (elem != NULL) {
         MPID_Win *shm_win = elem->win;
@@ -107,15 +108,18 @@ static int MPIDI_CH3I_SHM_Wins_match(MPID_Win ** win_ptr, MPID_Win ** matched_wi
             MPIDI_SHM_Wins_next_and_continue(elem);
 
         mpi_errno = MPIR_Comm_group_impl(shm_win->comm_ptr, &shm_node_group_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
 
         mpi_errno = MPIR_Group_translate_ranks_impl(node_group_ptr, node_size,
                                                     node_ranks, shm_node_group_ptr,
                                                     node_ranks_in_shm_node);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
 
         mpi_errno = MPIR_Group_free_impl(shm_node_group_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
         shm_node_group_ptr = NULL;
 
         group_diff = 0;
@@ -139,7 +143,8 @@ static int MPIDI_CH3I_SHM_Wins_match(MPID_Win ** win_ptr, MPID_Win ** matched_wi
             - (MPI_Aint) (shm_win->shm_base_addr);
         mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                         base_shm_offs, 1, MPI_AINT, node_comm_ptr, &errflag);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
         MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
         base_diff = 0;
@@ -211,7 +216,8 @@ static int MPIDI_CH3I_Win_detect_shm(MPID_Win ** win_ptr)
      * stored in every local process in the same order, hence the first matched
      * shared window on every local process should be the same. */
     mpi_errno = MPIDI_CH3I_SHM_Wins_match(win_ptr, &shm_win_ptr, &base_shm_offs);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
     if (shm_win_ptr == NULL)
         goto fn_exit;
 
@@ -255,13 +261,13 @@ static int MPIDI_CH3I_Win_detect_shm(MPID_Win ** win_ptr)
 #define FUNCNAME MPIDI_CH3I_Win_allocate_shm
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *info,
-                                       MPID_Comm *comm_ptr, void *base_ptr, MPID_Win **win_ptr)
+static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info * info,
+                                       MPID_Comm * comm_ptr, void *base_ptr, MPID_Win ** win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
     void **base_pp = (void **) base_ptr;
     int i, k, comm_size, rank;
-    int  node_size, node_rank;
+    int node_size, node_rank;
     MPID_Comm *node_comm_ptr;
     MPI_Aint *node_sizes;
     void **node_shm_base_addrs;
@@ -275,7 +281,8 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
 
     if ((*win_ptr)->comm_ptr->node_comm == NULL) {
-        mpi_errno = MPIDI_CH3U_Win_allocate_no_shm(size, disp_unit, info, comm_ptr, base_ptr, win_ptr);
+        mpi_errno =
+            MPIDI_CH3U_Win_allocate_no_shm(size, disp_unit, info, comm_ptr, base_ptr, win_ptr);
         goto fn_exit;
     }
 
@@ -285,12 +292,12 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
     (*win_ptr)->shm_allocated = TRUE;
 
     comm_size = (*win_ptr)->comm_ptr->local_size;
-    rank      = (*win_ptr)->comm_ptr->rank;
+    rank = (*win_ptr)->comm_ptr->rank;
 
     /* When allocating shared memory region segment, we need comm of processes
-       that are on the same node as this process (node_comm).
-       If node_comm == NULL, this process is the only one on this node, therefore
-       we use comm_self as node comm. */
+     * that are on the same node as this process (node_comm).
+     * If node_comm == NULL, this process is the only one on this node, therefore
+     * we use comm_self as node comm. */
     node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
     MPIU_Assert(node_comm_ptr != NULL);
     node_size = node_comm_ptr->local_size;
@@ -298,37 +305,36 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
 
     MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
     /* allocate memory for the base addresses, disp_units, and
-       completion counters of all processes */
+     * completion counters of all processes */
     MPIU_CHKPMEM_MALLOC((*win_ptr)->base_addrs, void **,
-                        comm_size*sizeof(void *),
-                        mpi_errno, "(*win_ptr)->base_addrs");
+                        comm_size * sizeof(void *), mpi_errno, "(*win_ptr)->base_addrs");
 
     MPIU_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
-                        comm_size*sizeof(void *),
-                        mpi_errno, "(*win_ptr)->shm_base_addrs");
+                        comm_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");
 
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->sizes, MPI_Aint *, comm_size*sizeof(MPI_Aint),
+    MPIU_CHKPMEM_MALLOC((*win_ptr)->sizes, MPI_Aint *, comm_size * sizeof(MPI_Aint),
                         mpi_errno, "(*win_ptr)->sizes");
 
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->disp_units, int *, comm_size*sizeof(int),
+    MPIU_CHKPMEM_MALLOC((*win_ptr)->disp_units, int *, comm_size * sizeof(int),
                         mpi_errno, "(*win_ptr)->disp_units");
 
     MPIU_CHKPMEM_MALLOC((*win_ptr)->all_win_handles, MPI_Win *,
-                        comm_size*sizeof(MPI_Win),
-                        mpi_errno, "(*win_ptr)->all_win_handles");
+                        comm_size * sizeof(MPI_Win), mpi_errno, "(*win_ptr)->all_win_handles");
 
     /* get the sizes of the windows and window objectsof
-       all processes.  allocate temp. buffer for communication */
-    MPIU_CHKLMEM_MALLOC(node_sizes, MPI_Aint *, node_size*sizeof(MPI_Aint), mpi_errno, "node_sizes");
+     * all processes.  allocate temp. buffer for communication */
+    MPIU_CHKLMEM_MALLOC(node_sizes, MPI_Aint *, node_size * sizeof(MPI_Aint), mpi_errno,
+                        "node_sizes");
 
     /* FIXME: This needs to be fixed for heterogeneous systems */
-    node_sizes[node_rank]   = (MPI_Aint) size;
+    node_sizes[node_rank] = (MPI_Aint) size;
 
     mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                     node_sizes, sizeof(MPI_Aint), MPI_BYTE,
                                     node_comm_ptr, &errflag);
     MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
     MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
     (*win_ptr)->shm_segment_len = 0;
@@ -346,176 +352,225 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
     }
 
     else {
-    mpi_errno = MPIU_SHMW_Hnd_init(&(*win_ptr)->shm_segment_handle);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-    if (node_rank == 0) {
-        char *serialized_hnd_ptr = NULL;
-
-        /* create shared memory region for all processes in win and map */
-        mpi_errno = MPIU_SHMW_Seg_create_and_attach((*win_ptr)->shm_segment_handle, (*win_ptr)->shm_segment_len,
-                                                    (char **)&(*win_ptr)->shm_base_addr, 0);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-        /* serialize handle and broadcast it to the other processes in win */
-        mpi_errno = MPIU_SHMW_Hnd_get_serialized_by_ref((*win_ptr)->shm_segment_handle, &serialized_hnd_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-        mpi_errno = MPIR_Bcast_impl(serialized_hnd_ptr, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr, &errflag);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
-
-        /* wait for other processes to attach to win */
-        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
-
-        /* unlink shared memory region so it gets deleted when all processes exit */
-        mpi_errno = MPIU_SHMW_Seg_remove((*win_ptr)->shm_segment_handle);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-    } else {
-        char serialized_hnd[MPIU_SHMW_GHND_SZ] = {0};
-
-        /* get serialized handle from rank 0 and deserialize it */
-        mpi_errno = MPIR_Bcast_impl(serialized_hnd, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr, &errflag);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+        mpi_errno = MPIU_SHMW_Hnd_init(&(*win_ptr)->shm_segment_handle);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+
+        if (node_rank == 0) {
+            char *serialized_hnd_ptr = NULL;
+
+            /* create shared memory region for all processes in win and map */
+            mpi_errno =
+                MPIU_SHMW_Seg_create_and_attach((*win_ptr)->shm_segment_handle,
+                                                (*win_ptr)->shm_segment_len,
+                                                (char **) &(*win_ptr)->shm_base_addr, 0);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+
+            /* serialize handle and broadcast it to the other processes in win */
+            mpi_errno =
+                MPIU_SHMW_Hnd_get_serialized_by_ref((*win_ptr)->shm_segment_handle,
+                                                    &serialized_hnd_ptr);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+
+            mpi_errno =
+                MPIR_Bcast_impl(serialized_hnd_ptr, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
+                                &errflag);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+            MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+
+            /* wait for other processes to attach to win */
+            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+            MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+
+            /* unlink shared memory region so it gets deleted when all processes exit */
+            mpi_errno = MPIU_SHMW_Seg_remove((*win_ptr)->shm_segment_handle);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
 
-        mpi_errno = MPIU_SHMW_Hnd_deserialize((*win_ptr)->shm_segment_handle, serialized_hnd, strlen(serialized_hnd));
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-        /* attach to shared memory region created by rank 0 */
-        mpi_errno = MPIU_SHMW_Seg_attach((*win_ptr)->shm_segment_handle, (*win_ptr)->shm_segment_len,
-                                         (char **)&(*win_ptr)->shm_base_addr, 0);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
-    }
-
-    /* Allocated the interprocess mutex segment. */
-    mpi_errno = MPIU_SHMW_Hnd_init(&(*win_ptr)->shm_mutex_segment_handle);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-    if (node_rank == 0) {
-        char *serialized_hnd_ptr = NULL;
-
-        /* create shared memory region for all processes in win and map */
-        mpi_errno = MPIU_SHMW_Seg_create_and_attach((*win_ptr)->shm_mutex_segment_handle, sizeof(MPIDI_CH3I_SHM_MUTEX),
-                                                    (char **)&(*win_ptr)->shm_mutex, 0);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-        MPIDI_CH3I_SHM_MUTEX_INIT(*win_ptr);
-
-        /* serialize handle and broadcast it to the other processes in win */
-        mpi_errno = MPIU_SHMW_Hnd_get_serialized_by_ref((*win_ptr)->shm_mutex_segment_handle, &serialized_hnd_ptr);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-        mpi_errno = MPIR_Bcast_impl(serialized_hnd_ptr, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr, &errflag);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
-
-        /* wait for other processes to attach to win */
-        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
-
-        /* unlink shared memory region so it gets deleted when all processes exit */
-        mpi_errno = MPIU_SHMW_Seg_remove((*win_ptr)->shm_mutex_segment_handle);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-    } else {
-        char serialized_hnd[MPIU_SHMW_GHND_SZ] = {0};
-
-        /* get serialized handle from rank 0 and deserialize it */
-        mpi_errno = MPIR_Bcast_impl(serialized_hnd, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr, &errflag);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
-
-        mpi_errno = MPIU_SHMW_Hnd_deserialize((*win_ptr)->shm_mutex_segment_handle, serialized_hnd, strlen(serialized_hnd));
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-        /* attach to shared memory region created by rank 0 */
-        mpi_errno = MPIU_SHMW_Seg_attach((*win_ptr)->shm_mutex_segment_handle, sizeof(MPIDI_CH3I_SHM_MUTEX),
-                                         (char **)&(*win_ptr)->shm_mutex, 0);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-
-        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-        MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
-    }
+        }
+        else {
+            char serialized_hnd[MPIU_SHMW_GHND_SZ] = { 0 };
+
+            /* get serialized handle from rank 0 and deserialize it */
+            mpi_errno =
+                MPIR_Bcast_impl(serialized_hnd, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
+                                &errflag);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+            MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+
+            mpi_errno =
+                MPIU_SHMW_Hnd_deserialize((*win_ptr)->shm_segment_handle, serialized_hnd,
+                                          strlen(serialized_hnd));
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+
+            /* attach to shared memory region created by rank 0 */
+            mpi_errno =
+                MPIU_SHMW_Seg_attach((*win_ptr)->shm_segment_handle, (*win_ptr)->shm_segment_len,
+                                     (char **) &(*win_ptr)->shm_base_addr, 0);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+
+            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+            MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+        }
 
-    /* compute the base addresses of each process within the shared memory segment */
-    {
-        char *cur_base;
-        int cur_rank;
-        if ((*win_ptr)->create_flavor != MPI_WIN_FLAVOR_SHARED) {
-            /* If create flavor is not MPI_WIN_FLAVOR_SHARED, all processes on this
-               window may not be on the same node. Because we only need to calculate
-               local processes' shm_base_addrs using local processes's sizes,
-               we allocate a temporary array to place results and copy results
-               back to shm_base_addrs on the window at last. */
-            MPIU_CHKLMEM_MALLOC(node_shm_base_addrs, void **, node_size*sizeof(void*),
-                                mpi_errno, "node_shm_base_addrs");
+        /* Allocated the interprocess mutex segment. */
+        mpi_errno = MPIU_SHMW_Hnd_init(&(*win_ptr)->shm_mutex_segment_handle);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
+
+        if (node_rank == 0) {
+            char *serialized_hnd_ptr = NULL;
+
+            /* create shared memory region for all processes in win and map */
+            mpi_errno =
+                MPIU_SHMW_Seg_create_and_attach((*win_ptr)->shm_mutex_segment_handle,
+                                                sizeof(MPIDI_CH3I_SHM_MUTEX),
+                                                (char **) &(*win_ptr)->shm_mutex, 0);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+
+            MPIDI_CH3I_SHM_MUTEX_INIT(*win_ptr);
+
+            /* serialize handle and broadcast it to the other processes in win */
+            mpi_errno =
+                MPIU_SHMW_Hnd_get_serialized_by_ref((*win_ptr)->shm_mutex_segment_handle,
+                                                    &serialized_hnd_ptr);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+
+            mpi_errno =
+                MPIR_Bcast_impl(serialized_hnd_ptr, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
+                                &errflag);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+            MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+
+            /* wait for other processes to attach to win */
+            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+            MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+
+            /* unlink shared memory region so it gets deleted when all processes exit */
+            mpi_errno = MPIU_SHMW_Seg_remove((*win_ptr)->shm_mutex_segment_handle);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
         }
         else {
-            node_shm_base_addrs = (*win_ptr)->shm_base_addrs;
+            char serialized_hnd[MPIU_SHMW_GHND_SZ] = { 0 };
+
+            /* get serialized handle from rank 0 and deserialize it */
+            mpi_errno =
+                MPIR_Bcast_impl(serialized_hnd, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
+                                &errflag);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+            MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+
+            mpi_errno =
+                MPIU_SHMW_Hnd_deserialize((*win_ptr)->shm_mutex_segment_handle, serialized_hnd,
+                                          strlen(serialized_hnd));
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+
+            /* attach to shared memory region created by rank 0 */
+            mpi_errno =
+                MPIU_SHMW_Seg_attach((*win_ptr)->shm_mutex_segment_handle,
+                                     sizeof(MPIDI_CH3I_SHM_MUTEX), (char **) &(*win_ptr)->shm_mutex,
+                                     0);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+
+            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
+            MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
         }
 
-        cur_base = (*win_ptr)->shm_base_addr;
-        cur_rank = 0;
-        node_shm_base_addrs[0] = (*win_ptr)->shm_base_addr;
-        for (i = 1; i < node_size; ++i) {
-            if (node_sizes[i]) {
-                /* For the base addresses, we track the previous
-                 * process that has allocated non-zero bytes of shared
-                 * memory.  We can not simply use "i-1" for the
-                 * previous process because rank "i-1" might not have
-                 * allocated any memory. */
-                if (noncontig) {
-                    node_shm_base_addrs[i] = cur_base + MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[cur_rank]);
-                } else {
-                    node_shm_base_addrs[i] = cur_base + node_sizes[cur_rank];
+        /* compute the base addresses of each process within the shared memory segment */
+        {
+            char *cur_base;
+            int cur_rank;
+            if ((*win_ptr)->create_flavor != MPI_WIN_FLAVOR_SHARED) {
+                /* If create flavor is not MPI_WIN_FLAVOR_SHARED, all processes on this
+                 * window may not be on the same node. Because we only need to calculate
+                 * local processes' shm_base_addrs using local processes's sizes,
+                 * we allocate a temporary array to place results and copy results
+                 * back to shm_base_addrs on the window at last. */
+                MPIU_CHKLMEM_MALLOC(node_shm_base_addrs, void **, node_size * sizeof(void *),
+                                    mpi_errno, "node_shm_base_addrs");
+            }
+            else {
+                node_shm_base_addrs = (*win_ptr)->shm_base_addrs;
+            }
+
+            cur_base = (*win_ptr)->shm_base_addr;
+            cur_rank = 0;
+            node_shm_base_addrs[0] = (*win_ptr)->shm_base_addr;
+            for (i = 1; i < node_size; ++i) {
+                if (node_sizes[i]) {
+                    /* For the base addresses, we track the previous
+                     * process that has allocated non-zero bytes of shared
+                     * memory.  We can not simply use "i-1" for the
+                     * previous process because rank "i-1" might not have
+                     * allocated any memory. */
+                    if (noncontig) {
+                        node_shm_base_addrs[i] =
+                            cur_base + MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[cur_rank]);
+                    }
+                    else {
+                        node_shm_base_addrs[i] = cur_base + node_sizes[cur_rank];
+                    }
+                    cur_base = node_shm_base_addrs[i];
+                    cur_rank = i;
+                }
+                else {
+                    node_shm_base_addrs[i] = NULL;
                 }
-                cur_base = node_shm_base_addrs[i];
-                cur_rank = i;
-            } else {
-                node_shm_base_addrs[i] = NULL;
             }
-        }
 
-        if ((*win_ptr)->create_flavor != MPI_WIN_FLAVOR_SHARED) {
-            /* if MPI_WIN_FLAVOR_SHARED is not set, copy from node_shm_base_addrs to
-               (*win_ptr)->shm_base_addrs */
-            for (i = 0; i < comm_size; i++) {
-                if ((*win_ptr)->comm_ptr->intranode_table[i] >= 0) {
-                    MPIU_Assert((*win_ptr)->comm_ptr->intranode_table[i] < node_size);
-                    (*win_ptr)->shm_base_addrs[i] = node_shm_base_addrs[(*win_ptr)->comm_ptr->intranode_table[i]];
+            if ((*win_ptr)->create_flavor != MPI_WIN_FLAVOR_SHARED) {
+                /* if MPI_WIN_FLAVOR_SHARED is not set, copy from node_shm_base_addrs to
+                 * (*win_ptr)->shm_base_addrs */
+                for (i = 0; i < comm_size; i++) {
+                    if ((*win_ptr)->comm_ptr->intranode_table[i] >= 0) {
+                        MPIU_Assert((*win_ptr)->comm_ptr->intranode_table[i] < node_size);
+                        (*win_ptr)->shm_base_addrs[i] =
+                            node_shm_base_addrs[(*win_ptr)->comm_ptr->intranode_table[i]];
+                    }
+                    else
+                        (*win_ptr)->shm_base_addrs[i] = NULL;
                 }
-                else
-                    (*win_ptr)->shm_base_addrs[i] = NULL;
             }
         }
-    }
 
-    (*win_ptr)->base = (*win_ptr)->shm_base_addrs[rank];
+        (*win_ptr)->base = (*win_ptr)->shm_base_addrs[rank];
     }
 
-    MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4*comm_size*sizeof(MPI_Aint),
+    MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * comm_size * sizeof(MPI_Aint),
                         mpi_errno, "tmp_buf");
 
     /* get the base addresses of the windows.  Note we reuse tmp_buf from above
-       since it's at least as large as we need it for this allgather. */
-    tmp_buf[4*rank] = MPIU_PtrToAint((*win_ptr)->base);
-    tmp_buf[4*rank+1] = size;
-    tmp_buf[4*rank+2] = (MPI_Aint) disp_unit;
-    tmp_buf[4*rank+3] = (MPI_Aint) (*win_ptr)->handle;
+     * since it's at least as large as we need it for this allgather. */
+    tmp_buf[4 * rank] = MPIU_PtrToAint((*win_ptr)->base);
+    tmp_buf[4 * rank + 1] = size;
+    tmp_buf[4 * rank + 2] = (MPI_Aint) disp_unit;
+    tmp_buf[4 * rank + 3] = (MPI_Aint) (*win_ptr)->handle;
 
     mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
-                                    tmp_buf, 4, MPI_AINT,
-                                    (*win_ptr)->comm_ptr, &errflag);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                                    tmp_buf, 4, MPI_AINT, (*win_ptr)->comm_ptr, &errflag);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
     MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
     k = 0;
@@ -530,17 +585,17 @@ static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info *
 
     /* Provide operation overrides for this window flavor */
     (*win_ptr)->RMAFns.Win_shared_query = MPIDI_CH3_SHM_Win_shared_query;
-    (*win_ptr)->RMAFns.Win_free         = MPIDI_CH3_SHM_Win_free;
+    (*win_ptr)->RMAFns.Win_free = MPIDI_CH3_SHM_Win_free;
 
     /* Cache SHM windows */
     MPIDI_CH3I_SHM_Wins_append(&shm_wins_list, (*win_ptr));
 
-fn_exit:
+  fn_exit:
     MPIU_CHKLMEM_FREEALL();
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
-fn_fail:
+  fn_fail:
     MPIU_CHKPMEM_REAP();
     goto fn_exit;
     /* --END ERROR HANDLING-- */
diff --git a/src/mpid/ch3/include/mpid_rma_issue.h b/src/mpid/ch3/include/mpid_rma_issue.h
index 47b2569..eda6617 100644
--- a/src/mpid/ch3/include/mpid_rma_issue.h
+++ b/src/mpid/ch3/include/mpid_rma_issue.h
@@ -44,10 +44,10 @@ static inline int immed_copy(void *src, void *dest, size_t len)
         MPIU_Memcpy(dest, (void *) src, len);
     }
 
- fn_exit:
+  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_IMMED_COPY);
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -58,7 +58,7 @@ static inline int immed_copy(void *src, void *dest, size_t len)
 #define FUNCNAME fill_in_derived_dtp_info
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int fill_in_derived_dtp_info(MPIDI_RMA_Op_t *rma_op, MPID_Datatype *dtp)
+static int fill_in_derived_dtp_info(MPIDI_RMA_Op_t * rma_op, MPID_Datatype * dtp)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIU_CHKPMEM_DECL(1);
@@ -82,19 +82,18 @@ static int fill_in_derived_dtp_info(MPIDI_RMA_Op_t *rma_op, MPID_Datatype *dtp)
     rma_op->dtype_info.has_sticky_ub = dtp->has_sticky_ub;
     rma_op->dtype_info.has_sticky_lb = dtp->has_sticky_lb;
 
-    MPIU_CHKPMEM_MALLOC(rma_op->dataloop, void *, dtp->dataloop_size,
-                        mpi_errno, "dataloop");
+    MPIU_CHKPMEM_MALLOC(rma_op->dataloop, void *, dtp->dataloop_size, mpi_errno, "dataloop");
 
     MPIU_Memcpy(rma_op->dataloop, dtp->dataloop, dtp->dataloop_size);
     /* The dataloop can have undefined padding sections, so we need to let
      * valgrind know that it is OK to pass this data to writev later on. */
     MPL_VG_MAKE_MEM_DEFINED(rma_op->dataloop, dtp->dataloop_size);
 
- fn_exit:
+  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_FILL_IN_DERIVED_DTP_INFO);
     MPIU_CHKPMEM_COMMIT();
     return mpi_errno;
- fn_fail:
+  fn_fail:
     MPIU_CHKPMEM_REAP();
     goto fn_exit;
 }
@@ -146,8 +145,7 @@ static int create_datatype(const MPIDI_RMA_dtype_info * dtype_info,
     ints[0] = count;
 
     MPID_Datatype_get_ptr(combined_datatype, *combined_dtp);
-    mpi_errno = MPID_Datatype_set_contents(*combined_dtp, MPI_COMBINER_STRUCT,
-                                           count + 1,       /* ints (cnt,blklen) */
+    mpi_errno = MPID_Datatype_set_contents(*combined_dtp, MPI_COMBINER_STRUCT, count + 1,       /* ints (cnt,blklen) */
                                            count,       /* aints (disps) */
                                            count,       /* types */
                                            ints, displaces, datatypes);
@@ -185,7 +183,7 @@ static int create_datatype(const MPIDI_RMA_dtype_info * dtype_info,
 #define FUNCNAME issue_from_origin_buffer
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int issue_from_origin_buffer(MPIDI_RMA_Op_t *rma_op, MPID_IOV *iov, MPIDI_VC_t *vc)
+static int issue_from_origin_buffer(MPIDI_RMA_Op_t * rma_op, MPID_IOV * iov, MPIDI_VC_t * vc)
 {
     MPI_Aint origin_type_size;
     MPI_Datatype target_datatype;
@@ -202,7 +200,8 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t *rma_op, MPID_IOV *iov, MPIDI
 
         /* Fill derived datatype info. */
         mpi_errno = fill_in_derived_dtp_info(rma_op, target_dtp);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         /* Set dataloop size in pkt header */
         MPIDI_CH3_PKT_RMA_SET_DATALOOP_SIZE(rma_op->pkt, target_dtp->dataloop_size, mpi_errno);
@@ -276,9 +275,9 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t *rma_op, MPID_IOV *iov, MPIDI
         mpi_errno = create_datatype(&rma_op->dtype_info, rma_op->dataloop,
                                     target_dtp->dataloop_size,
                                     rma_op->origin_addr, rma_op->origin_count,
-                                    rma_op->origin_datatype,
-                                    &combined_dtp);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                                    rma_op->origin_datatype, &combined_dtp);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
 
         rma_op->request->dev.datatype_ptr = combined_dtp;
         /* combined_datatype will be freed when request is freed */
@@ -302,10 +301,10 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t *rma_op, MPID_IOV *iov, MPIDI
         MPID_Datatype_release(target_dtp);
     }
 
- fn_exit:
+  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_ISSUE_FROM_ORIGIN_BUFFER);
     return mpi_errno;
- fn_fail:
+  fn_fail:
     if (rma_op->request) {
         if (rma_op->request->dev.datatype_ptr)
             MPID_Datatype_release(rma_op->request->dev.datatype_ptr);
@@ -321,9 +320,8 @@ static int issue_from_origin_buffer(MPIDI_RMA_Op_t *rma_op, MPID_IOV *iov, MPIDI
 #define FUNCNAME issue_put_op
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win *win_ptr,
-                        MPIDI_RMA_Target_t *target_ptr,
-                        MPIDI_CH3_Pkt_flags_t flags)
+static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
+                        MPIDI_RMA_Target_t * target_ptr, MPIDI_CH3_Pkt_flags_t flags)
 {
     MPIDI_VC_t *vc = NULL;
     MPI_Aint origin_type_size;
@@ -354,11 +352,12 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win *win_ptr,
         /* We still need to issue from origin buffer. */
         iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) put_pkt;
         iov[0].MPID_IOV_LEN = sizeof(*put_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)rma_op->origin_addr);
+        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) rma_op->origin_addr);
         iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
 
         mpi_errno = issue_from_origin_buffer(rma_op, iov, vc);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
 
   fn_exit:
@@ -376,9 +375,8 @@ static int issue_put_op(MPIDI_RMA_Op_t * rma_op, MPID_Win *win_ptr,
 #define FUNCNAME issue_acc_op
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int issue_acc_op(MPIDI_RMA_Op_t *rma_op, MPID_Win *win_ptr,
-                        MPIDI_RMA_Target_t *target_ptr,
-                        MPIDI_CH3_Pkt_flags_t flags)
+static int issue_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
+                        MPIDI_RMA_Target_t * target_ptr, MPIDI_CH3_Pkt_flags_t flags)
 {
     MPIDI_VC_t *vc = NULL;
     MPI_Aint origin_type_size;
@@ -409,17 +407,18 @@ static int issue_acc_op(MPIDI_RMA_Op_t *rma_op, MPID_Win *win_ptr,
         /* We still need to issue from origin buffer. */
         iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) accum_pkt;
         iov[0].MPID_IOV_LEN = sizeof(*accum_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)rma_op->origin_addr);
+        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) rma_op->origin_addr);
         iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
 
         mpi_errno = issue_from_origin_buffer(rma_op, iov, vc);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
 
- fn_exit:
+  fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_ACC_OP);
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -429,9 +428,8 @@ static int issue_acc_op(MPIDI_RMA_Op_t *rma_op, MPID_Win *win_ptr,
 #define FUNCNAME issue_get_acc_op
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int issue_get_acc_op(MPIDI_RMA_Op_t *rma_op, MPID_Win *win_ptr,
-                            MPIDI_RMA_Target_t *target_ptr,
-                            MPIDI_CH3_Pkt_flags_t flags)
+static int issue_get_acc_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
+                            MPIDI_RMA_Target_t * target_ptr, MPIDI_CH3_Pkt_flags_t flags)
 {
     MPIDI_VC_t *vc = NULL;
     MPI_Aint origin_type_size;
@@ -461,11 +459,11 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t *rma_op, MPID_Win *win_ptr,
     resp_req->dev.source_win_handle = win_ptr->handle;
 
     if (!MPIR_DATATYPE_IS_PREDEFINED(resp_req->dev.datatype)) {
-      MPID_Datatype *result_dtp = NULL;
-      MPID_Datatype_get_ptr(resp_req->dev.datatype, result_dtp);
-      resp_req->dev.datatype_ptr = result_dtp;
-      /* this will cause the datatype to be freed when the
-       * request is freed. */
+        MPID_Datatype *result_dtp = NULL;
+        MPID_Datatype_get_ptr(resp_req->dev.datatype, result_dtp);
+        resp_req->dev.datatype_ptr = result_dtp;
+        /* this will cause the datatype to be freed when the
+         * request is freed. */
     }
 
     /* Note: Get_accumulate uses the same packet type as accumulate */
@@ -480,7 +478,8 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t *rma_op, MPID_Win *win_ptr,
     if (rma_op->pkt.type == MPIDI_CH3_PKT_GET_ACCUM_IMMED) {
         /* All origin data is in packet header, issue the header. */
         MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-        mpi_errno = MPIDI_CH3_iStartMsg(vc, get_accum_pkt, sizeof(*get_accum_pkt), &(rma_op->request));
+        mpi_errno =
+            MPIDI_CH3_iStartMsg(vc, get_accum_pkt, sizeof(*get_accum_pkt), &(rma_op->request));
         MPIU_THREAD_CS_EXIT(CH3COMM, vc);
         MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
@@ -488,11 +487,12 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t *rma_op, MPID_Win *win_ptr,
         /* We still need to issue from origin buffer. */
         iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_pkt;
         iov[0].MPID_IOV_LEN = sizeof(*get_accum_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)rma_op->origin_addr);
+        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) rma_op->origin_addr);
         iov[1].MPID_IOV_LEN = rma_op->origin_count * origin_type_size;
 
         mpi_errno = issue_from_origin_buffer(rma_op, iov, vc);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     /* This operation can generate two requests; one for inbound and one for
@@ -524,11 +524,11 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t *rma_op, MPID_Win *win_ptr,
     /* For error checking */
     resp_req = NULL;
 
- fn_exit:
+  fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_GET_ACC_OP);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
- fn_fail:
+  fn_fail:
     if (resp_req != NULL) {
         MPID_Request_release(resp_req);
     }
@@ -542,8 +542,7 @@ static int issue_get_acc_op(MPIDI_RMA_Op_t *rma_op, MPID_Win *win_ptr,
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 static int issue_get_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
-                        MPIDI_RMA_Target_t *target_ptr,
-                        MPIDI_CH3_Pkt_flags_t flags)
+                        MPIDI_RMA_Target_t * target_ptr, MPIDI_CH3_Pkt_flags_t flags)
 {
     MPIDI_CH3_Pkt_get_t *get_pkt = &rma_op->pkt.get;
     int mpi_errno = MPI_SUCCESS;
@@ -600,7 +599,8 @@ static int issue_get_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
         MPID_Datatype_get_ptr(target_datatype, dtp);
 
         mpi_errno = fill_in_derived_dtp_info(rma_op, dtp);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         /* Set dataloop size in pkt header */
         MPIDI_CH3_PKT_RMA_SET_DATALOOP_SIZE(rma_op->pkt, dtp->dataloop_size, mpi_errno);
@@ -644,7 +644,7 @@ static int issue_get_op(MPIDI_RMA_Op_t * rma_op, MPID_Win * win_ptr,
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 static int issue_cas_op(MPIDI_RMA_Op_t * rma_op,
-                        MPID_Win * win_ptr, MPIDI_RMA_Target_t *target_ptr,
+                        MPID_Win * win_ptr, MPIDI_RMA_Target_t * target_ptr,
                         MPIDI_CH3_Pkt_flags_t flags)
 {
     MPIDI_VC_t *vc = NULL;
@@ -707,7 +707,7 @@ static int issue_cas_op(MPIDI_RMA_Op_t * rma_op,
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
-                        MPID_Win * win_ptr, MPIDI_RMA_Target_t *target_ptr,
+                        MPID_Win * win_ptr, MPIDI_RMA_Target_t * target_ptr,
                         MPIDI_CH3_Pkt_flags_t flags)
 {
     MPIDI_VC_t *vc = NULL;
@@ -745,21 +745,22 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
     MPID_Datatype_get_size_macro(rma_op->origin_datatype, origin_type_size);
 
     if (rma_op->pkt.type == MPIDI_CH3_PKT_FOP_IMMED) {
-    /* All origin data is in packet header, issue the header. */
-    MPIU_THREAD_CS_ENTER(CH3COMM, vc);
-    mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_pkt, sizeof(*fop_pkt), &(rma_op->request));
-    MPIU_THREAD_CS_EXIT(CH3COMM, vc);
-    MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+        /* All origin data is in packet header, issue the header. */
+        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+        mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_pkt, sizeof(*fop_pkt), &(rma_op->request));
+        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     else {
         /* We still need to issue from origin buffer. */
         iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) fop_pkt;
         iov[0].MPID_IOV_LEN = sizeof(*fop_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)rma_op->origin_addr);
+        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) rma_op->origin_addr);
         iov[1].MPID_IOV_LEN = origin_type_size;
 
         mpi_errno = issue_from_origin_buffer(rma_op, iov, vc);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     /* This operation can generate two requests; one for inbound and one for
@@ -790,11 +791,11 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
     /* For error checking */
     resp_req = NULL;
 
- fn_exit:
+  fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_ISSUE_FOP_OP);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
- fn_fail:
+  fn_fail:
     if (resp_req != NULL) {
         MPID_Request_release(resp_req);
     }
@@ -810,8 +811,7 @@ static int issue_fop_op(MPIDI_RMA_Op_t * rma_op,
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 static inline int issue_rma_op(MPIDI_RMA_Op_t * op_ptr, MPID_Win * win_ptr,
-                               MPIDI_RMA_Target_t * target_ptr,
-                               MPIDI_CH3_Pkt_flags_t flags)
+                               MPIDI_RMA_Target_t * target_ptr, MPIDI_CH3_Pkt_flags_t flags)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_ISSUE_RMA_OP);
@@ -857,4 +857,4 @@ static inline int issue_rma_op(MPIDI_RMA_Op_t * op_ptr, MPID_Win * win_ptr,
     /* --END ERROR HANDLING-- */
 }
 
-#endif  /* MPID_RMA_ISSUE_H_INCLUDED */
+#endif /* MPID_RMA_ISSUE_H_INCLUDED */
diff --git a/src/mpid/ch3/include/mpid_rma_lockqueue.h b/src/mpid/ch3/include/mpid_rma_lockqueue.h
index 13f1290..dcc2d62 100644
--- a/src/mpid/ch3/include/mpid_rma_lockqueue.h
+++ b/src/mpid/ch3/include/mpid_rma_lockqueue.h
@@ -20,7 +20,7 @@ MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_winlock_getlocallock);
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 static inline MPIDI_RMA_Lock_entry_t *MPIDI_CH3I_Win_lock_entry_alloc(MPID_Win * win_ptr,
-                                                                      MPIDI_CH3_Pkt_t *pkt)
+                                                                      MPIDI_CH3_Pkt_t * pkt)
 {
     MPIDI_RMA_Lock_entry_t *new_ptr = NULL;
 
@@ -48,7 +48,7 @@ static inline MPIDI_RMA_Lock_entry_t *MPIDI_CH3I_Win_lock_entry_alloc(MPID_Win *
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 static inline int MPIDI_CH3I_Win_lock_entry_free(MPID_Win * win_ptr,
-                                                 MPIDI_RMA_Lock_entry_t *lock_entry)
+                                                 MPIDI_RMA_Lock_entry_t * lock_entry)
 {
     int mpi_errno = MPI_SUCCESS;
 
@@ -58,10 +58,10 @@ static inline int MPIDI_CH3I_Win_lock_entry_free(MPID_Win * win_ptr,
     }
 
     /* use PREPEND when return objects back to the pool
-       in order to improve cache performance */
+     * in order to improve cache performance */
     MPL_LL_PREPEND(win_ptr->lock_entry_pool, win_ptr->lock_entry_pool_tail, lock_entry);
 
     return mpi_errno;
 }
 
-#endif  /* MPID_RMA_ISSUE_H_INCLUDED */
+#endif /* MPID_RMA_ISSUE_H_INCLUDED */
diff --git a/src/mpid/ch3/include/mpid_rma_oplist.h b/src/mpid/ch3/include/mpid_rma_oplist.h
index 663591d..5085978 100644
--- a/src/mpid/ch3/include/mpid_rma_oplist.h
+++ b/src/mpid/ch3/include/mpid_rma_oplist.h
@@ -17,7 +17,8 @@ int MPIDI_CH3I_RMA_Make_progress_target(MPID_Win * win_ptr, int target_rank, int
 int MPIDI_CH3I_RMA_Make_progress_win(MPID_Win * win_ptr, int *made_progress);
 
 extern MPIDI_RMA_Op_t *global_rma_op_pool, *global_rma_op_pool_tail, *global_rma_op_pool_start;
-extern MPIDI_RMA_Target_t *global_rma_target_pool, *global_rma_target_pool_tail, *global_rma_target_pool_start;
+extern MPIDI_RMA_Target_t *global_rma_target_pool, *global_rma_target_pool_tail,
+    *global_rma_target_pool_start;
 
 MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_rmaqueue_alloc);
 
@@ -72,7 +73,7 @@ static inline int MPIDI_CH3I_Win_op_free(MPID_Win * win_ptr, MPIDI_RMA_Op_t * e)
      * at window free time, they won't conflict with the global pool
      * or other windows */
     /* use PREPEND when return objects back to the pool
-       in order to improve cache performance */
+     * in order to improve cache performance */
     if (e->pool_type == MPIDI_RMA_POOL_WIN)
         MPL_LL_PREPEND(win_ptr->op_pool, win_ptr->op_pool_tail, e);
     else
@@ -122,9 +123,9 @@ static inline MPIDI_RMA_Target_t *MPIDI_CH3I_Win_target_alloc(MPID_Win * win_ptr
 
     e->sync.sync_flag = MPIDI_RMA_SYNC_NONE;
     e->sync.outstanding_acks = 0;
-    e->sync.have_remote_incomplete_ops = 1; /* When I create a new target, there must be
-                                               incomplete ops until a FLUSH/UNLOCK packet
-                                               is sent. */
+    e->sync.have_remote_incomplete_ops = 1;     /* When I create a new target, there must be
+                                                 * incomplete ops until a FLUSH/UNLOCK packet
+                                                 * is sent. */
     return e;
 }
 
@@ -147,7 +148,7 @@ static inline int MPIDI_CH3I_Win_target_free(MPID_Win * win_ptr, MPIDI_RMA_Targe
     MPIU_Assert(e->pending_op_list == NULL);
 
     /* use PREPEND when return objects back to the pool
-       in order to improve cache performance */
+     * in order to improve cache performance */
     if (e->pool_type == MPIDI_RMA_POOL_WIN)
         MPL_LL_PREPEND(win_ptr->target_pool, win_ptr->target_pool_tail, e);
     else
@@ -163,7 +164,7 @@ static inline int MPIDI_CH3I_Win_target_free(MPID_Win * win_ptr, MPIDI_RMA_Targe
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 static inline int MPIDI_CH3I_Win_create_target(MPID_Win * win_ptr, int target_rank,
-                                               MPIDI_RMA_Target_t **e)
+                                               MPIDI_RMA_Target_t ** e)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_RMA_Slot_t *slot = NULL;
@@ -177,7 +178,8 @@ static inline int MPIDI_CH3I_Win_create_target(MPID_Win * win_ptr, int target_ra
     t = MPIDI_CH3I_Win_target_alloc(win_ptr);
     if (t == NULL) {
         mpi_errno = MPIDI_CH3I_RMA_Cleanup_target_aggressive(win_ptr, &t);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     t->target_rank = target_rank;
@@ -192,9 +194,9 @@ static inline int MPIDI_CH3I_Win_create_target(MPID_Win * win_ptr, int target_ra
 
     (*e) = t;
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -205,7 +207,7 @@ static inline int MPIDI_CH3I_Win_create_target(MPID_Win * win_ptr, int target_ra
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 static inline int MPIDI_CH3I_Win_find_target(MPID_Win * win_ptr, int target_rank,
-                                             MPIDI_RMA_Target_t **e)
+                                             MPIDI_RMA_Target_t ** e)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_RMA_Slot_t *slot = NULL;
@@ -224,9 +226,9 @@ static inline int MPIDI_CH3I_Win_find_target(MPID_Win * win_ptr, int target_rank
 
     (*e) = t;
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -237,17 +239,18 @@ static inline int MPIDI_CH3I_Win_find_target(MPID_Win * win_ptr, int target_rank
 #define FUNCNAME MPIDI_CH3I_Win_enqueue_op
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int MPIDI_CH3I_Win_enqueue_op(MPID_Win * win_ptr,
-                                            MPIDI_RMA_Op_t * op)
+static inline int MPIDI_CH3I_Win_enqueue_op(MPID_Win * win_ptr, MPIDI_RMA_Op_t * op)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_RMA_Target_t *target = NULL;
 
     mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, op->target_rank, &target);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
     if (target == NULL) {
         mpi_errno = MPIDI_CH3I_Win_create_target(win_ptr, op->target_rank, &target);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         if (win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
             win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_GRANTED) {
@@ -263,8 +266,8 @@ static inline int MPIDI_CH3I_Win_enqueue_op(MPID_Win * win_ptr,
         }
         else if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
             /* If global state is MPIDI_RMA_LOCK_ALL_CALLED, this must
-               the first time to create this target, set its access state
-               to MPIDI_RMA_LOCK_CALLED. */
+             * the first time to create this target, set its access state
+             * to MPIDI_RMA_LOCK_CALLED. */
             target->access_state = MPIDI_RMA_LOCK_CALLED;
             target->lock_type = MPI_LOCK_SHARED;
         }
@@ -279,9 +282,9 @@ static inline int MPIDI_CH3I_Win_enqueue_op(MPID_Win * win_ptr,
     target->accumulated_ops_cnt++;
     win_ptr->accumulated_ops_cnt++;
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -292,8 +295,7 @@ static inline int MPIDI_CH3I_Win_enqueue_op(MPID_Win * win_ptr,
 #define FUNCNAME MPIDI_CH3I_Win_target_dequeue_and_free
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int MPIDI_CH3I_Win_target_dequeue_and_free(MPID_Win * win_ptr,
-                                                         MPIDI_RMA_Target_t * e)
+static inline int MPIDI_CH3I_Win_target_dequeue_and_free(MPID_Win * win_ptr, MPIDI_RMA_Target_t * e)
 {
     int mpi_errno = MPI_SUCCESS;
     int target_rank = e->target_rank;
@@ -307,14 +309,15 @@ static inline int MPIDI_CH3I_Win_target_dequeue_and_free(MPID_Win * win_ptr,
     MPL_LL_DELETE(slot->target_list, slot->target_list_tail, e);
 
     mpi_errno = MPIDI_CH3I_Win_target_free(win_ptr, e);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     if (slot->target_list == NULL)
         win_ptr->non_empty_slots--;
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -323,7 +326,7 @@ static inline int MPIDI_CH3I_Win_target_dequeue_and_free(MPID_Win * win_ptr,
 #define FUNCNAME MPIDI_CH3I_RMA_Cleanup_ops_target
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int MPIDI_CH3I_RMA_Cleanup_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *target,
+static inline int MPIDI_CH3I_RMA_Cleanup_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
                                                     int *local_completed, int *remote_completed)
 {
     MPIDI_RMA_Op_t *curr_op = NULL;
@@ -347,9 +350,7 @@ static inline int MPIDI_CH3I_RMA_Cleanup_ops_target(MPID_Win * win_ptr, MPIDI_RM
         goto fn_exit;
 
     if (target->pending_op_list == NULL &&
-        target->read_op_list == NULL &&
-        target->write_op_list == NULL &&
-        target->dt_op_list == NULL)
+        target->read_op_list == NULL && target->write_op_list == NULL && target->dt_op_list == NULL)
         goto cleanup_target;
 
     if (target->read_op_list != NULL) {
@@ -438,8 +439,7 @@ static inline int MPIDI_CH3I_RMA_Cleanup_ops_target(MPID_Win * win_ptr, MPIDI_RM
          * target, see the MPIDI_RMA_Target definition in
          * mpid_rma_types.h */
         if (target->sync.sync_flag == MPIDI_RMA_SYNC_NONE &&
-            target->sync.outstanding_acks == 0 &&
-            target->sync.have_remote_incomplete_ops == 0) {
+            target->sync.outstanding_acks == 0 && target->sync.have_remote_incomplete_ops == 0) {
             (*remote_completed) = 1;
         }
     }
@@ -455,7 +455,7 @@ static inline int MPIDI_CH3I_RMA_Cleanup_ops_target(MPID_Win * win_ptr, MPIDI_RM
 #define FUNCNAME MPIDI_CH3I_RMA_Cleanup_ops_win
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int MPIDI_CH3I_RMA_Cleanup_ops_win(MPID_Win *win_ptr,
+static inline int MPIDI_CH3I_RMA_Cleanup_ops_win(MPID_Win * win_ptr,
                                                  int *local_completed, int *remote_completed)
 {
     MPIDI_RMA_Target_t *target = NULL;
@@ -466,11 +466,12 @@ static inline int MPIDI_CH3I_RMA_Cleanup_ops_win(MPID_Win *win_ptr,
     (*remote_completed) = 0;
 
     for (i = 0; i < win_ptr->num_slots; i++) {
-        for (target = win_ptr->slots[i].target_list; target; ) {
+        for (target = win_ptr->slots[i].target_list; target;) {
             int local = 0, remote = 0;
 
             mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, target, &local, &remote);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
 
             num_targets++;
             local_completed_targets += local;
@@ -496,17 +497,19 @@ static inline int MPIDI_CH3I_RMA_Cleanup_ops_win(MPID_Win *win_ptr,
 #define FUNCNAME MPIDI_CH3I_RMA_Cleanup_single_target
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int MPIDI_CH3I_RMA_Cleanup_single_target(MPID_Win *win_ptr, MPIDI_RMA_Target_t *target)
+static inline int MPIDI_CH3I_RMA_Cleanup_single_target(MPID_Win * win_ptr,
+                                                       MPIDI_RMA_Target_t * target)
 {
     int mpi_errno = MPI_SUCCESS;
 
     /* dequeue the target and free it. */
     mpi_errno = MPIDI_CH3I_Win_target_dequeue_and_free(win_ptr, target);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -515,25 +518,26 @@ static inline int MPIDI_CH3I_RMA_Cleanup_single_target(MPID_Win *win_ptr, MPIDI_
 #define FUNCNAME MPIDI_CH3I_RMA_Cleanup_targets_win
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int MPIDI_CH3I_RMA_Cleanup_targets_win(MPID_Win *win_ptr)
+static inline int MPIDI_CH3I_RMA_Cleanup_targets_win(MPID_Win * win_ptr)
 {
     MPIDI_RMA_Target_t *target = NULL, *next_target = NULL;
     int i, mpi_errno = MPI_SUCCESS;
 
     for (i = 0; i < win_ptr->num_slots; i++) {
-        for (target = win_ptr->slots[i].target_list; target; ) {
+        for (target = win_ptr->slots[i].target_list; target;) {
             next_target = target->next;
             mpi_errno = MPIDI_CH3I_RMA_Cleanup_single_target(win_ptr, target);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
             target = next_target;
         }
     }
 
     MPIU_Assert(win_ptr->non_empty_slots == 0);
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -541,7 +545,7 @@ static inline int MPIDI_CH3I_RMA_Cleanup_targets_win(MPID_Win *win_ptr)
 #define FUNCNAME MPIDI_CH3I_Win_get_op
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int MPIDI_CH3I_Win_get_op(MPID_Win * win_ptr, MPIDI_RMA_Op_t **e)
+static inline int MPIDI_CH3I_Win_get_op(MPID_Win * win_ptr, MPIDI_RMA_Op_t ** e)
 {
     MPIDI_RMA_Op_t *new_ptr = NULL;
     int local_completed = 0, remote_completed = 0;
@@ -551,37 +555,41 @@ static inline int MPIDI_CH3I_Win_get_op(MPID_Win * win_ptr, MPIDI_RMA_Op_t **e)
         MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_alloc);
         new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
         MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_alloc);
-        if (new_ptr != NULL) break;
+        if (new_ptr != NULL)
+            break;
 
-        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr,
-                                                   &local_completed,
-                                                   &remote_completed);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed, &remote_completed);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_alloc);
         new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
         MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_alloc);
-        if (new_ptr != NULL) break;
+        if (new_ptr != NULL)
+            break;
 
         if (MPIDI_RMA_Pkt_orderings->flush_remote) {
             mpi_errno = MPIDI_CH3I_RMA_Free_ops_before_completion(win_ptr);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
         }
 
         MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_alloc);
         new_ptr = MPIDI_CH3I_Win_op_alloc(win_ptr);
         MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_alloc);
-        if (new_ptr != NULL) break;
+        if (new_ptr != NULL)
+            break;
 
         mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     (*e) = new_ptr;
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -595,7 +603,8 @@ static inline int MPIDI_CH3I_Win_get_op(MPID_Win * win_ptr, MPIDI_RMA_Op_t **e)
 #define FUNCNAME MPIDI_CH3I_RMA_Ops_append
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline void MPIDI_CH3I_RMA_Ops_append(MPIDI_RMA_Ops_list_t * list, MPIDI_RMA_Ops_list_t * list_tail,
+static inline void MPIDI_CH3I_RMA_Ops_append(MPIDI_RMA_Ops_list_t * list,
+                                             MPIDI_RMA_Ops_list_t * list_tail,
                                              MPIDI_RMA_Op_t * elem)
 {
     MPL_LL_APPEND(*list, *list_tail, elem);
@@ -611,7 +620,8 @@ static inline void MPIDI_CH3I_RMA_Ops_append(MPIDI_RMA_Ops_list_t * list, MPIDI_
 #define FUNCNAME MPIDI_CH3I_RMA_Ops_unlink
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline void MPIDI_CH3I_RMA_Ops_unlink(MPIDI_RMA_Ops_list_t * list, MPIDI_RMA_Ops_list_t *list_tail,
+static inline void MPIDI_CH3I_RMA_Ops_unlink(MPIDI_RMA_Ops_list_t * list,
+                                             MPIDI_RMA_Ops_list_t * list_tail,
                                              MPIDI_RMA_Op_t * elem)
 {
     MPL_LL_DELETE(*list, *list_tail, elem);
diff --git a/src/mpid/ch3/include/mpid_rma_types.h b/src/mpid/ch3/include/mpid_rma_types.h
index a1060d7..73772e4 100644
--- a/src/mpid/ch3/include/mpid_rma_types.h
+++ b/src/mpid/ch3/include/mpid_rma_types.h
@@ -83,13 +83,13 @@ typedef struct MPIDI_RMA_Target {
     struct MPIDI_RMA_Target *next;
     int target_rank;
     enum MPIDI_RMA_states access_state;
-    int lock_type; /* NONE, SHARED, EXCLUSIVE */
+    int lock_type;              /* NONE, SHARED, EXCLUSIVE */
     int lock_mode;              /* e.g., MODE_NO_CHECK */
     int accumulated_ops_cnt;
     int disable_flush_local;
     int win_complete_flag;
-    int put_acc_issued; /* indicate if PUT/ACC is issued in this epoch
-                           after the previous synchronization calls. */
+    int put_acc_issued;         /* indicate if PUT/ACC is issued in this epoch
+                                 * after the previous synchronization calls. */
 
     /* The target structure is free to be cleaned up when all of the
      * following conditions hold true:
@@ -130,11 +130,11 @@ extern MPIDI_RMA_Win_list_t *MPIDI_RMA_Win_list, *MPIDI_RMA_Win_list_tail;
 
 typedef struct MPIDI_RMA_Lock_entry {
     struct MPIDI_RMA_Lock_entry *next;
-    MPIDI_CH3_Pkt_t pkt;    /* all information for this request packet */
+    MPIDI_CH3_Pkt_t pkt;        /* all information for this request packet */
     MPIDI_VC_t *vc;
-    void *data;             /* for queued PUTs / ACCs / GACCs, data is copied here */
+    void *data;                 /* for queued PUTs / ACCs / GACCs, data is copied here */
     int data_size;
-    int all_data_recved;    /* indicate if all data has been received */
+    int all_data_recved;        /* indicate if all data has been received */
 } MPIDI_RMA_Lock_entry_t;
 
 typedef MPIDI_RMA_Op_t *MPIDI_RMA_Ops_list_t;
diff --git a/src/mpid/ch3/include/mpidpkt.h b/src/mpid/ch3/include/mpidpkt.h
index aa2db5b..0c4ff9e 100644
--- a/src/mpid/ch3/include/mpidpkt.h
+++ b/src/mpid/ch3/include/mpidpkt.h
@@ -10,10 +10,10 @@
 #include "oputil.h"
 
 #ifdef HAVE_STDINT_H
-#  include <stdint.h>
+#include <stdint.h>
 #endif
 #ifdef HAVE_INTTYPES_H
-#  include <inttypes.h>
+#include <inttypes.h>
 #endif
 
 /* Enable the use of data within the message packet for small messages */
@@ -106,9 +106,9 @@ typedef enum {
     MPIDI_CH3_PKT_END_CH3,
     /* The channel can define additional types by defining the value
      * MPIDI_CH3_PKT_ENUM */
-# if defined(MPIDI_CH3_PKT_ENUM)
+#if defined(MPIDI_CH3_PKT_ENUM)
     MPIDI_CH3_PKT_ENUM,
-# endif
+#endif
     MPIDI_CH3_PKT_END_ALL,
     MPIDI_CH3_PKT_INVALID = -1  /* forces a signed enum to quash warnings */
 } MPIDI_CH3_Pkt_type_t;
@@ -607,9 +607,9 @@ typedef struct MPIDI_CH3_Pkt_get {
     MPI_Datatype datatype;
     struct {
         /* note that we use struct here in order
-           to consistently access dataloop_size
-           by "pkt->info.dataloop_size". */
-        int dataloop_size;          /* for derived datatypes */
+         * to consistently access dataloop_size
+         * by "pkt->info.dataloop_size". */
+        int dataloop_size;      /* for derived datatypes */
     } info;
     MPI_Request request_handle;
     MPI_Win target_win_handle;
@@ -624,8 +624,8 @@ typedef struct MPIDI_CH3_Pkt_get_resp {
     /* Followings are to piggyback IMMED data */
     struct {
         /* note that we use struct here in order
-           to consistently access data
-           by "pkt->info.data". */
+         * to consistently access data
+         * by "pkt->info.data". */
         char data[MPIDI_RMA_IMMED_BYTES];
     } info;
 } MPIDI_CH3_Pkt_get_resp_t;
@@ -669,8 +669,8 @@ typedef struct MPIDI_CH3_Pkt_get_accum_resp {
     /* Followings are to piggyback IMMED data */
     struct {
         /* note that we use struct here in order
-           to consistently access data
-           by "pkt->info.data". */
+         * to consistently access data
+         * by "pkt->info.data". */
         char data[MPIDI_RMA_IMMED_BYTES];
     } info;
 } MPIDI_CH3_Pkt_get_accum_resp_t;
@@ -691,8 +691,8 @@ typedef struct MPIDI_CH3_Pkt_cas_resp {
     MPI_Request request_handle;
     struct {
         /* note that we use struct here in order
-           to consistently access data
-           by "pkt->info.data". */
+         * to consistently access data
+         * by "pkt->info.data". */
         MPIDI_CH3_CAS_Immed_u data;
     } info;
     /* followings are used to decrement ack_counter at orign */
@@ -710,8 +710,8 @@ typedef struct MPIDI_CH3_Pkt_fop {
     MPI_Win target_win_handle;
     struct {
         /* note that we use struct here in order
-           to consistently access data
-           by "pkt->info.data". */
+         * to consistently access data
+         * by "pkt->info.data". */
         char data[MPIDI_RMA_IMMED_BYTES];
     } info;
 } MPIDI_CH3_Pkt_fop_t;
@@ -721,8 +721,8 @@ typedef struct MPIDI_CH3_Pkt_fop_resp {
     MPI_Request request_handle;
     struct {
         /* note that we use struct here in order
-           to consistently access data
-           by "pkt->info.data". */
+         * to consistently access data
+         * by "pkt->info.data". */
         char data[MPIDI_RMA_IMMED_BYTES];
     } info;
     /* followings are used to decrement ack_counter at orign */
@@ -735,11 +735,11 @@ typedef struct MPIDI_CH3_Pkt_lock {
     MPIDI_CH3_Pkt_flags_t flags;
     MPI_Win target_win_handle;
     /* Note that either source_win_handle
-       or request_handle will be used. Here
-       we need both of them because PUT/GET
-       may be converted to LOCK packet,
-       PUT has source_win_handle area and
-       GET has request_handle area. */
+     * or request_handle will be used. Here
+     * we need both of them because PUT/GET
+     * may be converted to LOCK packet,
+     * PUT has source_win_handle area and
+     * GET has request_handle area. */
     MPI_Win source_win_handle;
     MPI_Request request_handle;
 } MPIDI_CH3_Pkt_lock_t;
@@ -761,7 +761,7 @@ typedef struct MPIDI_CH3_Pkt_lock_ack {
     MPIDI_CH3_Pkt_type_t type;
     MPIDI_CH3_Pkt_flags_t flags;
     /* note that either source_win_handle
-       or request_handle is used. */
+     * or request_handle is used. */
     MPI_Win source_win_handle;
     MPI_Request request_handle;
     int target_rank;
@@ -771,7 +771,7 @@ typedef struct MPIDI_CH3_Pkt_lock_op_ack {
     MPIDI_CH3_Pkt_type_t type;
     MPIDI_CH3_Pkt_flags_t flags;
     /* note that either source_win_handle
-       or request_handle is used. */
+     * or request_handle is used. */
     MPI_Win source_win_handle;
     MPI_Request request_handle;
     int target_rank;
@@ -832,9 +832,9 @@ typedef union MPIDI_CH3_Pkt {
     MPIDI_CH3_Pkt_fop_resp_t fop_resp;
     MPIDI_CH3_Pkt_get_accum_resp_t get_accum_resp;
     MPIDI_CH3_Pkt_revoke_t revoke;
-# if defined(MPIDI_CH3_PKT_DECL)
+#if defined(MPIDI_CH3_PKT_DECL)
      MPIDI_CH3_PKT_DECL
-# endif
+#endif
 } MPIDI_CH3_Pkt_t;
 
 #if defined(MPID_USE_SEQUENCE_NUMBERS)
diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index c60124a..f7b3c4d 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -64,8 +64,7 @@ static inline int send_lock_msg(int dest, int lock_type, MPID_Win * win_ptr)
 #define FUNCNAME send_unlock_msg
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int send_unlock_msg(int dest, MPID_Win * win_ptr,
-                                  MPIDI_CH3_Pkt_flags_t flags)
+static inline int send_unlock_msg(int dest, MPID_Win * win_ptr, MPIDI_CH3_Pkt_flags_t flags)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_CH3_Pkt_t upkt;
@@ -203,13 +202,13 @@ static inline int MPIDI_CH3I_Send_lock_op_ack_pkt(MPIDI_VC_t * vc, MPID_Win * wi
 #define FUNCNAME MPIDI_CH3I_Send_flush_ack_pkt
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int MPIDI_CH3I_Send_flush_ack_pkt(MPIDI_VC_t *vc, MPID_Win *win_ptr,
-                                    MPI_Win source_win_handle)
+static inline int MPIDI_CH3I_Send_flush_ack_pkt(MPIDI_VC_t * vc, MPID_Win * win_ptr,
+                                                MPI_Win source_win_handle)
 {
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_flush_ack_t *flush_ack_pkt = &upkt.flush_ack;
     MPID_Request *req;
-    int mpi_errno=MPI_SUCCESS;
+    int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SEND_FLUSH_ACK_PKT);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SEND_FLUSH_ACK_PKT);
@@ -218,20 +217,19 @@ static inline int MPIDI_CH3I_Send_flush_ack_pkt(MPIDI_VC_t *vc, MPID_Win *win_pt
     flush_ack_pkt->source_win_handle = source_win_handle;
     flush_ack_pkt->target_rank = win_ptr->comm_ptr->rank;
 
-    /* Because this is in a packet handler, it is already within a critical section */	
+    /* Because this is in a packet handler, it is already within a critical section */
     /* MPIU_THREAD_CS_ENTER(CH3COMM,vc); */
     mpi_errno = MPIDI_CH3_iStartMsg(vc, flush_ack_pkt, sizeof(*flush_ack_pkt), &req);
     /* MPIU_THREAD_CS_EXIT(CH3COMM,vc); */
     if (mpi_errno != MPI_SUCCESS) {
-	MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|rmamsg");
+        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
 
-    if (req != NULL)
-    {
+    if (req != NULL) {
         MPID_Request_release(req);
     }
 
- fn_fail:
+  fn_fail:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SEND_FLUSH_ACK_PKT);
     return mpi_errno;
 }
@@ -245,7 +243,7 @@ static inline int send_decr_at_cnt_msg(int dst, MPID_Win * win_ptr)
 {
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_decr_at_counter_t *decr_at_cnt_pkt = &upkt.decr_at_cnt;
-    MPIDI_VC_t * vc;
+    MPIDI_VC_t *vc;
     MPID_Request *request = NULL;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_SEND_DECR_AT_CNT_MSG);
@@ -256,12 +254,11 @@ static inline int send_decr_at_cnt_msg(int dst, MPID_Win * win_ptr)
 
     MPIDI_Comm_get_vc_set_active(win_ptr->comm_ptr, dst, &vc);
 
-    MPIU_THREAD_CS_ENTER(CH3COMM,vc);
-    mpi_errno = MPIDI_CH3_iStartMsg(vc, decr_at_cnt_pkt,
-                                    sizeof(*decr_at_cnt_pkt), &request);
-    MPIU_THREAD_CS_EXIT(CH3COMM,vc);
+    MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+    mpi_errno = MPIDI_CH3_iStartMsg(vc, decr_at_cnt_pkt, sizeof(*decr_at_cnt_pkt), &request);
+    MPIU_THREAD_CS_EXIT(CH3COMM, vc);
     if (mpi_errno != MPI_SUCCESS) {
-        MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|rmamsg" );
+        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
 
     if (request != NULL) {
@@ -319,10 +316,9 @@ static inline int send_flush_msg(int dest, MPID_Win * win_ptr)
 
 
 /* enqueue an unsatisfied origin in passive target at target side. */
-static inline int enqueue_lock_origin(MPID_Win *win_ptr, MPIDI_VC_t *vc,
-                                      MPIDI_CH3_Pkt_t *pkt,
-                                      MPIDI_msg_sz_t *buflen,
-                                      MPID_Request **reqp)
+static inline int enqueue_lock_origin(MPID_Win * win_ptr, MPIDI_VC_t * vc,
+                                      MPIDI_CH3_Pkt_t * pkt,
+                                      MPIDI_msg_sz_t * buflen, MPID_Request ** reqp)
 {
     MPIDI_RMA_Lock_entry_t *new_ptr = NULL;
     MPIDI_CH3_Pkt_flags_t flag;
@@ -347,8 +343,7 @@ static inline int enqueue_lock_origin(MPID_Win *win_ptr, MPIDI_VC_t *vc,
         pkt->type == MPIDI_CH3_PKT_ACCUMULATE_IMMED ||
         pkt->type == MPIDI_CH3_PKT_GET ||
         pkt->type == MPIDI_CH3_PKT_GET_ACCUM_IMMED ||
-        pkt->type == MPIDI_CH3_PKT_FOP_IMMED ||
-        pkt->type == MPIDI_CH3_PKT_CAS_IMMED) {
+        pkt->type == MPIDI_CH3_PKT_FOP_IMMED || pkt->type == MPIDI_CH3_PKT_CAS_IMMED) {
 
         /* return bytes of data processed in this pkt handler */
         (*buflen) = sizeof(MPIDI_CH3_Pkt_t);
@@ -377,8 +372,7 @@ static inline int enqueue_lock_origin(MPID_Win *win_ptr, MPIDI_VC_t *vc,
         recv_data_sz = type_size * target_count;
 
         if (new_ptr != NULL) {
-            if (win_ptr->current_lock_data_bytes + recv_data_sz
-                < MPIR_CVAR_CH3_RMA_LOCK_DATA_BYTES) {
+            if (win_ptr->current_lock_data_bytes + recv_data_sz < MPIR_CVAR_CH3_RMA_LOCK_DATA_BYTES) {
                 new_ptr->data = MPIU_Malloc(recv_data_sz);
             }
 
@@ -458,14 +452,16 @@ static inline int enqueue_lock_origin(MPID_Win *win_ptr, MPIDI_VC_t *vc,
         }
 
         mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         /* return bytes of data processed in this pkt handler */
         (*buflen) = sizeof(MPIDI_CH3_Pkt_t) + data_len;
 
         if (complete) {
             mpi_errno = MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete(vc, req, &complete);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
             if (complete) {
                 goto issue_ack;
             }
@@ -474,21 +470,28 @@ static inline int enqueue_lock_origin(MPID_Win *win_ptr, MPIDI_VC_t *vc,
         (*reqp) = req;
     }
 
- issue_ack:
+  issue_ack:
     if (pkt->type == MPIDI_CH3_PKT_LOCK) {
-        if (lock_discarded) flag = MPIDI_CH3_PKT_FLAG_RMA_LOCK_DISCARDED;
-        else flag = MPIDI_CH3_PKT_FLAG_RMA_LOCK_QUEUED_DATA_QUEUED;
+        if (lock_discarded)
+            flag = MPIDI_CH3_PKT_FLAG_RMA_LOCK_DISCARDED;
+        else
+            flag = MPIDI_CH3_PKT_FLAG_RMA_LOCK_QUEUED_DATA_QUEUED;
 
         MPIDI_CH3_PKT_RMA_GET_SOURCE_WIN_HANDLE((*pkt), source_win_handle, mpi_errno);
         MPIDI_CH3_PKT_RMA_GET_REQUEST_HANDLE((*pkt), request_handle, mpi_errno);
 
-        mpi_errno = MPIDI_CH3I_Send_lock_ack_pkt(vc, win_ptr, flag, source_win_handle, request_handle);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        mpi_errno =
+            MPIDI_CH3I_Send_lock_ack_pkt(vc, win_ptr, flag, source_win_handle, request_handle);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
     else {
-        if (lock_discarded) flag = MPIDI_CH3_PKT_FLAG_RMA_LOCK_DISCARDED;
-        else if (data_discarded) flag = MPIDI_CH3_PKT_FLAG_RMA_LOCK_QUEUED_DATA_DISCARDED;
-        else flag = MPIDI_CH3_PKT_FLAG_RMA_LOCK_QUEUED_DATA_QUEUED;
+        if (lock_discarded)
+            flag = MPIDI_CH3_PKT_FLAG_RMA_LOCK_DISCARDED;
+        else if (data_discarded)
+            flag = MPIDI_CH3_PKT_FLAG_RMA_LOCK_QUEUED_DATA_DISCARDED;
+        else
+            flag = MPIDI_CH3_PKT_FLAG_RMA_LOCK_QUEUED_DATA_QUEUED;
 
         if (pkt->type == MPIDI_CH3_PKT_PUT || pkt->type == MPIDI_CH3_PKT_PUT_IMMED ||
             pkt->type == MPIDI_CH3_PKT_ACCUMULATE || pkt->type == MPIDI_CH3_PKT_ACCUMULATE_IMMED) {
@@ -500,19 +503,20 @@ static inline int enqueue_lock_origin(MPID_Win *win_ptr, MPIDI_VC_t *vc,
             MPIDI_CH3_PKT_RMA_GET_REQUEST_HANDLE((*pkt), request_handle, mpi_errno);
         }
 
-        mpi_errno = MPIDI_CH3I_Send_lock_op_ack_pkt(vc, win_ptr, flag, source_win_handle, request_handle);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        mpi_errno =
+            MPIDI_CH3I_Send_lock_op_ack_pkt(vc, win_ptr, flag, source_win_handle, request_handle);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
 
-static inline int handle_lock_ack(MPID_Win *win_ptr, int target_rank,
-                                        MPIDI_CH3_Pkt_flags_t flags)
+static inline int handle_lock_ack(MPID_Win * win_ptr, int target_rank, MPIDI_CH3_Pkt_flags_t flags)
 {
     MPIDI_RMA_Target_t *t = NULL;
     int mpi_errno = MPI_SUCCESS;
@@ -534,7 +538,8 @@ static inline int handle_lock_ack(MPID_Win *win_ptr, int target_rank,
             else if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_DISCARDED) {
                 /* re-send lock request message. */
                 mpi_errno = send_lock_msg(target_rank, MPI_LOCK_SHARED, win_ptr);
-                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
             }
             goto fn_exit;
         }
@@ -547,13 +552,15 @@ static inline int handle_lock_ack(MPID_Win *win_ptr, int target_rank,
         else if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_DISCARDED) {
             /* re-send lock request message. */
             mpi_errno = send_lock_msg(target_rank, MPI_LOCK_SHARED, win_ptr);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
         }
         goto fn_exit;
     }
 
     mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, target_rank, &t);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
     MPIU_Assert(t != NULL);
 
     if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED)
@@ -562,26 +569,28 @@ static inline int handle_lock_ack(MPID_Win *win_ptr, int target_rank,
     if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_DISCARDED)
         t->access_state = MPIDI_RMA_LOCK_CALLED;
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
-static inline int adjust_op_piggybacked_with_lock (MPID_Win *win_ptr,
-                                                   int target_rank,
-                                                   MPIDI_CH3_Pkt_flags_t flags) {
+static inline int adjust_op_piggybacked_with_lock(MPID_Win * win_ptr,
+                                                  int target_rank, MPIDI_CH3_Pkt_flags_t flags)
+{
     MPIDI_RMA_Target_t *target = NULL;
     MPIDI_RMA_Op_t *op = NULL;
     MPIDI_CH3_Pkt_flags_t op_flags = MPIDI_CH3_PKT_FLAG_NONE;
     int mpi_errno = MPI_SUCCESS;
 
     mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, target_rank, &target);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
     MPIU_Assert(target != NULL);
 
     op = target->pending_op_list;
-    if (op != NULL) MPIDI_CH3_PKT_RMA_GET_FLAGS(op->pkt, op_flags, mpi_errno);
+    if (op != NULL)
+        MPIDI_CH3_PKT_RMA_GET_FLAGS(op->pkt, op_flags, mpi_errno);
 
     if (op_flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
         op_flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
@@ -619,7 +628,7 @@ static inline int adjust_op_piggybacked_with_lock (MPID_Win *win_ptr,
                 if (op->ureq) {
                     if (MPID_Request_is_complete(op->request)) {
                         /* Complete user request, let cleanup function to release
-                           ch3 ref */
+                         * ch3 ref */
                         MPID_Request_set_completed(op->ureq);
                     }
                     else {
@@ -637,8 +646,8 @@ static inline int adjust_op_piggybacked_with_lock (MPID_Win *win_ptr,
         else if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_QUEUED_DATA_DISCARDED ||
                  flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_DISCARDED) {
             /* We need to re-transmit this operation, so we destroy
-               the internal request and erase all flags in current
-               operation. */
+             * the internal request and erase all flags in current
+             * operation. */
             if (op->request) {
                 MPIDI_CH3_Request_destroy(op->request);
                 op->request = NULL;
@@ -650,9 +659,9 @@ static inline int adjust_op_piggybacked_with_lock (MPID_Win *win_ptr,
         }
     }
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -671,8 +680,9 @@ static inline int acquire_local_lock(MPID_Win * win_ptr, int lock_type)
 
     if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, lock_type) == 1) {
         mpi_errno = handle_lock_ack(win_ptr, win_ptr->comm_ptr->rank,
-                                          MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                                    MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
     }
     else {
         /* Queue the lock information. */
@@ -693,8 +703,9 @@ static inline int acquire_local_lock(MPID_Win * win_ptr, int lock_type)
         new_ptr = MPIDI_CH3I_Win_lock_entry_alloc(win_ptr, &pkt);
         if (new_ptr == NULL) {
             mpi_errno = handle_lock_ack(win_ptr, win_ptr->comm_ptr->rank,
-                                              MPIDI_CH3_PKT_FLAG_RMA_LOCK_DISCARDED);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                                        MPIDI_CH3_PKT_FLAG_RMA_LOCK_DISCARDED);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
             goto fn_exit;
         }
         MPL_LL_APPEND(win_ptr->lock_queue, win_ptr->lock_queue_tail, new_ptr);
@@ -725,16 +736,17 @@ static inline int MPIDI_CH3I_RMA_Handle_flush_ack(MPID_Win * win_ptr, int target
     MPIDI_RMA_Target_t *t;
 
     mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, target_rank, &t);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     t->sync.outstanding_acks--;
     MPIU_Assert(t->sync.outstanding_acks >= 0);
 
-    t->put_acc_issued = 0; /* reset PUT_ACC_FLAG after FLUSH is completed */
+    t->put_acc_issued = 0;      /* reset PUT_ACC_FLAG after FLUSH is completed */
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -752,37 +764,33 @@ static inline int do_accumulate_op(void *source_buf, void *target_buf,
 
     MPIDI_FUNC_ENTER(MPID_STATE_DO_ACCUMULATE_OP);
 
-    if (acc_op == MPI_REPLACE)
-    {
+    if (acc_op == MPI_REPLACE) {
         /* simply copy the data */
-        mpi_errno = MPIR_Localcopy(source_buf, acc_count, acc_dtp,
-                                   target_buf, acc_count, acc_dtp);
+        mpi_errno = MPIR_Localcopy(source_buf, acc_count, acc_dtp, target_buf, acc_count, acc_dtp);
         if (mpi_errno) {
-	    MPIU_ERR_POP(mpi_errno);
-	}
+            MPIU_ERR_POP(mpi_errno);
+        }
         goto fn_exit;
     }
 
-    if (HANDLE_GET_KIND(acc_op) == HANDLE_KIND_BUILTIN)
-    {
+    if (HANDLE_GET_KIND(acc_op) == HANDLE_KIND_BUILTIN) {
         /* get the function by indexing into the op table */
         uop = MPIR_OP_HDL_TO_FN(acc_op);
     }
-    else
-    {
-	/* --BEGIN ERROR HANDLING-- */
-        mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OP, "**opnotpredefined", "**opnotpredefined %d", acc_op );
+    else {
+        /* --BEGIN ERROR HANDLING-- */
+        mpi_errno =
+            MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OP,
+                                 "**opnotpredefined", "**opnotpredefined %d", acc_op);
         return mpi_errno;
-	/* --END ERROR HANDLING-- */
+        /* --END ERROR HANDLING-- */
     }
 
-    if (MPIR_DATATYPE_IS_PREDEFINED(acc_dtp))
-    {
-        (*uop)(source_buf, target_buf, &acc_count, &acc_dtp);
+    if (MPIR_DATATYPE_IS_PREDEFINED(acc_dtp)) {
+        (*uop) (source_buf, target_buf, &acc_count, &acc_dtp);
     }
-    else
-    {
-	/* derived datatype */
+    else {
+        /* derived datatype */
         MPID_Segment *segp;
         DLOOP_VECTOR *dloop_vec;
         MPI_Aint first, last;
@@ -792,63 +800,63 @@ static inline int do_accumulate_op(void *source_buf, void *target_buf,
         MPID_Datatype *dtp;
 
         segp = MPID_Segment_alloc();
-	/* --BEGIN ERROR HANDLING-- */
-        if (!segp)
-	{
-            mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0 );
-	    MPIDI_FUNC_EXIT(MPID_STATE_DO_ACCUMULATE_OP);
+        /* --BEGIN ERROR HANDLING-- */
+        if (!segp) {
+            mpi_errno =
+                MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
+                                     MPI_ERR_OTHER, "**nomem", 0);
+            MPIDI_FUNC_EXIT(MPID_STATE_DO_ACCUMULATE_OP);
             return mpi_errno;
         }
-	/* --END ERROR HANDLING-- */
-        MPID_Segment_init(NULL, acc_count,
-			  acc_dtp, segp, 0);
+        /* --END ERROR HANDLING-- */
+        MPID_Segment_init(NULL, acc_count, acc_dtp, segp, 0);
         first = 0;
-        last  = SEGMENT_IGNORE_LAST;
+        last = SEGMENT_IGNORE_LAST;
 
         MPID_Datatype_get_ptr(acc_dtp, dtp);
         vec_len = dtp->max_contig_blocks * acc_count + 1;
         /* +1 needed because Rob says so */
         dloop_vec = (DLOOP_VECTOR *)
             MPIU_Malloc(vec_len * sizeof(DLOOP_VECTOR));
-	/* --BEGIN ERROR HANDLING-- */
-        if (!dloop_vec)
-	{
-            mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0 );
-	    MPIDI_FUNC_EXIT(MPID_STATE_DO_ACCUMULATE_OP);
+        /* --BEGIN ERROR HANDLING-- */
+        if (!dloop_vec) {
+            mpi_errno =
+                MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
+                                     MPI_ERR_OTHER, "**nomem", 0);
+            MPIDI_FUNC_EXIT(MPID_STATE_DO_ACCUMULATE_OP);
             return mpi_errno;
         }
-	/* --END ERROR HANDLING-- */
+        /* --END ERROR HANDLING-- */
 
         MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);
 
         type = dtp->eltype;
         MPID_Datatype_get_size_macro(type, type_size);
-        for (i=0; i<vec_len; i++)
-	{
-            MPIU_Assign_trunc(count, (dloop_vec[i].DLOOP_VECTOR_LEN)/type_size, int);
-            (*uop)((char *)source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
-                   (char *)target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
-                   &count, &type);
+        for (i = 0; i < vec_len; i++) {
+            MPIU_Assign_trunc(count, (dloop_vec[i].DLOOP_VECTOR_LEN) / type_size, int);
+            (*uop) ((char *) source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
+                    (char *) target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
+                    &count, &type);
         }
 
         MPID_Segment_free(segp);
         MPIU_Free(dloop_vec);
     }
 
- fn_exit:
+  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_DO_ACCUMULATE_OP);
 
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
 
-static inline int check_piggyback_lock(MPID_Win *win_ptr, MPIDI_VC_t *vc,
-                                       MPIDI_CH3_Pkt_t *pkt,
-                                       MPIDI_msg_sz_t *buflen,
-                                       int *acquire_lock_fail,
-                                       MPID_Request **reqp) {
+static inline int check_piggyback_lock(MPID_Win * win_ptr, MPIDI_VC_t * vc,
+                                       MPIDI_CH3_Pkt_t * pkt,
+                                       MPIDI_msg_sz_t * buflen,
+                                       int *acquire_lock_fail, MPID_Request ** reqp)
+{
     int lock_type;
     MPIDI_CH3_Pkt_flags_t flags;
     int mpi_errno = MPI_SUCCESS;
@@ -857,8 +865,7 @@ static inline int check_piggyback_lock(MPID_Win *win_ptr, MPIDI_VC_t *vc,
     (*reqp) = NULL;
 
     MPIDI_CH3_PKT_RMA_GET_FLAGS((*pkt), flags, mpi_errno);
-    if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
-        flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
+    if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED || flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
 
         if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED)
             lock_type = MPI_LOCK_SHARED;
@@ -870,21 +877,22 @@ static inline int check_piggyback_lock(MPID_Win *win_ptr, MPIDI_VC_t *vc,
         if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, lock_type) == 0) {
             /* cannot acquire the lock, queue up this operation. */
             mpi_errno = enqueue_lock_origin(win_ptr, vc, pkt, buflen, reqp);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
             (*acquire_lock_fail) = 1;
         }
     }
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
-static inline int finish_op_on_target(MPID_Win *win_ptr, MPIDI_VC_t *vc,
+static inline int finish_op_on_target(MPID_Win * win_ptr, MPIDI_VC_t * vc,
                                       int has_response_data,
-                                      MPIDI_CH3_Pkt_flags_t flags,
-                                      MPI_Win source_win_handle) {
+                                      MPIDI_CH3_Pkt_flags_t flags, MPI_Win source_win_handle)
+{
     int mpi_errno = MPI_SUCCESS;
 
     if (!has_response_data) {
@@ -892,25 +900,24 @@ static inline int finish_op_on_target(MPID_Win *win_ptr, MPIDI_VC_t *vc,
         if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
             flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
             MPIDI_CH3_Pkt_flags_t pkt_flags = MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
-            if ((flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
-                (flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
+            if ((flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) || (flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
                 pkt_flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
             MPIU_Assert(source_win_handle != MPI_WIN_NULL);
             mpi_errno = MPIDI_CH3I_Send_lock_op_ack_pkt(vc, win_ptr,
                                                         pkt_flags,
-                                                        source_win_handle,
-                                                        MPI_REQUEST_NULL);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                                                        source_win_handle, MPI_REQUEST_NULL);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
             MPIDI_CH3_Progress_signal_completion();
         }
         if (flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) {
             if (!(flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
                   flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE)) {
                 /* If op is piggybacked with both LOCK and FLUSH,
-                   we only send LOCK ACK back, do not send FLUSH ACK. */
-                mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr,
-                                                          source_win_handle);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                 * we only send LOCK ACK back, do not send FLUSH ACK. */
+                mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, source_win_handle);
+                if (mpi_errno)
+                    MPIU_ERR_POP(mpi_errno);
             }
             MPIDI_CH3_Progress_signal_completion();
         }
@@ -925,13 +932,14 @@ static inline int finish_op_on_target(MPID_Win *win_ptr, MPIDI_VC_t *vc,
             if (!(flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
                   flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE)) {
                 /* If op is piggybacked with both LOCK and UNLOCK,
-                   we only send LOCK ACK back, do not send FLUSH (UNLOCK) ACK. */
-                mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr,
-                                                          source_win_handle);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                 * we only send LOCK ACK back, do not send FLUSH (UNLOCK) ACK. */
+                mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, source_win_handle);
+                if (mpi_errno)
+                    MPIU_ERR_POP(mpi_errno);
             }
             mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
             MPIDI_CH3_Progress_signal_completion();
         }
     }
@@ -940,7 +948,8 @@ static inline int finish_op_on_target(MPID_Win *win_ptr, MPIDI_VC_t *vc,
 
         if (flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK) {
             mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
             MPIDI_CH3_Progress_signal_completion();
         }
 
@@ -953,14 +962,14 @@ static inline int finish_op_on_target(MPID_Win *win_ptr, MPIDI_VC_t *vc,
         }
     }
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
 
-static inline int fill_ranks_in_win_grp(MPID_Win *win_ptr, MPID_Group *group_ptr,
+static inline int fill_ranks_in_win_grp(MPID_Win * win_ptr, MPID_Group * group_ptr,
                                         int *ranks_in_win_grp)
 {
     int mpi_errno = MPI_SUCCESS;
@@ -973,23 +982,27 @@ static inline int fill_ranks_in_win_grp(MPID_Win *win_ptr, MPID_Group *group_ptr
 
     MPIU_CHKLMEM_MALLOC(ranks_in_grp, int *, group_ptr->size * sizeof(int),
                         mpi_errno, "ranks_in_grp");
-    for (i = 0; i < group_ptr->size; i++) ranks_in_grp[i] = i;
+    for (i = 0; i < group_ptr->size; i++)
+        ranks_in_grp[i] = i;
 
     mpi_errno = MPIR_Comm_group_impl(win_ptr->comm_ptr, &win_grp_ptr);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     mpi_errno = MPIR_Group_translate_ranks_impl(group_ptr, group_ptr->size,
                                                 ranks_in_grp, win_grp_ptr, ranks_in_win_grp);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     mpi_errno = MPIR_Group_free_impl(win_grp_ptr);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
   fn_exit:
     MPIU_CHKLMEM_FREEALL();
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_FILL_RANKS_IN_WIN_GRP);
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
diff --git a/src/mpid/ch3/src/ch3u_handle_op_req.c b/src/mpid/ch3/src/ch3u_handle_op_req.c
index 1109b6f..9efb9fd 100644
--- a/src/mpid/ch3/src/ch3u_handle_op_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_op_req.c
@@ -40,4 +40,3 @@ int MPIDI_CH3_ReqHandler_ReqOpsComplete(MPIDI_VC_t * vc, MPID_Request * sreq, in
   fn_fail:
     goto fn_exit;
 }
-
diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index d1ae95d..8b04b1b 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -13,12 +13,11 @@ static int create_derived_datatype(MPID_Request * rreq, MPID_Datatype ** dtp);
 #define FUNCNAME MPIDI_CH3U_Handle_recv_req
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3U_Handle_recv_req(MPIDI_VC_t * vc, MPID_Request * rreq, 
-			       int * complete)
+int MPIDI_CH3U_Handle_recv_req(MPIDI_VC_t * vc, MPID_Request * rreq, int *complete)
 {
     static int in_routine ATTRIBUTE((unused)) = FALSE;
     int mpi_errno = MPI_SUCCESS;
-    int (*reqFn)(MPIDI_VC_t *, MPID_Request *, int *);
+    int (*reqFn) (MPIDI_VC_t *, MPID_Request *, int *);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_HANDLE_RECV_REQ);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_HANDLE_RECV_REQ);
@@ -28,12 +27,12 @@ int MPIDI_CH3U_Handle_recv_req(MPIDI_VC_t * vc, MPID_Request * rreq,
 
     reqFn = rreq->dev.OnDataAvail;
     if (!reqFn) {
-	MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_RECV);
-	MPIDI_CH3U_Request_complete(rreq);
-	*complete = TRUE;
+        MPIU_Assert(MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_RECV);
+        MPIDI_CH3U_Request_complete(rreq);
+        *complete = TRUE;
     }
     else {
-        mpi_errno = reqFn( vc, rreq, complete );
+        mpi_errno = reqFn(vc, rreq, complete);
     }
 
     in_routine = FALSE;
@@ -42,27 +41,26 @@ int MPIDI_CH3U_Handle_recv_req(MPIDI_VC_t * vc, MPID_Request * rreq,
 }
 
 /* ----------------------------------------------------------------------- */
-/* Here are the functions that implement the actions that are taken when 
+/* Here are the functions that implement the actions that are taken when
  * data is available for a receive request (or other completion operations)
  * These include "receive" requests that are part of the RMA implementation.
  *
  * The convention for the names of routines that are called when data is
  * available is
- *    MPIDI_CH3_ReqHandler_<type>( MPIDI_VC_t *, MPID_Request *, int * )
- * as in 
+ *    MPIDI_CH3_ReqHandler_<type>(MPIDI_VC_t *, MPID_Request *, int *)
+ * as in
  *    MPIDI_CH3_ReqHandler_...
  *
- * ToDo: 
+ * ToDo:
  *    We need a way for each of these functions to describe what they are,
  *    so that given a pointer to one of these functions, we can retrieve
- *    a description of the routine.  We may want to use a static string 
+ *    a description of the routine.  We may want to use a static string
  *    and require the user to maintain thread-safety, at least while
  *    accessing the string.
  */
 /* ----------------------------------------------------------------------- */
-int MPIDI_CH3_ReqHandler_RecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)), 
-				       MPID_Request *rreq, 
-				       int *complete )
+int MPIDI_CH3_ReqHandler_RecvComplete(MPIDI_VC_t * vc ATTRIBUTE((unused)),
+                                      MPID_Request * rreq, int *complete)
 {
     /* mark data transfer as complete and decrement CC */
     MPIDI_CH3U_Request_complete(rreq);
@@ -74,9 +72,7 @@ int MPIDI_CH3_ReqHandler_RecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
 #define FUNCNAME MPIDI_CH3_ReqHandler_PutRecvComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_PutRecvComplete( MPIDI_VC_t *vc,
-                                          MPID_Request *rreq,
-                                          int *complete )
+int MPIDI_CH3_ReqHandler_PutRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Win *win_ptr;
@@ -87,18 +83,18 @@ int MPIDI_CH3_ReqHandler_PutRecvComplete( MPIDI_VC_t *vc,
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTRECVCOMPLETE);
 
     /* NOTE: It is possible that this request is already completed before
-       entering this handler. This happens when this req handler is called
-       within the same req handler on the same request.
-       Consider this case: req is queued up in SHM queue with ref count of 2:
-       one is for completing the request and another is for dequeueing from
-       the queue. The first called req handler on this request completed
-       this request and decrement ref counter to 1. Request is still in the
-       queue. Within this handler, we call the req handler on the same request
-       for the second time (for example when making progress on SHM queue),
-       and the second called handler also tries to complete this request,
-       which leads to wrong execution.
-       Here we check if req is already completed to prevent processing the
-       same request twice. */
+     * entering this handler. This happens when this req handler is called
+     * within the same req handler on the same request.
+     * Consider this case: req is queued up in SHM queue with ref count of 2:
+     * one is for completing the request and another is for dequeueing from
+     * the queue. The first called req handler on this request completed
+     * this request and decrement ref counter to 1. Request is still in the
+     * queue. Within this handler, we call the req handler on the same request
+     * for the second time (for example when making progress on SHM queue),
+     * and the second called handler also tries to complete this request,
+     * which leads to wrong execution.
+     * Here we check if req is already completed to prevent processing the
+     * same request twice. */
     if (MPID_Request_is_complete(rreq)) {
         *complete = FALSE;
         goto fn_exit;
@@ -110,21 +106,22 @@ int MPIDI_CH3_ReqHandler_PutRecvComplete( MPIDI_VC_t *vc,
     MPIDI_CH3U_Request_complete(rreq);
 
     /* NOTE: finish_op_on_target() must be called after we complete this request,
-       because inside finish_op_on_target() we may call this request handler
-       on the same request again (in release_lock()). Marking this request as
-       completed will prevent us from processing the same request twice. */
-    mpi_errno = finish_op_on_target(win_ptr, vc, FALSE /* has no response data */,
+     * because inside finish_op_on_target() we may call this request handler
+     * on the same request again (in release_lock()). Marking this request as
+     * completed will prevent us from processing the same request twice. */
+    mpi_errno = finish_op_on_target(win_ptr, vc, FALSE /* has no response data */ ,
                                     flags, source_win_handle);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     *complete = TRUE;
 
- fn_exit:
+  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTRECVCOMPLETE);
     return MPI_SUCCESS;
 
     /* --BEGIN ERROR HANDLING-- */
- fn_fail:
+  fn_fail:
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -134,9 +131,7 @@ int MPIDI_CH3_ReqHandler_PutRecvComplete( MPIDI_VC_t *vc,
 #define FUNCNAME MPIDI_CH3_ReqHandler_AccumRecvComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_AccumRecvComplete( MPIDI_VC_t *vc,
-                                            MPID_Request *rreq,
-                                            int *complete )
+int MPIDI_CH3_ReqHandler_AccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPI_Aint true_lb, true_extent;
@@ -148,18 +143,18 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete( MPIDI_VC_t *vc,
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRECVCOMPLETE);
 
     /* NOTE: It is possible that this request is already completed before
-       entering this handler. This happens when this req handler is called
-       within the same req handler on the same request.
-       Consider this case: req is queued up in SHM queue with ref count of 2:
-       one is for completing the request and another is for dequeueing from
-       the queue. The first called req handler on this request completed
-       this request and decrement ref counter to 1. Request is still in the
-       queue. Within this handler, we call the req handler on the same request
-       for the second time (for example when making progress on SHM queue),
-       and the second called handler also tries to complete this request,
-       which leads to wrong execution.
-       Here we check if req is already completed to prevent processing the
-       same request twice. */
+     * entering this handler. This happens when this req handler is called
+     * within the same req handler on the same request.
+     * Consider this case: req is queued up in SHM queue with ref count of 2:
+     * one is for completing the request and another is for dequeueing from
+     * the queue. The first called req handler on this request completed
+     * this request and decrement ref counter to 1. Request is still in the
+     * queue. Within this handler, we call the req handler on the same request
+     * for the second time (for example when making progress on SHM queue),
+     * and the second called handler also tries to complete this request,
+     * which leads to wrong execution.
+     * Here we check if req is already completed to prevent processing the
+     * same request twice. */
     if (MPID_Request_is_complete(rreq)) {
         *complete = FALSE;
         goto fn_exit;
@@ -188,21 +183,22 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete( MPIDI_VC_t *vc,
     MPIDI_CH3U_Request_complete(rreq);
 
     /* NOTE: finish_op_on_target() must be called after we complete this request,
-       because inside finish_op_on_target() we may call this request handler
-       on the same request again (in release_lock()). Marking this request as
-       completed will prevent us from processing the same request twice. */
-    mpi_errno = finish_op_on_target(win_ptr, vc, FALSE /* has no response data */,
+     * because inside finish_op_on_target() we may call this request handler
+     * on the same request again (in release_lock()). Marking this request as
+     * completed will prevent us from processing the same request twice. */
+    mpi_errno = finish_op_on_target(win_ptr, vc, FALSE /* has no response data */ ,
                                     flags, source_win_handle);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     *complete = TRUE;
 
- fn_exit:
+  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMRECVCOMPLETE);
     return MPI_SUCCESS;
 
     /* --BEGIN ERROR HANDLING-- */
- fn_fail:
+  fn_fail:
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -212,9 +208,7 @@ int MPIDI_CH3_ReqHandler_AccumRecvComplete( MPIDI_VC_t *vc,
 #define FUNCNAME MPIDI_CH3_ReqHandler_GaccumRecvComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *vc,
-                                             MPID_Request *rreq,
-                                             int *complete )
+int MPIDI_CH3_ReqHandler_GaccumRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Win *win_ptr;
@@ -233,10 +227,10 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *vc,
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
     if ((rreq->dev.flags) & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
-    MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED);
+        MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED);
     }
     else {
-    MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
+        MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
     }
     get_accum_resp_pkt->request_handle = rreq->dev.resp_request_handle;
     get_accum_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
@@ -256,15 +250,16 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *vc,
     MPIU_Object_set_ref(resp_req, 1);
 
     if (!((rreq->dev.flags) & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP)) {
-    MPIU_CHKPMEM_MALLOC(resp_req->dev.user_buf, void *, rreq->dev.user_count * type_size,
-                        mpi_errno, "GACC resp. buffer");
+        MPIU_CHKPMEM_MALLOC(resp_req->dev.user_buf, void *, rreq->dev.user_count * type_size,
+                            mpi_errno, "GACC resp. buffer");
     }
 
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 
     if ((rreq->dev.flags) & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
-        void *src = (void *)(rreq->dev.real_user_buf), *dest = (void *)(get_accum_resp_pkt->info.data);
+        void *src = (void *) (rreq->dev.real_user_buf), *dest =
+            (void *) (get_accum_resp_pkt->info.data);
         mpi_errno = immed_copy(src, dest, rreq->dev.user_count * type_size);
         if (mpi_errno != MPI_SUCCESS) {
             if (win_ptr->shm_allocated == TRUE)
@@ -273,22 +268,25 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *vc,
         }
     }
     else {
-    if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype)) {
-        MPIU_Memcpy(resp_req->dev.user_buf, rreq->dev.real_user_buf,
-                    rreq->dev.user_count * type_size);
-    } else {
-        MPID_Segment *seg = MPID_Segment_alloc();
-        MPI_Aint last = type_size * rreq->dev.user_count;
-
-        if (seg == NULL) {
-            if (win_ptr->shm_allocated == TRUE)
-                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+        if (MPIR_DATATYPE_IS_PREDEFINED(rreq->dev.datatype)) {
+            MPIU_Memcpy(resp_req->dev.user_buf, rreq->dev.real_user_buf,
+                        rreq->dev.user_count * type_size);
+        }
+        else {
+            MPID_Segment *seg = MPID_Segment_alloc();
+            MPI_Aint last = type_size * rreq->dev.user_count;
+
+            if (seg == NULL) {
+                if (win_ptr->shm_allocated == TRUE)
+                    MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+            }
+            MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
+                                 "MPID_Segment");
+            MPID_Segment_init(rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype,
+                              seg, 0);
+            MPID_Segment_pack(seg, 0, &last, resp_req->dev.user_buf);
+            MPID_Segment_free(seg);
         }
-        MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment");
-        MPID_Segment_init(rreq->dev.real_user_buf, rreq->dev.user_count, rreq->dev.datatype, seg, 0);
-        MPID_Segment_pack(seg, 0, &last, resp_req->dev.user_buf);
-        MPID_Segment_free(seg);
-    }
     }
 
     /* accumulate data from tmp_buf into user_buf */
@@ -298,7 +296,8 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *vc,
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
 
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
 
     resp_req->dev.OnFinal = MPIDI_CH3_ReqHandler_GaccumSendComplete;
     resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumSendComplete;
@@ -306,7 +305,7 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *vc,
     resp_req->dev.flags = rreq->dev.flags;
 
     /* here we increment the Active Target counter to guarantee the GET-like
-       operation are completed when counter reaches zero. */
+     * operation are completed when counter reaches zero. */
     win_ptr->at_completion_counter++;
 
     if ((rreq->dev.flags) & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
@@ -318,14 +317,14 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *vc,
     else {
         iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
         iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)resp_req->dev.user_buf);
+        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) resp_req->dev.user_buf);
         iov[1].MPID_IOV_LEN = rreq->dev.user_count * type_size;
         iovcnt = 2;
     }
 
-    MPIU_THREAD_CS_ENTER(CH3COMM,vc);
+    MPIU_THREAD_CS_ENTER(CH3COMM, vc);
     mpi_errno = MPIDI_CH3_iSendv(vc, resp_req, iov, iovcnt);
-    MPIU_THREAD_CS_EXIT(CH3COMM,vc);
+    MPIU_THREAD_CS_EXIT(CH3COMM, vc);
 
     MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
 
@@ -337,17 +336,17 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *vc,
     /* free the temporary buffer */
     MPIR_Type_get_true_extent_impl(rreq->dev.datatype, &true_lb, &true_extent);
     MPIU_Free((char *) rreq->dev.user_buf + true_lb);
-    
+
     /* mark data transfer as complete and decrement CC */
     MPIDI_CH3U_Request_complete(rreq);
     *complete = TRUE;
- fn_exit:
+  fn_exit:
     MPIU_CHKPMEM_COMMIT();
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMRECVCOMPLETE);
     return MPI_SUCCESS;
 
     /* --BEGIN ERROR HANDLING-- */
- fn_fail:
+  fn_fail:
     MPIU_CHKPMEM_REAP();
     goto fn_exit;
     /* --END ERROR HANDLING-- */
@@ -358,9 +357,7 @@ int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *vc,
 #define FUNCNAME MPIDI_CH3_ReqHandler_FOPRecvComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_FOPRecvComplete( MPIDI_VC_t *vc,
-                                          MPID_Request *rreq,
-                                          int *complete )
+int MPIDI_CH3_ReqHandler_FOPRecvComplete(MPIDI_VC_t * vc, MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Win *win_ptr = NULL;
@@ -390,11 +387,10 @@ int MPIDI_CH3_ReqHandler_FOPRecvComplete( MPIDI_VC_t *vc,
     resp_req->dev.target_win_handle = rreq->dev.target_win_handle;
     resp_req->dev.flags = rreq->dev.flags;
 
-    MPIU_CHKPMEM_MALLOC(resp_req->dev.user_buf, void *, type_size,
-                        mpi_errno, "FOP resp. buffer");
+    MPIU_CHKPMEM_MALLOC(resp_req->dev.user_buf, void *, type_size, mpi_errno, "FOP resp. buffer");
 
     /* here we increment the Active Target counter to guarantee the GET-like
-      operation are completed when counter reaches zero. */
+     * operation are completed when counter reaches zero. */
     win_ptr->at_completion_counter++;
 
     if (win_ptr->shm_allocated == TRUE)
@@ -412,7 +408,8 @@ int MPIDI_CH3_ReqHandler_FOPRecvComplete( MPIDI_VC_t *vc,
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
 
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
 
     /* Send back data */
     MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP);
@@ -426,34 +423,34 @@ int MPIDI_CH3_ReqHandler_FOPRecvComplete( MPIDI_VC_t *vc,
         (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
         fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
 
-   iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) fop_resp_pkt;
-   iov[0].MPID_IOV_LEN = sizeof(*fop_resp_pkt);
-   iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)resp_req->dev.user_buf);
-   iov[1].MPID_IOV_LEN = type_size;
-   iovcnt = 2;
-
-   MPIU_THREAD_CS_ENTER(CH3COMM,vc);
-   mpi_errno = MPIDI_CH3_iSendv(vc, resp_req, iov, iovcnt);
-   MPIU_THREAD_CS_EXIT(CH3COMM,vc);
-
-   MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-
-   /* free the temporary buffer */
-   MPIU_Free((char *) rreq->dev.user_buf);
-
-   /* mark data transfer as complete and decrement CC */
-   MPIDI_CH3U_Request_complete(rreq);
-   *complete = TRUE;
-
- fn_exit:
-   MPIU_CHKPMEM_COMMIT();
-   MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_FOPRECVCOMPLETE);
-   return MPI_SUCCESS;
-   /* --BEGIN ERROR HANDLING-- */
- fn_fail:
-   MPIU_CHKPMEM_REAP();
-   goto fn_exit;
-   /* --END ERROR HANDLING-- */
+    iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) fop_resp_pkt;
+    iov[0].MPID_IOV_LEN = sizeof(*fop_resp_pkt);
+    iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) resp_req->dev.user_buf);
+    iov[1].MPID_IOV_LEN = type_size;
+    iovcnt = 2;
+
+    MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+    mpi_errno = MPIDI_CH3_iSendv(vc, resp_req, iov, iovcnt);
+    MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+
+    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
+    /* free the temporary buffer */
+    MPIU_Free((char *) rreq->dev.user_buf);
+
+    /* mark data transfer as complete and decrement CC */
+    MPIDI_CH3U_Request_complete(rreq);
+    *complete = TRUE;
+
+  fn_exit:
+    MPIU_CHKPMEM_COMMIT();
+    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_FOPRECVCOMPLETE);
+    return MPI_SUCCESS;
+    /* --BEGIN ERROR HANDLING-- */
+  fn_fail:
+    MPIU_CHKPMEM_REAP();
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
 }
 
 
@@ -461,49 +458,46 @@ int MPIDI_CH3_ReqHandler_FOPRecvComplete( MPIDI_VC_t *vc,
 #define FUNCNAME MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
-						   MPID_Request *rreq, 
-						   int *complete )
+int MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((unused)),
+                                                  MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Datatype *new_dtp = NULL;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTDERIVEDDTRECVCOMPLETE);
-    
+
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTDERIVEDDTRECVCOMPLETE);
-                
+
     /* create derived datatype */
     create_derived_datatype(rreq, &new_dtp);
-    
+
     /* update request to get the data */
     MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_PUT_RESP);
     rreq->dev.datatype = new_dtp->handle;
-    rreq->dev.recv_data_sz = new_dtp->size * rreq->dev.user_count; 
-    
+    rreq->dev.recv_data_sz = new_dtp->size * rreq->dev.user_count;
+
     rreq->dev.datatype_ptr = new_dtp;
     /* this will cause the datatype to be freed when the
-       request is freed. free dtype_info here. */
+     * request is freed. free dtype_info here. */
     MPIU_Free(rreq->dev.dtype_info);
-    
-    rreq->dev.segment_ptr = MPID_Segment_alloc( );
-    MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
+
+    rreq->dev.segment_ptr = MPID_Segment_alloc();
+    MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem",
+                         "**nomem %s", "MPID_Segment_alloc");
 
     MPID_Segment_init(rreq->dev.user_buf,
-		      rreq->dev.user_count,
-		      rreq->dev.datatype,
-		      rreq->dev.segment_ptr, 0);
+                      rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0);
     rreq->dev.segment_first = 0;
     rreq->dev.segment_size = rreq->dev.recv_data_sz;
-    
+
     mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
     if (mpi_errno != MPI_SUCCESS) {
-	MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
-			    "**ch3|loadrecviov");
+        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|loadrecviov");
     }
-    if (!rreq->dev.OnDataAvail) 
-	rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutRecvComplete;
-    
+    if (!rreq->dev.OnDataAvail)
+        rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutRecvComplete;
+
     *complete = FALSE;
- fn_fail:
+  fn_fail:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_PUTDERIVEDDTRECVCOMPLETE);
     return mpi_errno;
 }
@@ -512,68 +506,63 @@ int MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unu
 #define FUNCNAME MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
-						     MPID_Request *rreq, 
-						     int *complete )
+int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((unused)),
+                                                    MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Datatype *new_dtp = NULL;
     MPI_Aint true_lb, true_extent, extent;
     void *tmp_buf;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);
-    
+
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);
-    
+
     /* create derived datatype */
     create_derived_datatype(rreq, &new_dtp);
-    
+
     /* update new request to get the data */
     MPIDI_Request_set_type(rreq, MPIDI_REQUEST_TYPE_ACCUM_RESP);
-    
+
     /* first need to allocate tmp_buf to recv the data into */
-    
+
     MPIR_Type_get_true_extent_impl(new_dtp->handle, &true_lb, &true_extent);
-    MPID_Datatype_get_extent_macro(new_dtp->handle, extent); 
-    
-    tmp_buf = MPIU_Malloc(rreq->dev.user_count * 
-			  (MPIR_MAX(extent,true_extent)));  
+    MPID_Datatype_get_extent_macro(new_dtp->handle, extent);
+
+    tmp_buf = MPIU_Malloc(rreq->dev.user_count * (MPIR_MAX(extent, true_extent)));
     if (!tmp_buf) {
-	MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %d",
-		    rreq->dev.user_count * MPIR_MAX(extent,true_extent));
+        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
+                             rreq->dev.user_count * MPIR_MAX(extent, true_extent));
     }
-    
+
     /* adjust for potential negative lower bound in datatype */
-    tmp_buf = (void *)((char*)tmp_buf - true_lb);
-    
+    tmp_buf = (void *) ((char *) tmp_buf - true_lb);
+
     rreq->dev.user_buf = tmp_buf;
     rreq->dev.datatype = new_dtp->handle;
-    rreq->dev.recv_data_sz = new_dtp->size *
-	rreq->dev.user_count; 
+    rreq->dev.recv_data_sz = new_dtp->size * rreq->dev.user_count;
     rreq->dev.datatype_ptr = new_dtp;
     /* this will cause the datatype to be freed when the
-       request is freed. free dtype_info here. */
+     * request is freed. free dtype_info here. */
     MPIU_Free(rreq->dev.dtype_info);
-    
-    rreq->dev.segment_ptr = MPID_Segment_alloc( );
-    MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
+
+    rreq->dev.segment_ptr = MPID_Segment_alloc();
+    MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem",
+                         "**nomem %s", "MPID_Segment_alloc");
 
     MPID_Segment_init(rreq->dev.user_buf,
-		      rreq->dev.user_count,
-		      rreq->dev.datatype,
-		      rreq->dev.segment_ptr, 0);
+                      rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0);
     rreq->dev.segment_first = 0;
     rreq->dev.segment_size = rreq->dev.recv_data_sz;
-    
+
     mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
     if (mpi_errno != MPI_SUCCESS) {
-	MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
-			    "**ch3|loadrecviov");
+        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|loadrecviov");
     }
     if (!rreq->dev.OnDataAvail)
-	rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumRecvComplete;
-    
+        rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumRecvComplete;
+
     *complete = FALSE;
- fn_fail:
+  fn_fail:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_ACCUMDERIVEDDTRECVCOMPLETE);
     return mpi_errno;
 }
@@ -583,9 +572,8 @@ int MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((u
 #define FUNCNAME MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
-                                                      MPID_Request *rreq,
-                                                      int *complete )
+int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(MPIDI_VC_t * vc ATTRIBUTE((unused)),
+                                                     MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Datatype *new_dtp = NULL;
@@ -608,45 +596,41 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((
     MPIR_Type_get_true_extent_impl(new_dtp->handle, &true_lb, &true_extent);
     MPID_Datatype_get_extent_macro(new_dtp->handle, extent);
 
-    tmp_buf = MPIU_Malloc(rreq->dev.user_count *
-			  (MPIR_MAX(extent,true_extent)));
+    tmp_buf = MPIU_Malloc(rreq->dev.user_count * (MPIR_MAX(extent, true_extent)));
     if (!tmp_buf) {
-	MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %d",
-		    rreq->dev.user_count * MPIR_MAX(extent,true_extent));
+        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
+                             rreq->dev.user_count * MPIR_MAX(extent, true_extent));
     }
 
     /* adjust for potential negative lower bound in datatype */
-    tmp_buf = (void *)((char*)tmp_buf - true_lb);
+    tmp_buf = (void *) ((char *) tmp_buf - true_lb);
 
     rreq->dev.user_buf = tmp_buf;
     rreq->dev.datatype = new_dtp->handle;
-    rreq->dev.recv_data_sz = new_dtp->size *
-	rreq->dev.user_count;
+    rreq->dev.recv_data_sz = new_dtp->size * rreq->dev.user_count;
     rreq->dev.datatype_ptr = new_dtp;
     /* this will cause the datatype to be freed when the
-       request is freed. free dtype_info here. */
+     * request is freed. free dtype_info here. */
     MPIU_Free(rreq->dev.dtype_info);
 
-    rreq->dev.segment_ptr = MPID_Segment_alloc( );
-    MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
+    rreq->dev.segment_ptr = MPID_Segment_alloc();
+    MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem",
+                         "**nomem %s", "MPID_Segment_alloc");
 
     MPID_Segment_init(rreq->dev.user_buf,
-		      rreq->dev.user_count,
-		      rreq->dev.datatype,
-		      rreq->dev.segment_ptr, 0);
+                      rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0);
     rreq->dev.segment_first = 0;
     rreq->dev.segment_size = rreq->dev.recv_data_sz;
 
     mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
     if (mpi_errno != MPI_SUCCESS) {
-	MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
-			    "**ch3|loadrecviov");
+        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|loadrecviov");
     }
     if (!rreq->dev.OnDataAvail)
-	rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumRecvComplete;
+        rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumRecvComplete;
 
     *complete = FALSE;
- fn_fail:
+  fn_fail:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMDERIVEDDTRECVCOMPLETE);
     return mpi_errno;
 }
@@ -657,20 +641,19 @@ int MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete( MPIDI_VC_t *vc ATTRIBUTE((
 #define FUNCNAME MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *vc,
-						   MPID_Request *rreq, 
-						   int *complete )
+int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete(MPIDI_VC_t * vc,
+                                                  MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Datatype *new_dtp = NULL;
     MPIDI_CH3_Pkt_t upkt;
-    MPIDI_CH3_Pkt_get_resp_t * get_resp_pkt = &upkt.get_resp;
-    MPID_Request * sreq;
+    MPIDI_CH3_Pkt_get_resp_t *get_resp_pkt = &upkt.get_resp;
+    MPID_Request *sreq;
     MPID_Win *win_ptr;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_GETDERIVEDDTRECVCOMPLETE);
-    
+
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GETDERIVEDDTRECVCOMPLETE);
-                
+
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
     MPIU_Assert(!(rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP));
@@ -678,24 +661,24 @@ int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *vc,
     /* create derived datatype */
     create_derived_datatype(rreq, &new_dtp);
     MPIU_Free(rreq->dev.dtype_info);
-    
+
     /* create request for sending data */
     sreq = MPID_Request_create();
-    MPIU_ERR_CHKANDJUMP(sreq == NULL, mpi_errno,MPI_ERR_OTHER,"**nomemreq");
-    
+    MPIU_ERR_CHKANDJUMP(sreq == NULL, mpi_errno, MPI_ERR_OTHER, "**nomemreq");
+
     sreq->kind = MPID_REQUEST_SEND;
     MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_GET_RESP);
     sreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GetSendComplete;
-    sreq->dev.OnFinal     = MPIDI_CH3_ReqHandler_GetSendComplete;
+    sreq->dev.OnFinal = MPIDI_CH3_ReqHandler_GetSendComplete;
     sreq->dev.user_buf = rreq->dev.user_buf;
     sreq->dev.user_count = rreq->dev.user_count;
     sreq->dev.datatype = new_dtp->handle;
     sreq->dev.datatype_ptr = new_dtp;
     sreq->dev.target_win_handle = rreq->dev.target_win_handle;
     sreq->dev.flags = rreq->dev.flags;
-    
+
     MPIDI_Pkt_init(get_resp_pkt, MPIDI_CH3_PKT_GET_RESP);
-    get_resp_pkt->request_handle = rreq->dev.request_handle;    
+    get_resp_pkt->request_handle = rreq->dev.request_handle;
     get_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
     get_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
     if (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
@@ -704,36 +687,34 @@ int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *vc,
     if ((rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
         (rreq->dev.flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
         get_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
-    
-    sreq->dev.segment_ptr = MPID_Segment_alloc( );
-    MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
+
+    sreq->dev.segment_ptr = MPID_Segment_alloc();
+    MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem",
+                         "**nomem %s", "MPID_Segment_alloc");
 
     MPID_Segment_init(sreq->dev.user_buf,
-		      sreq->dev.user_count,
-		      sreq->dev.datatype,
-		      sreq->dev.segment_ptr, 0);
+                      sreq->dev.user_count, sreq->dev.datatype, sreq->dev.segment_ptr, 0);
     sreq->dev.segment_first = 0;
     sreq->dev.segment_size = new_dtp->size * sreq->dev.user_count;
 
-    /* Because this is in a packet handler, it is already within a critical section */	
+    /* Because this is in a packet handler, it is already within a critical section */
     /* MPIU_THREAD_CS_ENTER(CH3COMM,vc); */
     mpi_errno = vc->sendNoncontig_fn(vc, sreq, get_resp_pkt, sizeof(*get_resp_pkt));
     /* MPIU_THREAD_CS_EXIT(CH3COMM,vc); */
     /* --BEGIN ERROR HANDLING-- */
-    if (mpi_errno != MPI_SUCCESS)
-    {
+    if (mpi_errno != MPI_SUCCESS) {
         MPID_Request_release(sreq);
         sreq = NULL;
-        MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|rmamsg");
+        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
     /* --END ERROR HANDLING-- */
-    
-    /* mark receive data transfer as complete and decrement CC in receive 
-       request */
+
+    /* mark receive data transfer as complete and decrement CC in receive
+     * request */
     MPIDI_CH3U_Request_complete(rreq);
     *complete = TRUE;
-    
- fn_fail:
+
+  fn_fail:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GETDERIVEDDTRECVCOMPLETE);
     return mpi_errno;
 }
@@ -743,35 +724,31 @@ int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *vc,
 #define FUNCNAME MPIDI_CH3_ReqHandler_UnpackUEBufComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_UnpackUEBufComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)), 
-					      MPID_Request *rreq, 
-					      int *complete )
+int MPIDI_CH3_ReqHandler_UnpackUEBufComplete(MPIDI_VC_t * vc ATTRIBUTE((unused)),
+                                             MPID_Request * rreq, int *complete)
 {
     int recv_pending;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKUEBUFCOMPLETE);
-    
+
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKUEBUFCOMPLETE);
-    
+
     MPIDI_Request_decr_pending(rreq);
     MPIDI_Request_check_pending(rreq, &recv_pending);
-    if (!recv_pending)
-    { 
-	if (rreq->dev.recv_data_sz > 0)
-	{
-	    MPIDI_CH3U_Request_unpack_uebuf(rreq);
-	    MPIU_Free(rreq->dev.tmpbuf);
-	}
-    }
-    else
-    {
-	/* The receive has not been posted yet.  MPID_{Recv/Irecv}() 
-	   is responsible for unpacking the buffer. */
-    }
-    
+    if (!recv_pending) {
+        if (rreq->dev.recv_data_sz > 0) {
+            MPIDI_CH3U_Request_unpack_uebuf(rreq);
+            MPIU_Free(rreq->dev.tmpbuf);
+        }
+    }
+    else {
+        /* The receive has not been posted yet.  MPID_{Recv/Irecv}()
+         * is responsible for unpacking the buffer. */
+    }
+
     /* mark data transfer as complete and decrement CC */
     MPIDI_CH3U_Request_complete(rreq);
     *complete = TRUE;
-    
+
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKUEBUFCOMPLETE);
     return MPI_SUCCESS;
 }
@@ -780,36 +757,28 @@ int MPIDI_CH3_ReqHandler_UnpackUEBufComplete( MPIDI_VC_t *vc ATTRIBUTE((unused))
 #define FUNCNAME MPIDI_CH3_ReqHandler_UnpackSRBufComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_UnpackSRBufComplete( MPIDI_VC_t *vc, 
-					      MPID_Request *rreq, 
-					      int *complete )
+int MPIDI_CH3_ReqHandler_UnpackSRBufComplete(MPIDI_VC_t * vc, MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFCOMPLETE);
-    
+
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFCOMPLETE);
 
     MPIDI_CH3U_Request_unpack_srbuf(rreq);
 
-    if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_PUT_RESP)
-    {
-	mpi_errno = MPIDI_CH3_ReqHandler_PutRecvComplete(
-	    vc, rreq, complete );
+    if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_PUT_RESP) {
+        mpi_errno = MPIDI_CH3_ReqHandler_PutRecvComplete(vc, rreq, complete);
     }
-    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RESP)
-    {
-	mpi_errno = MPIDI_CH3_ReqHandler_AccumRecvComplete(
-	    vc, rreq, complete );
+    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RESP) {
+        mpi_errno = MPIDI_CH3_ReqHandler_AccumRecvComplete(vc, rreq, complete);
     }
-    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RESP)
-    {
-	mpi_errno = MPIDI_CH3_ReqHandler_GaccumRecvComplete(
-	    vc, rreq, complete );
+    else if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_GET_ACCUM_RESP) {
+        mpi_errno = MPIDI_CH3_ReqHandler_GaccumRecvComplete(vc, rreq, complete);
     }
     else {
-	/* mark data transfer as complete and decrement CC */
-	MPIDI_CH3U_Request_complete(rreq);
-	*complete = TRUE;
+        /* mark data transfer as complete and decrement CC */
+        MPIDI_CH3U_Request_complete(rreq);
+        *complete = TRUE;
     }
 
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFCOMPLETE);
@@ -820,22 +789,21 @@ int MPIDI_CH3_ReqHandler_UnpackSRBufComplete( MPIDI_VC_t *vc,
 #define FUNCNAME MPIDI_CH3_ReqHandler_UnpackSRBufReloadIOV
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_UnpackSRBufReloadIOV( MPIDI_VC_t *vc ATTRIBUTE((unused)), 
-					      MPID_Request *rreq, 
-					      int *complete )
+int MPIDI_CH3_ReqHandler_UnpackSRBufReloadIOV(MPIDI_VC_t * vc ATTRIBUTE((unused)),
+                                              MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFRELOADIOV);
-    
+
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFRELOADIOV);
 
     MPIDI_CH3U_Request_unpack_srbuf(rreq);
     mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
     if (mpi_errno != MPI_SUCCESS) {
-	MPIU_ERR_SETFATALANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|loadrecviov" );
+        MPIU_ERR_SETFATALANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|loadrecviov");
     }
     *complete = FALSE;
- fn_fail:
+  fn_fail:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_UNPACKSRBUFRELOADIOV);
     return mpi_errno;
 }
@@ -844,20 +812,20 @@ int MPIDI_CH3_ReqHandler_UnpackSRBufReloadIOV( MPIDI_VC_t *vc ATTRIBUTE((unused)
 #define FUNCNAME MPIDI_CH3_ReqHandler_ReloadIOV
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_ReloadIOV( MPIDI_VC_t *vc ATTRIBUTE((unused)), 
-				    MPID_Request *rreq, int *complete )
+int MPIDI_CH3_ReqHandler_ReloadIOV(MPIDI_VC_t * vc ATTRIBUTE((unused)),
+                                   MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_REQHANDLER_RELOADIOV);
-    
+
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_RELOADIOV);
 
     mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
     if (mpi_errno != MPI_SUCCESS) {
-	MPIU_ERR_SETFATALANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|loadrecviov");
+        MPIU_ERR_SETFATALANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|loadrecviov");
     }
     *complete = FALSE;
- fn_fail:
+  fn_fail:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_RELOADIOV);
     return mpi_errno;
 }
@@ -869,14 +837,14 @@ int MPIDI_CH3_ReqHandler_ReloadIOV( MPIDI_VC_t *vc ATTRIBUTE((unused)),
 #define FUNCNAME create_derived_datatype
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static int create_derived_datatype(MPID_Request *req, MPID_Datatype **dtp)
+static int create_derived_datatype(MPID_Request * req, MPID_Datatype ** dtp)
 {
     MPIDI_RMA_dtype_info *dtype_info;
     MPID_Datatype *new_dtp;
-    int mpi_errno=MPI_SUCCESS;
+    int mpi_errno = MPI_SUCCESS;
     MPI_Aint ptrdiff;
     MPIDI_STATE_DECL(MPID_STATE_CREATE_DERIVED_DATATYPE);
-    
+
     MPIDI_FUNC_ENTER(MPID_STATE_CREATE_DERIVED_DATATYPE);
 
     dtype_info = req->dev.dtype_info;
@@ -884,29 +852,29 @@ static int create_derived_datatype(MPID_Request *req, MPID_Datatype **dtp)
     /* allocate new datatype object and handle */
     new_dtp = (MPID_Datatype *) MPIU_Handle_obj_alloc(&MPID_Datatype_mem);
     if (!new_dtp) {
-	MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %s",
-			     "MPID_Datatype_mem" );
+        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
+                             "MPID_Datatype_mem");
     }
 
     *dtp = new_dtp;
-            
+
     /* Note: handle is filled in by MPIU_Handle_obj_alloc() */
     MPIU_Object_set_ref(new_dtp, 1);
     new_dtp->is_permanent = 0;
     new_dtp->is_committed = 1;
-    new_dtp->attributes   = 0;
-    new_dtp->cache_id     = 0;
-    new_dtp->name[0]      = 0;
+    new_dtp->attributes = 0;
+    new_dtp->cache_id = 0;
+    new_dtp->name[0] = 0;
     new_dtp->is_contig = dtype_info->is_contig;
-    new_dtp->max_contig_blocks = dtype_info->max_contig_blocks; 
+    new_dtp->max_contig_blocks = dtype_info->max_contig_blocks;
     new_dtp->size = dtype_info->size;
     new_dtp->extent = dtype_info->extent;
     new_dtp->dataloop_size = dtype_info->dataloop_size;
-    new_dtp->dataloop_depth = dtype_info->dataloop_depth; 
+    new_dtp->dataloop_depth = dtype_info->dataloop_depth;
     new_dtp->eltype = dtype_info->eltype;
     /* set dataloop pointer */
     new_dtp->dataloop = req->dev.dataloop;
-    
+
     new_dtp->ub = dtype_info->ub;
     new_dtp->lb = dtype_info->lb;
     new_dtp->true_ub = dtype_info->true_ub;
@@ -914,9 +882,9 @@ static int create_derived_datatype(MPID_Request *req, MPID_Datatype **dtp)
     new_dtp->has_sticky_ub = dtype_info->has_sticky_ub;
     new_dtp->has_sticky_lb = dtype_info->has_sticky_lb;
     /* update pointers in dataloop */
-    ptrdiff = (MPI_Aint)((char *) (new_dtp->dataloop) - (char *)
-                         (dtype_info->dataloop));
-    
+    ptrdiff = (MPI_Aint) ((char *) (new_dtp->dataloop) - (char *)
+                          (dtype_info->dataloop));
+
     /* FIXME: Temp to avoid SEGV when memory tracing */
     new_dtp->hetero_dloop = 0;
 
@@ -924,14 +892,14 @@ static int create_derived_datatype(MPID_Request *req, MPID_Datatype **dtp)
 
     new_dtp->contents = NULL;
 
- fn_fail:
+  fn_fail:
     MPIDI_FUNC_EXIT(MPID_STATE_CREATE_DERIVED_DATATYPE);
 
     return mpi_errno;
 }
 
 
-static inline int perform_put_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
+static inline int perform_put_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_entry_t * lock_entry)
 {
     MPIDI_CH3_Pkt_put_t *put_pkt = &((lock_entry->pkt).put);
     int mpi_errno = MPI_SUCCESS;
@@ -946,28 +914,31 @@ static inline int perform_put_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_en
         /* all data fits in packet header */
         mpi_errno = MPIR_Localcopy(put_pkt->info.data, put_pkt->count, put_pkt->datatype,
                                    put_pkt->addr, put_pkt->count, put_pkt->datatype);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
     else {
         MPIU_Assert(put_pkt->type == MPIDI_CH3_PKT_PUT);
 
         mpi_errno = MPIR_Localcopy(lock_entry->data, put_pkt->count, put_pkt->datatype,
                                    put_pkt->addr, put_pkt->count, put_pkt->datatype);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     /* do final action */
-    mpi_errno = finish_op_on_target(win_ptr, lock_entry->vc, FALSE /* has no response data */,
+    mpi_errno = finish_op_on_target(win_ptr, lock_entry->vc, FALSE /* has no response data */ ,
                                     put_pkt->flags, put_pkt->source_win_handle);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
-static inline int perform_get_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
+static inline int perform_get_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_entry_t * lock_entry)
 {
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_get_resp_t *get_resp_pkt = &upkt.get_resp;
@@ -1000,14 +971,14 @@ static inline int perform_get_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_en
     sreq->dev.flags = get_pkt->flags;
 
     /* here we increment the Active Target counter to guarantee the GET-like
-       operation are completed when counter reaches zero. */
+     * operation are completed when counter reaches zero. */
     win_ptr->at_completion_counter++;
 
     if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
         MPIDI_Pkt_init(get_resp_pkt, MPIDI_CH3_PKT_GET_RESP_IMMED);
     }
     else {
-    MPIDI_Pkt_init(get_resp_pkt, MPIDI_CH3_PKT_GET_RESP);
+        MPIDI_Pkt_init(get_resp_pkt, MPIDI_CH3_PKT_GET_RESP);
     }
     get_resp_pkt->request_handle = get_pkt->request_handle;
     get_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
@@ -1024,9 +995,10 @@ static inline int perform_get_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_en
     MPIU_Assign_trunc(len, get_pkt->count * type_size, size_t);
 
     if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
-        void *src = (void *)(get_pkt->addr), *dest = (void *)(get_resp_pkt->info.data);
+        void *src = (void *) (get_pkt->addr), *dest = (void *) (get_resp_pkt->info.data);
         mpi_errno = immed_copy(src, dest, len);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
@@ -1046,17 +1018,17 @@ static inline int perform_get_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_en
     mpi_errno = MPIDI_CH3_iSendv(lock_entry->vc, sreq, iov, iovcnt);
     if (mpi_errno != MPI_SUCCESS) {
         MPID_Request_release(sreq);
-	MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
 
-static inline int perform_acc_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
+static inline int perform_acc_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_entry_t * lock_entry)
 {
     MPIDI_CH3_Pkt_accum_t *acc_pkt = &((lock_entry->pkt).accum);
     int mpi_errno = MPI_SUCCESS;
@@ -1084,20 +1056,23 @@ static inline int perform_acc_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_en
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
 
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
-    mpi_errno = finish_op_on_target(win_ptr, lock_entry->vc, FALSE /* has no response data */,
+    mpi_errno = finish_op_on_target(win_ptr, lock_entry->vc, FALSE /* has no response data */ ,
                                     acc_pkt->flags, acc_pkt->source_win_handle);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
 
-static inline int perform_get_acc_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
+static inline int perform_get_acc_in_lock_queue(MPID_Win * win_ptr,
+                                                MPIDI_RMA_Lock_entry_t * lock_entry)
 {
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_get_accum_resp_t *get_accum_resp_pkt = &upkt.get_accum_resp;
@@ -1132,7 +1107,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Loc
     /* Copy data into a temporary buffer */
     MPID_Datatype_get_size_macro(get_accum_pkt->datatype, type_size);
     if (!(get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP))
-    sreq->dev.user_buf = (void *)MPIU_Malloc(get_accum_pkt->count * type_size);
+        sreq->dev.user_buf = (void *) MPIU_Malloc(get_accum_pkt->count * type_size);
     else {
         MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED);
     }
@@ -1145,7 +1120,8 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Loc
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 
     if (get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
-        void *src = (void *)(get_accum_pkt->addr), *dest = (void *)(get_accum_resp_pkt->info.data);
+        void *src = (void *) (get_accum_pkt->addr), *dest =
+            (void *) (get_accum_resp_pkt->info.data);
         mpi_errno = immed_copy(src, dest, len);
         if (mpi_errno != MPI_SUCCESS) {
             if (win_ptr->shm_allocated == TRUE)
@@ -1154,48 +1130,52 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Loc
         }
     }
     else {
-    if (MPIR_DATATYPE_IS_PREDEFINED(get_accum_pkt->datatype)) {
-        MPIU_Memcpy(sreq->dev.user_buf, get_accum_pkt->addr,
-                    get_accum_pkt->count * type_size);
-    } else {
-        MPID_Segment *seg = MPID_Segment_alloc();
-        MPI_Aint last = type_size * get_accum_pkt->count;
-
-        if (seg == NULL) {
-            if (win_ptr->shm_allocated == TRUE)
-                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+        if (MPIR_DATATYPE_IS_PREDEFINED(get_accum_pkt->datatype)) {
+            MPIU_Memcpy(sreq->dev.user_buf, get_accum_pkt->addr, get_accum_pkt->count * type_size);
+        }
+        else {
+            MPID_Segment *seg = MPID_Segment_alloc();
+            MPI_Aint last = type_size * get_accum_pkt->count;
+
+            if (seg == NULL) {
+                if (win_ptr->shm_allocated == TRUE)
+                    MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+            }
+            MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
+                                 "MPID_Segment");
+            MPID_Segment_init(get_accum_pkt->addr, get_accum_pkt->count, get_accum_pkt->datatype,
+                              seg, 0);
+            MPID_Segment_pack(seg, 0, &last, sreq->dev.user_buf);
+            MPID_Segment_free(seg);
         }
-        MPIU_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment");
-        MPID_Segment_init(get_accum_pkt->addr, get_accum_pkt->count,
-                          get_accum_pkt->datatype, seg, 0);
-        MPID_Segment_pack(seg, 0, &last, sreq->dev.user_buf);
-        MPID_Segment_free(seg);
-    }
     }
 
     if (get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM_IMMED) {
         /* All data fits in packet header */
         mpi_errno = do_accumulate_op(get_accum_pkt->info.data, get_accum_pkt->addr,
-                                     get_accum_pkt->count, get_accum_pkt->datatype, get_accum_pkt->op);
+                                     get_accum_pkt->count, get_accum_pkt->datatype,
+                                     get_accum_pkt->op);
     }
     else {
         MPIU_Assert(get_accum_pkt->type == MPIDI_CH3_PKT_GET_ACCUM);
 
         mpi_errno = do_accumulate_op(lock_entry->data, get_accum_pkt->addr,
-                                     get_accum_pkt->count, get_accum_pkt->datatype, get_accum_pkt->op);
+                                     get_accum_pkt->count, get_accum_pkt->datatype,
+                                     get_accum_pkt->op);
     }
 
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
 
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     /* here we increment the Active Target counter to guarantee the GET-like
-       operation are completed when counter reaches zero. */
+     * operation are completed when counter reaches zero. */
     win_ptr->at_completion_counter++;
 
     if (!(get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP)) {
-    MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
+        MPIDI_Pkt_init(get_accum_resp_pkt, MPIDI_CH3_PKT_GET_ACCUM_RESP);
     }
     get_accum_resp_pkt->request_handle = get_accum_pkt->request_handle;
     get_accum_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
@@ -1216,7 +1196,7 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Loc
     else {
         iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
         iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)sreq->dev.user_buf);
+        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) sreq->dev.user_buf);
         iov[1].MPID_IOV_LEN = get_accum_pkt->count * type_size;
         iovcnt = 2;
     }
@@ -1224,17 +1204,17 @@ static inline int perform_get_acc_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Loc
     mpi_errno = MPIDI_CH3_iSendv(lock_entry->vc, sreq, iov, iovcnt);
     if (mpi_errno != MPI_SUCCESS) {
         MPID_Request_release(sreq);
-	MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
     }
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
 
-static inline int perform_fop_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
+static inline int perform_fop_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_entry_t * lock_entry)
 {
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_fop_resp_t *fop_resp_pkt = &upkt.fop_resp;
@@ -1252,7 +1232,7 @@ static inline int perform_fop_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_en
     MPIU_Assert(lock_entry->all_data_recved == 1);
 
     /* FIXME: this function is same with PktHandler_FOP(), should
-       do code refactoring on both of them. */
+     * do code refactoring on both of them. */
 
     MPID_Datatype_get_size_macro(fop_pkt->datatype, type_size);
 
@@ -1260,7 +1240,7 @@ static inline int perform_fop_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_en
         MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP_IMMED);
     }
     else {
-    MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP);
+        MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP);
     }
 
     fop_resp_pkt->request_handle = fop_pkt->request_handle;
@@ -1286,10 +1266,10 @@ static inline int perform_fop_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_en
         resp_req->dev.target_win_handle = win_ptr->handle;
         resp_req->dev.flags = fop_pkt->flags;
 
-        resp_req->dev.user_buf = (void *)MPIU_Malloc(type_size);
+        resp_req->dev.user_buf = (void *) MPIU_Malloc(type_size);
 
         /* here we increment the Active Target counter to guarantee the GET-like
-           operation are completed when counter reaches zero. */
+         * operation are completed when counter reaches zero. */
         win_ptr->at_completion_counter++;
     }
 
@@ -1297,68 +1277,69 @@ static inline int perform_fop_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_en
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 
     if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
-    /* copy data to resp pkt header */
-    void *src = fop_pkt->addr, *dest = fop_resp_pkt->info.data;
-    mpi_errno = immed_copy(src, dest, type_size);
-    if (mpi_errno != MPI_SUCCESS) {
-        if (win_ptr->shm_allocated == TRUE)
-            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-        MPIU_ERR_POP(mpi_errno);
-    }
+        /* copy data to resp pkt header */
+        void *src = fop_pkt->addr, *dest = fop_resp_pkt->info.data;
+        mpi_errno = immed_copy(src, dest, type_size);
+        if (mpi_errno != MPI_SUCCESS) {
+            if (win_ptr->shm_allocated == TRUE)
+                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+            MPIU_ERR_POP(mpi_errno);
+        }
     }
     else {
-        MPIU_Memcpy(resp_req->dev.user_buf, fop_pkt->addr,
-                    type_size);
+        MPIU_Memcpy(resp_req->dev.user_buf, fop_pkt->addr, type_size);
     }
 
     /* Apply the op */
     if (fop_pkt->op != MPI_NO_OP) {
         if (fop_pkt->type == MPIDI_CH3_PKT_FOP_IMMED) {
-        mpi_errno = do_accumulate_op(fop_pkt->info.data, fop_pkt->addr,
-                                     1, fop_pkt->datatype, fop_pkt->op);
+            mpi_errno = do_accumulate_op(fop_pkt->info.data, fop_pkt->addr,
+                                         1, fop_pkt->datatype, fop_pkt->op);
         }
         else {
-        mpi_errno = do_accumulate_op(lock_entry->data, fop_pkt->addr,
-                                     1, fop_pkt->datatype, fop_pkt->op);
+            mpi_errno = do_accumulate_op(lock_entry->data, fop_pkt->addr,
+                                         1, fop_pkt->datatype, fop_pkt->op);
         }
     }
 
     if (win_ptr->shm_allocated == TRUE)
         MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
 
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
-    /* send back the original data */
-    MPIU_THREAD_CS_ENTER(CH3COMM,lock_entry->vc);
-    mpi_errno = MPIDI_CH3_iStartMsg(lock_entry->vc, fop_resp_pkt, sizeof(*fop_resp_pkt), &resp_req);
-    MPIU_THREAD_CS_EXIT(CH3COMM,lock_entry->vc);
-    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
-
-    if (resp_req != NULL) {
-        if (!MPID_Request_is_complete(resp_req)) {
-            /* sending process is not completed, set proper OnDataAvail
-               (it is initialized to NULL by lower layer) */
-            resp_req->dev.target_win_handle = fop_pkt->target_win_handle;
-            resp_req->dev.flags = fop_pkt->flags;
-            resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_FOPSendComplete;
-
-            /* here we increment the Active Target counter to guarantee the GET-like
-               operation are completed when counter reaches zero. */
-            win_ptr->at_completion_counter++;
-
-            MPID_Request_release(resp_req);
-            goto fn_exit;
+        /* send back the original data */
+        MPIU_THREAD_CS_ENTER(CH3COMM, lock_entry->vc);
+        mpi_errno =
+            MPIDI_CH3_iStartMsg(lock_entry->vc, fop_resp_pkt, sizeof(*fop_resp_pkt), &resp_req);
+        MPIU_THREAD_CS_EXIT(CH3COMM, lock_entry->vc);
+        MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+
+        if (resp_req != NULL) {
+            if (!MPID_Request_is_complete(resp_req)) {
+                /* sending process is not completed, set proper OnDataAvail
+                 * (it is initialized to NULL by lower layer) */
+                resp_req->dev.target_win_handle = fop_pkt->target_win_handle;
+                resp_req->dev.flags = fop_pkt->flags;
+                resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_FOPSendComplete;
+
+                /* here we increment the Active Target counter to guarantee the GET-like
+                 * operation are completed when counter reaches zero. */
+                win_ptr->at_completion_counter++;
+
+                MPID_Request_release(resp_req);
+                goto fn_exit;
+            }
+            else {
+                MPID_Request_release(resp_req);
+            }
         }
-        else {
-            MPID_Request_release(resp_req);
-        }
-    }
     }
     else {
         iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) fop_resp_pkt;
         iov[0].MPID_IOV_LEN = sizeof(*fop_resp_pkt);
-        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)resp_req->dev.user_buf);
+        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) resp_req->dev.user_buf);
         iov[1].MPID_IOV_LEN = type_size;
         iovcnt = 2;
 
@@ -1371,18 +1352,19 @@ static inline int perform_fop_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_en
     }
 
     /* do final action */
-    mpi_errno = finish_op_on_target(win_ptr, lock_entry->vc, TRUE /* has response data */,
+    mpi_errno = finish_op_on_target(win_ptr, lock_entry->vc, TRUE /* has response data */ ,
                                     fop_pkt->flags, MPI_WIN_NULL);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
 
-static inline int perform_cas_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
+static inline int perform_cas_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_entry_t * lock_entry)
 {
     MPIDI_CH3_Pkt_t upkt;
     MPIDI_CH3_Pkt_cas_resp_t *cas_resp_pkt = &upkt.cas_resp;
@@ -1435,13 +1417,13 @@ static inline int perform_cas_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_en
     if (send_req != NULL) {
         if (!MPID_Request_is_complete(send_req)) {
             /* sending process is not completed, set proper OnDataAvail
-               (it is initialized to NULL by lower layer) */
+             * (it is initialized to NULL by lower layer) */
             send_req->dev.target_win_handle = cas_pkt->target_win_handle;
             send_req->dev.flags = cas_pkt->flags;
             send_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_CASSendComplete;
 
             /* here we increment the Active Target counter to guarantee the GET-like
-               operation are completed when counter reaches zero. */
+             * operation are completed when counter reaches zero. */
             win_ptr->at_completion_counter++;
 
             MPID_Request_release(send_req);
@@ -1452,18 +1434,19 @@ static inline int perform_cas_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_en
     }
 
     /* do final action */
-    mpi_errno = finish_op_on_target(win_ptr, lock_entry->vc, TRUE /* has response data */,
+    mpi_errno = finish_op_on_target(win_ptr, lock_entry->vc, TRUE /* has response data */ ,
                                     cas_pkt->flags, MPI_WIN_NULL);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
 
-static inline int perform_op_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_entry_t *lock_entry)
+static inline int perform_op_in_lock_queue(MPID_Win * win_ptr, MPIDI_RMA_Lock_entry_t * lock_entry)
 {
     int mpi_errno = MPI_SUCCESS;
 
@@ -1478,47 +1461,55 @@ static inline int perform_op_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_ent
 
         if (lock_entry->vc == my_vc) {
             mpi_errno = handle_lock_ack(win_ptr, win_ptr->comm_ptr->rank,
-                                              MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                                        MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
         }
         else {
             mpi_errno = MPIDI_CH3I_Send_lock_ack_pkt(lock_entry->vc, win_ptr,
                                                      MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED,
                                                      lock_pkt->source_win_handle,
                                                      lock_pkt->request_handle);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
         }
     }
     else {
         /* LOCK+OP packet */
-        switch(lock_entry->pkt.type) {
+        switch (lock_entry->pkt.type) {
         case (MPIDI_CH3_PKT_PUT):
         case (MPIDI_CH3_PKT_PUT_IMMED):
             mpi_errno = perform_put_in_lock_queue(win_ptr, lock_entry);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
             break;
         case (MPIDI_CH3_PKT_GET):
             mpi_errno = perform_get_in_lock_queue(win_ptr, lock_entry);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
             break;
         case (MPIDI_CH3_PKT_ACCUMULATE):
         case (MPIDI_CH3_PKT_ACCUMULATE_IMMED):
             mpi_errno = perform_acc_in_lock_queue(win_ptr, lock_entry);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
             break;
         case (MPIDI_CH3_PKT_GET_ACCUM):
         case (MPIDI_CH3_PKT_GET_ACCUM_IMMED):
             mpi_errno = perform_get_acc_in_lock_queue(win_ptr, lock_entry);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
             break;
         case (MPIDI_CH3_PKT_FOP):
         case (MPIDI_CH3_PKT_FOP_IMMED):
             mpi_errno = perform_fop_in_lock_queue(win_ptr, lock_entry);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
             break;
         case (MPIDI_CH3_PKT_CAS_IMMED):
             mpi_errno = perform_cas_in_lock_queue(win_ptr, lock_entry);
-            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno)
+                MPIU_ERR_POP(mpi_errno);
             break;
         default:
             MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER,
@@ -1526,9 +1517,9 @@ static inline int perform_op_in_lock_queue(MPID_Win *win_ptr, MPIDI_RMA_Lock_ent
         }
     }
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -1542,7 +1533,7 @@ static int entered_count = 0;
 #define FUNCNAME MPIDI_CH3I_Release_lock
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3I_Release_lock(MPID_Win *win_ptr)
+int MPIDI_CH3I_Release_lock(MPID_Win * win_ptr)
 {
     MPIDI_RMA_Lock_entry_t *lock_entry, *lock_entry_next;
     int requested_lock, mpi_errno = MPI_SUCCESS, temp_entered_count;
@@ -1557,79 +1548,82 @@ int MPIDI_CH3I_Release_lock(MPID_Win *win_ptr)
     }
 
     /* If shared lock ref count is 0 (which is also true if the lock is an
-       exclusive lock), release the lock. */
+     * exclusive lock), release the lock. */
     if (win_ptr->shared_lock_ref_cnt == 0) {
 
-	/* This function needs to be reentrant even in the single-threaded case
-           because when going through the lock queue, pkt_handler() in
-           perform_op_in_lock_queue() may again call release_lock(). To handle
-           this possibility, we use an entered_flag.
-           If the flag is not 0, we simply increment the entered_count and return.
-           The loop through the lock queue is repeated if the entered_count has
-           changed while we are in the loop.
-	 */
-	if (entered_flag != 0) {
-	    entered_count++; /* Count how many times we re-enter */
-	    goto fn_exit;
-	}
-
-        entered_flag = 1;  /* Mark that we are now entering release_lock() */
+        /* This function needs to be reentrant even in the single-threaded case
+         * because when going through the lock queue, pkt_handler() in
+         * perform_op_in_lock_queue() may again call release_lock(). To handle
+         * this possibility, we use an entered_flag.
+         * If the flag is not 0, we simply increment the entered_count and return.
+         * The loop through the lock queue is repeated if the entered_count has
+         * changed while we are in the loop.
+         */
+        if (entered_flag != 0) {
+            entered_count++;    /* Count how many times we re-enter */
+            goto fn_exit;
+        }
+
+        entered_flag = 1;       /* Mark that we are now entering release_lock() */
         temp_entered_count = entered_count;
 
-	do {
-	    if (temp_entered_count != entered_count) temp_entered_count++;
+        do {
+            if (temp_entered_count != entered_count)
+                temp_entered_count++;
 
-	    /* FIXME: MT: The setting of the lock type must be done atomically */
-	    win_ptr->current_lock_type = MPID_LOCK_NONE;
+            /* FIXME: MT: The setting of the lock type must be done atomically */
+            win_ptr->current_lock_type = MPID_LOCK_NONE;
 
-	    /* If there is a lock queue, try to satisfy as many lock requests as 
-	       possible. If the first one is a shared lock, grant it and grant all 
-	       other shared locks. If the first one is an exclusive lock, grant 
-	       only that one. */
+            /* If there is a lock queue, try to satisfy as many lock requests as
+             * possible. If the first one is a shared lock, grant it and grant all
+             * other shared locks. If the first one is an exclusive lock, grant
+             * only that one. */
 
-	    /* FIXME: MT: All queue accesses need to be made atomic */
+            /* FIXME: MT: All queue accesses need to be made atomic */
             lock_entry = (MPIDI_RMA_Lock_entry_t *) win_ptr->lock_queue;
             while (lock_entry) {
                 lock_entry_next = lock_entry->next;
 
                 if (lock_entry->all_data_recved) {
-                MPIDI_CH3_Pkt_flags_t flags;
-                MPIDI_CH3_PKT_RMA_GET_FLAGS(lock_entry->pkt, flags, mpi_errno);
-                if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED)
-                    requested_lock = MPI_LOCK_SHARED;
-                else {
-                    MPIU_Assert(flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE);
-                    requested_lock = MPI_LOCK_EXCLUSIVE;
-                }
-                if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, requested_lock) == 1) {
-                    /* dequeue entry from lock queue */
-                    MPL_LL_DELETE(win_ptr->lock_queue, win_ptr->lock_queue_tail, lock_entry);
-
-                    /* perform this OP */
-                    mpi_errno = perform_op_in_lock_queue(win_ptr, lock_entry);
-                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
-                    /* free this entry */
-                    mpi_errno = MPIDI_CH3I_Win_lock_entry_free(win_ptr, lock_entry);
-                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-
-                    /* if the granted lock is exclusive,
-                       no need to continue */
-                    if (requested_lock == MPI_LOCK_EXCLUSIVE)
-                        break;
-                }
+                    MPIDI_CH3_Pkt_flags_t flags;
+                    MPIDI_CH3_PKT_RMA_GET_FLAGS(lock_entry->pkt, flags, mpi_errno);
+                    if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED)
+                        requested_lock = MPI_LOCK_SHARED;
+                    else {
+                        MPIU_Assert(flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE);
+                        requested_lock = MPI_LOCK_EXCLUSIVE;
+                    }
+                    if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, requested_lock) == 1) {
+                        /* dequeue entry from lock queue */
+                        MPL_LL_DELETE(win_ptr->lock_queue, win_ptr->lock_queue_tail, lock_entry);
+
+                        /* perform this OP */
+                        mpi_errno = perform_op_in_lock_queue(win_ptr, lock_entry);
+                        if (mpi_errno != MPI_SUCCESS)
+                            MPIU_ERR_POP(mpi_errno);
+
+                        /* free this entry */
+                        mpi_errno = MPIDI_CH3I_Win_lock_entry_free(win_ptr, lock_entry);
+                        if (mpi_errno != MPI_SUCCESS)
+                            MPIU_ERR_POP(mpi_errno);
+
+                        /* if the granted lock is exclusive,
+                         * no need to continue */
+                        if (requested_lock == MPI_LOCK_EXCLUSIVE)
+                            break;
+                    }
                 }
                 lock_entry = lock_entry_next;
-	    }
-	} while (temp_entered_count != entered_count);
+            }
+        } while (temp_entered_count != entered_count);
 
-	entered_count = entered_flag = 0;
+        entered_count = entered_flag = 0;
     }
 
- fn_exit:
+  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_RELEASE_LOCK);
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -1639,9 +1633,8 @@ int MPIDI_CH3I_Release_lock(MPID_Win *win_ptr)
 #define FUNCNAME MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete( MPIDI_VC_t *vc,
-                                                      MPID_Request *rreq,
-                                                      int *complete )
+int MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete(MPIDI_VC_t * vc,
+                                                     MPID_Request * rreq, int *complete)
 {
     int requested_lock;
     MPI_Win target_win_handle;
@@ -1654,51 +1647,54 @@ int MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete( MPIDI_VC_t *vc,
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_PIGGYBACKLOCKOPRECVCOMPLETE);
 
     /* This handler is triggered when we received all data of a lock queue
-       entry */
+     * entry */
 
     /* Note that if we decided to drop op data, here we just need to complete this
-       request; otherwise we try to get the lock again in this handler. */
+     * request; otherwise we try to get the lock again in this handler. */
     if (rreq->dev.lock_queue_entry != NULL) {
 
-    /* Mark all data received in lock queue entry */
-    lock_queue_entry->all_data_recved = 1;
+        /* Mark all data received in lock queue entry */
+        lock_queue_entry->all_data_recved = 1;
 
-    /* try to acquire the lock here */
-    MPIDI_CH3_PKT_RMA_GET_FLAGS(lock_queue_entry->pkt, flags, mpi_errno);
-    MPIDI_CH3_PKT_RMA_GET_TARGET_WIN_HANDLE(lock_queue_entry->pkt, target_win_handle, mpi_errno);
-    MPID_Win_get_ptr(target_win_handle, win_ptr);
+        /* try to acquire the lock here */
+        MPIDI_CH3_PKT_RMA_GET_FLAGS(lock_queue_entry->pkt, flags, mpi_errno);
+        MPIDI_CH3_PKT_RMA_GET_TARGET_WIN_HANDLE(lock_queue_entry->pkt, target_win_handle,
+                                                mpi_errno);
+        MPID_Win_get_ptr(target_win_handle, win_ptr);
 
-    if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED) {
-        requested_lock = MPI_LOCK_SHARED;
-    }
-    else {
-        MPIU_Assert(flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE);
-        requested_lock = MPI_LOCK_EXCLUSIVE;
-    }
+        if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED) {
+            requested_lock = MPI_LOCK_SHARED;
+        }
+        else {
+            MPIU_Assert(flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE);
+            requested_lock = MPI_LOCK_EXCLUSIVE;
+        }
 
-    if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, requested_lock) == 1) {
-        /* dequeue entry from lock queue */
-        MPL_LL_DELETE(win_ptr->lock_queue, win_ptr->lock_queue_tail, lock_queue_entry);
+        if (MPIDI_CH3I_Try_acquire_win_lock(win_ptr, requested_lock) == 1) {
+            /* dequeue entry from lock queue */
+            MPL_LL_DELETE(win_ptr->lock_queue, win_ptr->lock_queue_tail, lock_queue_entry);
 
-        /* perform this OP */
-        mpi_errno = perform_op_in_lock_queue(win_ptr, lock_queue_entry);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            /* perform this OP */
+            mpi_errno = perform_op_in_lock_queue(win_ptr, lock_queue_entry);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
 
-        /* free this entry */
-        mpi_errno = MPIDI_CH3I_Win_lock_entry_free(win_ptr, lock_queue_entry);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-    }
-    /* If try acquiring lock failed, just leave the lock queue entry in the queue with
-       all_data_recved marked as 1, release_lock() function will traverse the queue
-       and find entry with all_data_recved being 1 to grant the lock. */
+            /* free this entry */
+            mpi_errno = MPIDI_CH3I_Win_lock_entry_free(win_ptr, lock_queue_entry);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+        }
+        /* If try acquiring lock failed, just leave the lock queue entry in the queue with
+         * all_data_recved marked as 1, release_lock() function will traverse the queue
+         * and find entry with all_data_recved being 1 to grant the lock. */
     }
 
     /* mark receive data transfer as complete and decrement CC in receive
-       request */
+     * request */
     MPIDI_CH3U_Request_complete(rreq);
     *complete = TRUE;
 
- fn_fail:
+  fn_fail:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_PIGGYBACKLOCKOPRECVCOMPLETE);
     return mpi_errno;
 }
diff --git a/src/mpid/ch3/src/ch3u_handle_send_req.c b/src/mpid/ch3/src/ch3u_handle_send_req.c
index 6d0f81d..9580d63 100644
--- a/src/mpid/ch3/src/ch3u_handle_send_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_send_req.c
@@ -11,11 +11,10 @@
 #define FUNCNAME MPIDI_CH3U_Handle_send_req
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3U_Handle_send_req(MPIDI_VC_t * vc, MPID_Request * sreq, 
-			       int *complete)
+int MPIDI_CH3U_Handle_send_req(MPIDI_VC_t * vc, MPID_Request * sreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
-    int (*reqFn)(MPIDI_VC_t *, MPID_Request *, int *);
+    int (*reqFn) (MPIDI_VC_t *, MPID_Request *, int *);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_HANDLE_SEND_REQ);
 
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_HANDLE_SEND_REQ);
@@ -24,12 +23,12 @@ int MPIDI_CH3U_Handle_send_req(MPIDI_VC_t * vc, MPID_Request * sreq,
     /* Routines can call the attached function directly */
     reqFn = sreq->dev.OnDataAvail;
     if (!reqFn) {
-	MPIU_Assert(MPIDI_Request_get_type(sreq) != MPIDI_REQUEST_TYPE_GET_RESP);
-	MPIDI_CH3U_Request_complete(sreq);
+        MPIU_Assert(MPIDI_Request_get_type(sreq) != MPIDI_REQUEST_TYPE_GET_RESP);
+        MPIDI_CH3U_Request_complete(sreq);
         *complete = 1;
     }
     else {
-	mpi_errno = reqFn( vc, sreq, complete );
+        mpi_errno = reqFn(vc, sreq, complete);
     }
 
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_HANDLE_SEND_REQ);
@@ -37,33 +36,32 @@ int MPIDI_CH3U_Handle_send_req(MPIDI_VC_t * vc, MPID_Request * sreq,
 }
 
 /* ----------------------------------------------------------------------- */
-/* Here are the functions that implement the actions that are taken when 
+/* Here are the functions that implement the actions that are taken when
  * data is available for a send request (or other completion operations)
  * These include "send" requests that are part of the RMA implementation.
  */
 /* ----------------------------------------------------------------------- */
 
-int MPIDI_CH3_ReqHandler_GetSendComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
-					      MPID_Request *sreq, 
-					      int *complete )
+int MPIDI_CH3_ReqHandler_GetSendComplete(MPIDI_VC_t * vc ATTRIBUTE((unused)),
+                                         MPID_Request * sreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Win *win_ptr;
     MPIDI_CH3_Pkt_flags_t flags = sreq->dev.flags;
 
     /* NOTE: It is possible that this request is already completed before
-       entering this handler. This happens when this req handler is called
-       within the same req handler on the same request.
-       Consider this case: req is queued up in SHM queue with ref count of 2:
-       one is for completing the request and another is for dequeueing from
-       the queue. The first called req handler on this request completed
-       this request and decrement ref counter to 1. Request is still in the
-       queue. Within this handler, we call the req handler on the same request
-       for the second time (for example when making progress on SHM queue),
-       and the second called handler also tries to complete this request,
-       which leads to wrong execution.
-       Here we check if req is already completed to prevent processing the
-       same request twice. */
+     * entering this handler. This happens when this req handler is called
+     * within the same req handler on the same request.
+     * Consider this case: req is queued up in SHM queue with ref count of 2:
+     * one is for completing the request and another is for dequeueing from
+     * the queue. The first called req handler on this request completed
+     * this request and decrement ref counter to 1. Request is still in the
+     * queue. Within this handler, we call the req handler on the same request
+     * for the second time (for example when making progress on SHM queue),
+     * and the second called handler also tries to complete this request,
+     * which leads to wrong execution.
+     * Here we check if req is already completed to prevent processing the
+     * same request twice. */
     if (MPID_Request_is_complete(sreq)) {
         *complete = FALSE;
         goto fn_exit;
@@ -72,7 +70,7 @@ int MPIDI_CH3_ReqHandler_GetSendComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
     MPID_Win_get_ptr(sreq->dev.target_win_handle, win_ptr);
 
     /* here we decrement the Active Target counter to guarantee the GET-like
-       operation are completed when counter reaches zero. */
+     * operation are completed when counter reaches zero. */
     win_ptr->at_completion_counter--;
     MPIU_Assert(win_ptr->at_completion_counter >= 0);
 
@@ -80,18 +78,19 @@ int MPIDI_CH3_ReqHandler_GetSendComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
     MPIDI_CH3U_Request_complete(sreq);
 
     /* NOTE: finish_op_on_target() must be called after we complete this request,
-       because inside finish_op_on_target() we may call this request handler
-       on the same request again (in release_lock()). Marking this request as
-       completed will prevent us from processing the same request twice. */
-    mpi_errno = finish_op_on_target(win_ptr, vc, TRUE /* has response data */,
+     * because inside finish_op_on_target() we may call this request handler
+     * on the same request again (in release_lock()). Marking this request as
+     * completed will prevent us from processing the same request twice. */
+    mpi_errno = finish_op_on_target(win_ptr, vc, TRUE /* has response data */ ,
                                     flags, MPI_WIN_NULL);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
 
     *complete = TRUE;
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -99,9 +98,7 @@ int MPIDI_CH3_ReqHandler_GetSendComplete( MPIDI_VC_t *vc ATTRIBUTE((unused)),
 #define FUNCNAME MPIDI_CH3_ReqHandler_GaccumSendComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_GaccumSendComplete( MPIDI_VC_t *vc,
-                                                 MPID_Request *rreq,
-                                                 int *complete )
+int MPIDI_CH3_ReqHandler_GaccumSendComplete(MPIDI_VC_t * vc, MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Win *win_ptr;
@@ -111,53 +108,54 @@ int MPIDI_CH3_ReqHandler_GaccumSendComplete( MPIDI_VC_t *vc,
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMSENDCOMPLETE);
 
     /* NOTE: It is possible that this request is already completed before
-       entering this handler. This happens when this req handler is called
-       within the same req handler on the same request.
-       Consider this case: req is queued up in SHM queue with ref count of 2:
-       one is for completing the request and another is for dequeueing from
-       the queue. The first called req handler on this request completed
-       this request and decrement ref counter to 1. Request is still in the
-       queue. Within this handler, we call the req handler on the same request
-       for the second time (for example when making progress on SHM queue),
-       and the second called handler also tries to complete this request,
-       which leads to wrong execution.
-       Here we check if req is already completed to prevent processing the
-       same request twice. */
+     * entering this handler. This happens when this req handler is called
+     * within the same req handler on the same request.
+     * Consider this case: req is queued up in SHM queue with ref count of 2:
+     * one is for completing the request and another is for dequeueing from
+     * the queue. The first called req handler on this request completed
+     * this request and decrement ref counter to 1. Request is still in the
+     * queue. Within this handler, we call the req handler on the same request
+     * for the second time (for example when making progress on SHM queue),
+     * and the second called handler also tries to complete this request,
+     * which leads to wrong execution.
+     * Here we check if req is already completed to prevent processing the
+     * same request twice. */
     if (MPID_Request_is_complete(rreq)) {
         *complete = FALSE;
         goto fn_exit;
     }
 
     /* This function is triggered when sending back process of GACC/FOP/CAS
-       is finished. Only GACC used user_buf. FOP and CAS can fit all data
-       in response packet. */
+     * is finished. Only GACC used user_buf. FOP and CAS can fit all data
+     * in response packet. */
     if (rreq->dev.user_buf != NULL)
         MPIU_Free(rreq->dev.user_buf);
 
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
     /* here we decrement the Active Target counter to guarantee the GET-like
-       operation are completed when counter reaches zero. */
+     * operation are completed when counter reaches zero. */
     win_ptr->at_completion_counter--;
     MPIU_Assert(win_ptr->at_completion_counter >= 0);
 
     MPIDI_CH3U_Request_complete(rreq);
 
     /* NOTE: finish_op_on_target() must be called after we complete this request,
-       because inside finish_op_on_target() we may call this request handler
-       on the same request again (in release_lock()). Marking this request as
-       completed will prevent us from processing the same request twice. */
-    mpi_errno = finish_op_on_target(win_ptr, vc, TRUE /* has response data */,
+     * because inside finish_op_on_target() we may call this request handler
+     * on the same request again (in release_lock()). Marking this request as
+     * completed will prevent us from processing the same request twice. */
+    mpi_errno = finish_op_on_target(win_ptr, vc, TRUE /* has response data */ ,
                                     flags, MPI_WIN_NULL);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
 
     *complete = TRUE;
 
- fn_exit:
+  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_GACCUMSENDCOMPLETE);
     return mpi_errno;
 
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -166,9 +164,7 @@ int MPIDI_CH3_ReqHandler_GaccumSendComplete( MPIDI_VC_t *vc,
 #define FUNCNAME MPIDI_CH3_ReqHandler_CASSendComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_CASSendComplete( MPIDI_VC_t *vc,
-                                          MPID_Request *rreq,
-                                          int *complete )
+int MPIDI_CH3_ReqHandler_CASSendComplete(MPIDI_VC_t * vc, MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Win *win_ptr;
@@ -178,62 +174,62 @@ int MPIDI_CH3_ReqHandler_CASSendComplete( MPIDI_VC_t *vc,
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_CASSENDCOMPLETE);
 
     /* NOTE: It is possible that this request is already completed before
-       entering this handler. This happens when this req handler is called
-       within the same req handler on the same request.
-       Consider this case: req is queued up in SHM queue with ref count of 2:
-       one is for completing the request and another is for dequeueing from
-       the queue. The first called req handler on this request completed
-       this request and decrement ref counter to 1. Request is still in the
-       queue. Within this handler, we call the req handler on the same request
-       for the second time (for example when making progress on SHM queue),
-       and the second called handler also tries to complete this request,
-       which leads to wrong execution.
-       Here we check if req is already completed to prevent processing the
-       same request twice. */
+     * entering this handler. This happens when this req handler is called
+     * within the same req handler on the same request.
+     * Consider this case: req is queued up in SHM queue with ref count of 2:
+     * one is for completing the request and another is for dequeueing from
+     * the queue. The first called req handler on this request completed
+     * this request and decrement ref counter to 1. Request is still in the
+     * queue. Within this handler, we call the req handler on the same request
+     * for the second time (for example when making progress on SHM queue),
+     * and the second called handler also tries to complete this request,
+     * which leads to wrong execution.
+     * Here we check if req is already completed to prevent processing the
+     * same request twice. */
     if (MPID_Request_is_complete(rreq)) {
         *complete = FALSE;
         goto fn_exit;
     }
 
     /* This function is triggered when sending back process of GACC/FOP/CAS
-       is finished. Only GACC used user_buf. FOP and CAS can fit all data
-       in response packet. */
+     * is finished. Only GACC used user_buf. FOP and CAS can fit all data
+     * in response packet. */
     if (rreq->dev.user_buf != NULL)
         MPIU_Free(rreq->dev.user_buf);
 
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
     /* here we decrement the Active Target counter to guarantee the GET-like
-       operation are completed when counter reaches zero. */
+     * operation are completed when counter reaches zero. */
     win_ptr->at_completion_counter--;
     MPIU_Assert(win_ptr->at_completion_counter >= 0);
 
     MPIDI_CH3U_Request_complete(rreq);
 
     /* NOTE: finish_op_on_target() must be called after we complete this request,
-       because inside finish_op_on_target() we may call this request handler
-       on the same request again (in release_lock()). Marking this request as
-       completed will prevent us from processing the same request twice. */
-    mpi_errno = finish_op_on_target(win_ptr, vc, TRUE/* has response data */,
+     * because inside finish_op_on_target() we may call this request handler
+     * on the same request again (in release_lock()). Marking this request as
+     * completed will prevent us from processing the same request twice. */
+    mpi_errno = finish_op_on_target(win_ptr, vc, TRUE /* has response data */ ,
                                     flags, MPI_WIN_NULL);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
 
     *complete = TRUE;
 
- fn_exit:
+  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_CASSENDCOMPLETE);
     return mpi_errno;
 
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
+
 #undef FUNCNAME
 #define FUNCNAME MPIDI_CH3_ReqHandler_FOPSendComplete
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3_ReqHandler_FOPSendComplete( MPIDI_VC_t *vc,
-                                          MPID_Request *rreq,
-                                          int *complete )
+int MPIDI_CH3_ReqHandler_FOPSendComplete(MPIDI_VC_t * vc, MPID_Request * rreq, int *complete)
 {
     int mpi_errno = MPI_SUCCESS;
     MPID_Win *win_ptr;
@@ -243,59 +239,60 @@ int MPIDI_CH3_ReqHandler_FOPSendComplete( MPIDI_VC_t *vc,
     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_REQHANDLER_FOPSENDCOMPLETE);
 
     /* NOTE: It is possible that this request is already completed before
-       entering this handler. This happens when this req handler is called
-       within the same req handler on the same request.
-       Consider this case: req is queued up in SHM queue with ref count of 2:
-       one is for completing the request and another is for dequeueing from
-       the queue. The first called req handler on this request completed
-       this request and decrement ref counter to 1. Request is still in the
-       queue. Within this handler, we call the req handler on the same request
-       for the second time (for example when making progress on SHM queue),
-       and the second called handler also tries to complete this request,
-       which leads to wrong execution.
-       Here we check if req is already completed to prevent processing the
-       same request twice. */
+     * entering this handler. This happens when this req handler is called
+     * within the same req handler on the same request.
+     * Consider this case: req is queued up in SHM queue with ref count of 2:
+     * one is for completing the request and another is for dequeueing from
+     * the queue. The first called req handler on this request completed
+     * this request and decrement ref counter to 1. Request is still in the
+     * queue. Within this handler, we call the req handler on the same request
+     * for the second time (for example when making progress on SHM queue),
+     * and the second called handler also tries to complete this request,
+     * which leads to wrong execution.
+     * Here we check if req is already completed to prevent processing the
+     * same request twice. */
     if (MPID_Request_is_complete(rreq)) {
         *complete = FALSE;
         goto fn_exit;
     }
 
     /* This function is triggered when sending back process of GACC/FOP/CAS
-       is finished. Only GACC used user_buf. FOP and CAS can fit all data
-       in response packet. */
+     * is finished. Only GACC used user_buf. FOP and CAS can fit all data
+     * in response packet. */
     if (rreq->dev.user_buf != NULL)
         MPIU_Free(rreq->dev.user_buf);
 
     MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
 
     /* here we decrement the Active Target counter to guarantee the GET-like
-       operation are completed when counter reaches zero. */
+     * operation are completed when counter reaches zero. */
     win_ptr->at_completion_counter--;
     MPIU_Assert(win_ptr->at_completion_counter >= 0);
 
     MPIDI_CH3U_Request_complete(rreq);
 
     /* NOTE: finish_op_on_target() must be called after we complete this request,
-       because inside finish_op_on_target() we may call this request handler
-       on the same request again (in release_lock()). Marking this request as
-       completed will prevent us from processing the same request twice. */
-    mpi_errno = finish_op_on_target(win_ptr, vc, TRUE /* has response data */,
+     * because inside finish_op_on_target() we may call this request handler
+     * on the same request again (in release_lock()). Marking this request as
+     * completed will prevent us from processing the same request twice. */
+    mpi_errno = finish_op_on_target(win_ptr, vc, TRUE /* has response data */ ,
                                     flags, MPI_WIN_NULL);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
 
     *complete = TRUE;
 
- fn_exit:
+  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_REQHANDLER_FOPSENDCOMPLETE);
     return mpi_errno;
 
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
 
-int MPIDI_CH3_ReqHandler_SendReloadIOV( MPIDI_VC_t *vc ATTRIBUTE((unused)), MPID_Request *sreq, 
-					int *complete )
+int MPIDI_CH3_ReqHandler_SendReloadIOV(MPIDI_VC_t * vc ATTRIBUTE((unused)), MPID_Request * sreq,
+                                       int *complete)
 {
     int mpi_errno;
 
@@ -303,14 +300,13 @@ int MPIDI_CH3_ReqHandler_SendReloadIOV( MPIDI_VC_t *vc ATTRIBUTE((unused)), MPID
      * not set in the _load_send_iov function */
     sreq->dev.iov_offset = 0;
     sreq->dev.iov_count = MPID_IOV_LIMIT;
-    mpi_errno = MPIDI_CH3U_Request_load_send_iov(sreq, sreq->dev.iov, 
-						 &sreq->dev.iov_count);
+    mpi_errno = MPIDI_CH3U_Request_load_send_iov(sreq, sreq->dev.iov, &sreq->dev.iov_count);
     if (mpi_errno != MPI_SUCCESS) {
-	MPIU_ERR_SETFATALANDJUMP(mpi_errno, MPI_ERR_OTHER,"**ch3|loadsendiov");
+        MPIU_ERR_SETFATALANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|loadsendiov");
     }
-	    
+
     *complete = FALSE;
 
- fn_fail:
+  fn_fail:
     return mpi_errno;
 }
diff --git a/src/mpid/ch3/src/ch3u_rma_oplist.c b/src/mpid/ch3/src/ch3u_rma_oplist.c
index 3d97174..96deee8 100644
--- a/src/mpid/ch3/src/ch3u_rma_oplist.c
+++ b/src/mpid/ch3/src/ch3u_rma_oplist.c
@@ -32,9 +32,11 @@ cvars:
 === END_MPI_T_CVAR_INFO_BLOCK ===
 */
 
-static inline int check_target_state(MPID_Win *win_ptr, MPIDI_RMA_Target_t *target, int *made_progress);
-static inline int check_window_state(MPID_Win *win_ptr, int *made_progress);
-static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *target, int *made_progress);
+static inline int check_target_state(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
+                                     int *made_progress);
+static inline int check_window_state(MPID_Win * win_ptr, int *made_progress);
+static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
+                                   int *made_progress);
 static inline int issue_ops_win(MPID_Win * win_ptr, int *made_progress);
 
 /* check if we can switch window-wide state: FENCE_ISSUED, PSCW_ISSUED, LOCK_ALL_ISSUED */
@@ -42,7 +44,7 @@ static inline int issue_ops_win(MPID_Win * win_ptr, int *made_progress);
 #define FUNCNAME check_window_state
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int check_window_state(MPID_Win *win_ptr, int *made_progress)
+static inline int check_window_state(MPID_Win * win_ptr, int *made_progress)
 {
     MPID_Request *fence_req_ptr = NULL;
     int i, mpi_errno = MPI_SUCCESS;
@@ -70,7 +72,7 @@ static inline int check_window_state(MPID_Win *win_ptr, int *made_progress)
     case MPIDI_RMA_PSCW_ISSUED:
         if (win_ptr->start_req == NULL) {
             /* for MPI_MODE_NOCHECK and all targets on SHM,
-               we do not create PSCW requests on window. */
+             * we do not create PSCW requests on window. */
             win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;
 
             num_active_issued_win--;
@@ -116,7 +118,7 @@ static inline int check_window_state(MPID_Win *win_ptr, int *made_progress)
 
     default:
         break;
-    } /* end of switch */
+    }   /* end of switch */
 
   fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_CHECK_WINDOW_STATE);
@@ -132,7 +134,7 @@ static inline int check_window_state(MPID_Win *win_ptr, int *made_progress)
 #define FUNCNAME check_target_state
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int check_target_state(MPID_Win *win_ptr, MPIDI_RMA_Target_t *target,
+static inline int check_target_state(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
                                      int *made_progress)
 {
     int rank = win_ptr->comm_ptr->rank;
@@ -144,7 +146,7 @@ static inline int check_target_state(MPID_Win *win_ptr, MPIDI_RMA_Target_t *targ
         goto fn_exit;
 
     /* This check should only be performed when window-wide sync is finished, or
-       current sync is per-target sync. */
+     * current sync is per-target sync. */
     if (win_ptr->states.access_state == MPIDI_RMA_NONE ||
         win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
         win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED ||
@@ -163,12 +165,13 @@ static inline int check_target_state(MPID_Win *win_ptr, MPIDI_RMA_Target_t *targ
                 target->access_state = MPIDI_RMA_LOCK_ISSUED;
                 if (target->target_rank == rank) {
                     mpi_errno = acquire_local_lock(win_ptr, target->lock_type);
-                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno != MPI_SUCCESS)
+                        MPIU_ERR_POP(mpi_errno);
                 }
                 else {
-                    mpi_errno = send_lock_msg(target->target_rank,
-                                              target->lock_type, win_ptr);
-                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                    mpi_errno = send_lock_msg(target->target_rank, target->lock_type, win_ptr);
+                    if (mpi_errno != MPI_SUCCESS)
+                        MPIU_ERR_POP(mpi_errno);
                 }
 
                 (*made_progress) = 1;
@@ -177,8 +180,8 @@ static inline int check_target_state(MPID_Win *win_ptr, MPIDI_RMA_Target_t *targ
         else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
             if (target->pending_op_list == NULL) {
                 /* No RMA operation has ever been posted to this target,
-                   finish issuing, no need to acquire the lock. Cleanup
-                   function will clean it up. */
+                 * finish issuing, no need to acquire the lock. Cleanup
+                 * function will clean it up. */
                 target->access_state = MPIDI_RMA_LOCK_GRANTED;
 
                 target->sync.outstanding_acks--;
@@ -191,8 +194,8 @@ static inline int check_target_state(MPID_Win *win_ptr, MPIDI_RMA_Target_t *targ
             }
             else {
                 /* if we reach WIN_UNLOCK and there is still operation existing
-                   in pending list, this operation must be the only operation
-                   and it is prepared to piggyback LOCK and UNLOCK. */
+                 * in pending list, this operation must be the only operation
+                 * and it is prepared to piggyback LOCK and UNLOCK. */
                 MPIU_Assert(target->pending_op_list->next == NULL);
                 MPIU_Assert(target->pending_op_list->piggyback_lock_candidate);
             }
@@ -210,12 +213,13 @@ static inline int check_target_state(MPID_Win *win_ptr, MPIDI_RMA_Target_t *targ
                 else {
                     if (target->put_acc_issued) {
                         mpi_errno = send_flush_msg(target->target_rank, win_ptr);
-                        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                        if (mpi_errno != MPI_SUCCESS)
+                            MPIU_ERR_POP(mpi_errno);
                     }
                     else {
                         /* We did not issue PUT/ACC since the last
-                           synchronization call, therefore here we
-                           don't need ACK back */
+                         * synchronization call, therefore here we
+                         * don't need ACK back */
                         target->sync.outstanding_acks--;
                         MPIU_Assert(target->sync.outstanding_acks >= 0);
                     }
@@ -234,21 +238,23 @@ static inline int check_target_state(MPID_Win *win_ptr, MPIDI_RMA_Target_t *targ
                     MPIU_Assert(target->sync.outstanding_acks >= 0);
 
                     mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
-                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno != MPI_SUCCESS)
+                        MPIU_ERR_POP(mpi_errno);
                 }
                 else {
                     MPIDI_CH3_Pkt_flags_t flag = MPIDI_CH3_PKT_FLAG_NONE;
                     if (!target->put_acc_issued) {
                         /* We did not issue PUT/ACC since the last
-                           synchronization call, therefore here we
-                           don't need ACK back */
+                         * synchronization call, therefore here we
+                         * don't need ACK back */
                         target->sync.outstanding_acks--;
                         MPIU_Assert(target->sync.outstanding_acks >= 0);
 
                         flag = MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_NO_ACK;
                     }
                     mpi_errno = send_unlock_msg(target->target_rank, win_ptr, flag);
-                    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                    if (mpi_errno != MPI_SUCCESS)
+                        MPIU_ERR_POP(mpi_errno);
                 }
 
                 /* We are done with ending synchronization, unset target's sync_flag. */
@@ -261,11 +267,11 @@ static inline int check_target_state(MPID_Win *win_ptr, MPIDI_RMA_Target_t *targ
 
     default:
         break;
-    } /* end of switch */
+    }   /* end of switch */
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -274,7 +280,7 @@ static inline int check_target_state(MPID_Win *win_ptr, MPIDI_RMA_Target_t *targ
 #define FUNCNAME issue_ops_target
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *target,
+static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
                                    int *made_progress)
 {
     MPIDI_RMA_Op_t *curr_op = NULL;
@@ -306,19 +312,18 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *targe
 
         if (target->access_state == MPIDI_RMA_LOCK_ISSUED) {
             /* It is possible that the previous OP+LOCK changes
-               lock state to LOCK_ISSUED. */
+             * lock state to LOCK_ISSUED. */
             break;
         }
 
         if (curr_op->next == NULL &&
-            target->sync.sync_flag == MPIDI_RMA_SYNC_NONE &&
-            curr_op->ureq == NULL) {
+            target->sync.sync_flag == MPIDI_RMA_SYNC_NONE && curr_op->ureq == NULL) {
             /* Skip the last OP if sync_flag is NONE since we
-               want to leave it to the ending synchronization
-               so that we can piggyback LOCK / FLUSH.
-               However, if it is a request-based RMA, do not
-               skip it (otherwise a wait call before unlock
-               will be blocked). */
+             * want to leave it to the ending synchronization
+             * so that we can piggyback LOCK / FLUSH.
+             * However, if it is a request-based RMA, do not
+             * skip it (otherwise a wait call before unlock
+             * will be blocked). */
             break;
         }
 
@@ -370,15 +375,15 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *targe
             curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE ||
             curr_op->pkt.type == MPIDI_CH3_PKT_ACCUMULATE_IMMED) {
             target->put_acc_issued = 1; /* set PUT_ACC_FLAG when sending
-                                           PUT/ACC operation. */
+                                         * PUT/ACC operation. */
         }
 
         if (flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
             flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE) {
             /* If this operation is piggybacked with LOCK,
-               do not move it out of pending list, and do
-               not complete the user request, because we
-               may need to re-transmit it. */
+             * do not move it out of pending list, and do
+             * not complete the user request, because we
+             * may need to re-transmit it. */
             break;
         }
 
@@ -414,7 +419,7 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *targe
                                           &(target->read_op_list_tail), curr_op);
             }
 
-            /* Setup user request info in order to be completed following send request.*/
+            /* Setup user request info in order to be completed following send request. */
             if (curr_op->ureq) {
                 /* Increase ref for completion handler */
                 MPIU_Object_add_ref(curr_op->ureq);
@@ -446,7 +451,7 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *targe
 
         curr_op = target->next_op_to_issue;
 
-    } /* end of while loop */
+    }   /* end of while loop */
 
   fn_exit:
     return mpi_errno;
@@ -458,7 +463,7 @@ static inline int issue_ops_target(MPID_Win * win_ptr, MPIDI_RMA_Target_t *targe
 #define FUNCNAME issue_ops_win
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-static inline int issue_ops_win(MPID_Win *win_ptr, int *made_progress)
+static inline int issue_ops_win(MPID_Win * win_ptr, int *made_progress)
 {
     int mpi_errno = MPI_SUCCESS;
     int start_slot, end_slot, i, idx;
@@ -481,8 +486,10 @@ static inline int issue_ops_win(MPID_Win *win_ptr, int *made_progress)
     start_slot = win_ptr->comm_ptr->rank % win_ptr->num_slots;
     end_slot = start_slot + win_ptr->num_slots;
     for (i = start_slot; i < end_slot; i++) {
-        if (i < win_ptr->num_slots) idx = i;
-        else idx = i - win_ptr->num_slots;
+        if (i < win_ptr->num_slots)
+            idx = i;
+        else
+            idx = i - win_ptr->num_slots;
 
         target = win_ptr->slots[idx].target_list;
         while (target != NULL) {
@@ -490,21 +497,25 @@ static inline int issue_ops_win(MPID_Win *win_ptr, int *made_progress)
 
             /* check target state */
             mpi_errno = check_target_state(win_ptr, target, &temp_progress);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-            if (temp_progress) (*made_progress) = 1;
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+            if (temp_progress)
+                (*made_progress) = 1;
 
             /* issue operations to this target */
             mpi_errno = issue_ops_target(win_ptr, target, &temp_progress);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-            if (temp_progress) (*made_progress) = 1;
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
+            if (temp_progress)
+                (*made_progress) = 1;
 
             target = target->next;
         }
     }
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -525,12 +536,12 @@ int MPIDI_CH3I_RMA_Free_ops_before_completion(MPID_Win * win_ptr)
     /* If we are in an free_ops_before_completion, the window must be holding
      * up resources.  If it isn't, we are in the wrong window and
      * incorrectly entered this function. */
-    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER,
-                        "**rmanoop");
+    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER, "**rmanoop");
 
     /* make nonblocking progress once */
     mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     if (win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
         win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED ||
@@ -542,8 +553,7 @@ int MPIDI_CH3I_RMA_Free_ops_before_completion(MPID_Win * win_ptr)
         if (win_ptr->slots[i].target_list != NULL) {
             curr_target = win_ptr->slots[i].target_list;
             while (curr_target != NULL) {
-                if (curr_target->read_op_list != NULL ||
-                    curr_target->write_op_list != NULL) {
+                if (curr_target->read_op_list != NULL || curr_target->write_op_list != NULL) {
                     if (win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
                         win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
                         if (curr_target->access_state == MPIDI_RMA_LOCK_GRANTED)
@@ -555,14 +565,16 @@ int MPIDI_CH3I_RMA_Free_ops_before_completion(MPID_Win * win_ptr)
                 }
                 curr_target = curr_target->next;
             }
-            if (curr_target != NULL) break;
+            if (curr_target != NULL)
+                break;
         }
     }
 
-    if (curr_target == NULL) goto fn_exit;
+    if (curr_target == NULL)
+        goto fn_exit;
 
     /* After we do this, all following Win_flush_local
-       must do a Win_flush instead. */
+     * must do a Win_flush instead. */
     curr_target->disable_flush_local = 1;
 
     if (curr_target->read_op_list != NULL) {
@@ -576,7 +588,7 @@ int MPIDI_CH3I_RMA_Free_ops_before_completion(MPID_Win * win_ptr)
     }
 
     /* free all ops in the list since we do not need to maintain them anymore */
-    for (curr_op = *op_list; curr_op != NULL; ) {
+    for (curr_op = *op_list; curr_op != NULL;) {
         MPID_Request_release(curr_op->request);
         MPL_LL_DELETE(*op_list, *op_list_tail, curr_op);
         MPIDI_CH3I_Win_op_free(win_ptr, curr_op);
@@ -591,11 +603,11 @@ int MPIDI_CH3I_RMA_Free_ops_before_completion(MPID_Win * win_ptr)
             }
         }
         curr_op = *op_list;
-   }
+    }
 
- fn_exit:
+  fn_exit:
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -614,8 +626,7 @@ int MPIDI_CH3I_RMA_Cleanup_ops_aggressive(MPID_Win * win_ptr)
     /* If we are in an aggressive cleanup, the window must be holding
      * up resources.  If it isn't, we are in the wrong window and
      * incorrectly entered this function. */
-    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER,
-                        "**rmanoop");
+    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER, "**rmanoop");
 
     /* find the first target that has something to issue */
     for (i = 0; i < win_ptr->num_slots; i++) {
@@ -623,11 +634,13 @@ int MPIDI_CH3I_RMA_Cleanup_ops_aggressive(MPID_Win * win_ptr)
             curr_target = win_ptr->slots[i].target_list;
             while (curr_target != NULL && curr_target->pending_op_list == NULL)
                 curr_target = curr_target->next;
-            if (curr_target != NULL) break;
+            if (curr_target != NULL)
+                break;
         }
     }
 
-    if (curr_target == NULL) goto fn_exit;
+    if (curr_target == NULL)
+        goto fn_exit;
 
     if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH_LOCAL)
         curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH_LOCAL;
@@ -641,8 +654,7 @@ int MPIDI_CH3I_RMA_Cleanup_ops_aggressive(MPID_Win * win_ptr)
     /* Wait for local completion. */
     do {
         mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target,
-                                                      &local_completed,
-                                                      &remote_completed);
+                                                      &local_completed, &remote_completed);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
         if (!local_completed) {
@@ -675,8 +687,7 @@ int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPID_Win * win_ptr, MPIDI_RMA_Targe
     /* If we are in an aggressive cleanup, the window must be holding
      * up resources.  If it isn't, we are in the wrong window and
      * incorrectly entered this function. */
-    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER,
-                        "**rmanotarget");
+    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER, "**rmanotarget");
 
     if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
         /* switch to window-wide protocol */
@@ -688,7 +699,8 @@ int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPID_Win * win_ptr, MPIDI_RMA_Targe
             MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
             if (orig_vc->node_id != target_vc->node_id) {
                 mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, i, &curr_target);
-                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno)
+                    MPIU_ERR_POP(mpi_errno);
                 if (curr_target == NULL) {
                     win_ptr->outstanding_locks++;
                     mpi_errno = send_lock_msg(i, MPI_LOCK_SHARED, win_ptr);
@@ -723,8 +735,7 @@ int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPID_Win * win_ptr, MPIDI_RMA_Targe
         /* Wait for remote completion. */
         do {
             mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target,
-                                                          &local_completed,
-                                                          &remote_completed);
+                                                          &local_completed, &remote_completed);
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
             if (!remote_completed) {
@@ -736,7 +747,8 @@ int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPID_Win * win_ptr, MPIDI_RMA_Targe
 
         /* Cleanup the target. */
         mpi_errno = MPIDI_CH3I_RMA_Cleanup_single_target(win_ptr, curr_target);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         /* check if we got a target */
         (*target) = MPIDI_CH3I_Win_target_alloc(win_ptr);
@@ -764,22 +776,29 @@ int MPIDI_CH3I_RMA_Make_progress_target(MPID_Win * win_ptr, int target_rank, int
 
     /* check window state */
     mpi_errno = check_window_state(win_ptr, &temp_progress);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-    if (temp_progress) (*made_progress) = 1;
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+    if (temp_progress)
+        (*made_progress) = 1;
 
     /* find target element */
     mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, target_rank, &target);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     /* check target state */
     mpi_errno = check_target_state(win_ptr, target, &temp_progress);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-    if (temp_progress) (*made_progress) = 1;
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
+    if (temp_progress)
+        (*made_progress) = 1;
 
     /* issue operations to this target */
     mpi_errno = issue_ops_target(win_ptr, target, &temp_progress);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-    if (temp_progress) (*made_progress) = 1;
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
+    if (temp_progress)
+        (*made_progress) = 1;
 
   fn_exit:
     return mpi_errno;
@@ -801,13 +820,17 @@ int MPIDI_CH3I_RMA_Make_progress_win(MPID_Win * win_ptr, int *made_progress)
 
     /* check window state */
     mpi_errno = check_window_state(win_ptr, &temp_progress);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-    if (temp_progress) (*made_progress) = 1;
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+    if (temp_progress)
+        (*made_progress) = 1;
 
     /* issue operations on window */
     mpi_errno = issue_ops_win(win_ptr, &temp_progress);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-    if (temp_progress) (*made_progress) = 1;
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
+    if (temp_progress)
+        (*made_progress) = 1;
 
   fn_exit:
     return mpi_errno;
@@ -836,8 +859,10 @@ int MPIDI_CH3I_RMA_Make_progress_global(int *made_progress)
             continue;
 
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_elem->win_ptr, &temp_progress);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-        if (temp_progress) (*made_progress) = 1;
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
+        if (temp_progress)
+            (*made_progress) = 1;
     }
 
   fn_exit:
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 5497210..962553b 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -127,7 +127,8 @@ int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
 
         /* queue it up */
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
 
@@ -138,7 +139,7 @@ int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
         new_ptr->origin_count = origin_count;
         new_ptr->origin_datatype = origin_datatype;
         new_ptr->target_rank = target_rank;
-        new_ptr->ureq = NULL; /* reset user request */
+        new_ptr->ureq = NULL;   /* reset user request */
 
         /* Remember user request */
         if (ureq) {
@@ -164,7 +165,7 @@ int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
         /* Judge if we can use IMMED data packet */
         if (!new_ptr->is_dt) {
             MPIU_Assign_trunc(immed_len,
-                              (MPIDI_RMA_IMMED_BYTES/origin_type_size)*origin_type_size,
+                              (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
                               size_t);
             if (len <= immed_len)
                 use_immed_pkt = TRUE;
@@ -196,9 +197,10 @@ int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
         put_pkt->source_win_handle = win_ptr->handle;
         put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
         if (use_immed_pkt) {
-            void *src = (void *)origin_addr, *dest = (void *)(put_pkt->info.data);
+            void *src = (void *) origin_addr, *dest = (void *) (put_pkt->info.data);
             mpi_errno = immed_copy(src, dest, len);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
         }
 
         MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
@@ -208,7 +210,8 @@ int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
             MPIU_ERR_POP(mpi_errno);
 
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
             win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
@@ -315,7 +318,8 @@ int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
 
         /* queue it up */
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
 
@@ -326,7 +330,7 @@ int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
         new_ptr->origin_count = origin_count;
         new_ptr->origin_datatype = origin_datatype;
         new_ptr->target_rank = target_rank;
-        new_ptr->ureq = NULL; /* reset user request */
+        new_ptr->ureq = NULL;   /* reset user request */
 
         /* Remember user request */
         if (ureq) {
@@ -352,7 +356,7 @@ int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
         /* Judge if we can use IMMED data response packet */
         if (!new_ptr->is_dt) {
             MPIU_Assign_trunc(immed_len,
-                              (MPIDI_RMA_IMMED_BYTES/target_type_size)*target_type_size,
+                              (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
                               size_t);
             if (len <= immed_len)
                 use_immed_resp_pkt = TRUE;
@@ -384,7 +388,8 @@ int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
             MPIU_ERR_POP(mpi_errno);
 
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
             win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
@@ -493,7 +498,8 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
 
         /* queue it up */
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
 
@@ -503,7 +509,7 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
         new_ptr->origin_count = origin_count;
         new_ptr->origin_datatype = origin_datatype;
         new_ptr->target_rank = target_rank;
-        new_ptr->ureq = NULL; /* reset user request */
+        new_ptr->ureq = NULL;   /* reset user request */
 
         /* Remember user request */
         if (ureq) {
@@ -529,7 +535,7 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
         /* Judge if we can use IMMED data packet */
         if (!new_ptr->is_dt) {
             MPIU_Assign_trunc(immed_len,
-                              (MPIDI_RMA_IMMED_BYTES/origin_type_size)*origin_type_size,
+                              (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
                               size_t);
             if (len <= immed_len)
                 use_immed_pkt = TRUE;
@@ -562,9 +568,10 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
         accum_pkt->source_win_handle = win_ptr->handle;
         accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
         if (use_immed_pkt) {
-            void *src = (void *)origin_addr, *dest = (void *)(accum_pkt->info.data);
+            void *src = (void *) origin_addr, *dest = (void *) (accum_pkt->info.data);
             mpi_errno = immed_copy(src, dest, len);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
         }
 
         MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
@@ -574,7 +581,8 @@ int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatyp
             MPIU_ERR_POP(mpi_errno);
 
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
             win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
@@ -682,7 +690,8 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
 
         /* Append the operation to the window's RMA ops queue */
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */
 
@@ -701,7 +710,7 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             new_ptr->origin_count = result_count;
             new_ptr->origin_datatype = result_datatype;
             new_ptr->target_rank = target_rank;
-            new_ptr->ureq = NULL; /* reset user request */
+            new_ptr->ureq = NULL;       /* reset user request */
 
             /* Remember user request */
             if (ureq) {
@@ -725,7 +734,7 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             /* Judge if we can use IMMED data response packet */
             if (!new_ptr->is_dt) {
                 MPIU_Assign_trunc(immed_len,
-                                  (MPIDI_RMA_IMMED_BYTES/target_type_size)*target_type_size,
+                                  (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
                                   size_t);
                 if (len <= immed_len)
                     use_immed_resp_pkt = TRUE;
@@ -766,7 +775,7 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             new_ptr->result_count = result_count;
             new_ptr->result_datatype = result_datatype;
             new_ptr->target_rank = target_rank;
-            new_ptr->ureq = NULL; /* reset user request */
+            new_ptr->ureq = NULL;       /* reset user request */
 
             /* Remember user request */
             if (ureq) {
@@ -800,13 +809,13 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             /* Judge if we can use IMMED data packet */
             if (!new_ptr->is_dt) {
                 MPIU_Assign_trunc(immed_len,
-                                  (MPIDI_RMA_IMMED_BYTES/origin_type_size)*origin_type_size,
+                                  (MPIDI_RMA_IMMED_BYTES / origin_type_size) * origin_type_size,
                                   size_t);
                 if (orig_len <= immed_len)
                     use_immed_pkt = TRUE;
 
                 MPIU_Assign_trunc(immed_len,
-                                  (MPIDI_RMA_IMMED_BYTES/target_type_size)*target_type_size,
+                                  (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
                                   size_t);
                 if (tar_len <= immed_len)
                     use_immed_resp_pkt = TRUE;
@@ -838,9 +847,10 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             get_accum_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
             get_accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
             if (use_immed_pkt) {
-                void *src = (void *)origin_addr, *dest = (void *)(get_accum_pkt->info.data);
+                void *src = (void *) origin_addr, *dest = (void *) (get_accum_pkt->info.data);
                 mpi_errno = immed_copy(src, dest, orig_len);
-                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
             }
             if (use_immed_resp_pkt)
                 get_accum_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
@@ -853,7 +863,8 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
             MPIU_ERR_POP(mpi_errno);
 
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
             win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
@@ -1062,7 +1073,8 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
 
         /* Append this operation to the RMA ops queue */
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
 
@@ -1076,8 +1088,8 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
         new_ptr->compare_addr = (void *) compare_addr;
         new_ptr->compare_datatype = datatype;
         new_ptr->target_rank = target_rank;
-        new_ptr->piggyback_lock_candidate = 1; /* CAS is always able to piggyback LOCK */
-        new_ptr->ureq = NULL; /* reset user request */
+        new_ptr->piggyback_lock_candidate = 1;  /* CAS is always able to piggyback LOCK */
+        new_ptr->ureq = NULL;   /* reset user request */
 
         /************** Setting packet struct areas in operation ****************/
 
@@ -1094,13 +1106,15 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
         MPID_Datatype_get_size_macro(datatype, type_size);
         MPIU_Assert(type_size <= sizeof(MPIDI_CH3_CAS_Immed_u));
 
-        src = (void *)origin_addr, dest = (void *)(&(cas_pkt->origin_data));
+        src = (void *) origin_addr, dest = (void *) (&(cas_pkt->origin_data));
         mpi_errno = immed_copy(src, dest, type_size);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
-        src = (void *)compare_addr, dest = (void *)(&(cas_pkt->compare_data));
+        src = (void *) compare_addr, dest = (void *) (&(cas_pkt->compare_data));
         mpi_errno = immed_copy(src, dest, type_size);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
 
@@ -1109,7 +1123,8 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
             MPIU_ERR_POP(mpi_errno);
 
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
             win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
@@ -1201,7 +1216,8 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
 
         /* Append this operation to the RMA ops queue */
         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &new_ptr);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
 
@@ -1219,7 +1235,7 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             new_ptr->origin_datatype = datatype;
             new_ptr->target_rank = target_rank;
             new_ptr->piggyback_lock_candidate = 1;
-            new_ptr->ureq = NULL; /* reset user request */
+            new_ptr->ureq = NULL;       /* reset user request */
 
             /************** Setting packet struct areas in operation ****************/
 
@@ -1228,7 +1244,7 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
 
             /* Judege if we can use IMMED data for response packet */
             MPIU_Assign_trunc(immed_len,
-                              (MPIDI_RMA_IMMED_BYTES/target_type_size)*target_type_size,
+                              (MPIDI_RMA_IMMED_BYTES / target_type_size) * target_type_size,
                               size_t);
             if (target_type_size <= immed_len)
                 use_immed_resp_pkt = TRUE;
@@ -1260,7 +1276,7 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             new_ptr->result_datatype = datatype;
             new_ptr->target_rank = target_rank;
             new_ptr->piggyback_lock_candidate = 1;
-            new_ptr->ureq = NULL; /* reset user request */
+            new_ptr->ureq = NULL;       /* reset user request */
 
             /************** Setting packet struct areas in operation ****************/
 
@@ -1268,9 +1284,7 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             MPIU_Assert(type_size <= sizeof(MPIDI_CH3_FOP_Immed_u));
 
             /* Judge if we can use IMMED data packet */
-            MPIU_Assign_trunc(immed_len,
-                              (MPIDI_RMA_IMMED_BYTES/type_size)*type_size,
-                              size_t);
+            MPIU_Assign_trunc(immed_len, (MPIDI_RMA_IMMED_BYTES / type_size) * type_size, size_t);
             if (type_size <= immed_len) {
                 use_immed_pkt = TRUE;
                 use_immed_resp_pkt = TRUE;
@@ -1291,9 +1305,10 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             fop_pkt->target_win_handle = win_ptr->all_win_handles[target_rank];
             fop_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
             if (use_immed_pkt) {
-                void *src = (void *)origin_addr, *dest = (void *)(fop_pkt->info.data);
+                void *src = (void *) origin_addr, *dest = (void *) (fop_pkt->info.data);
                 mpi_errno = immed_copy(src, dest, type_size);
-                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
             }
             if (use_immed_resp_pkt)
                 fop_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
@@ -1306,7 +1321,8 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
             MPIU_ERR_POP(mpi_errno);
 
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         if (MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS >= 0 &&
             win_ptr->accumulated_ops_cnt >= MPIR_CVAR_CH3_RMA_OP_POKING_PROGRESS) {
diff --git a/src/mpid/ch3/src/ch3u_rma_pkthandler.c b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
index ab77790..227f153 100644
--- a/src/mpid/ch3/src/ch3u_rma_pkthandler.c
+++ b/src/mpid/ch3/src/ch3u_rma_pkthandler.c
@@ -96,7 +96,8 @@ void MPIDI_CH3_RMA_Init_pkthandler_pvars(void)
                                       MPI_T_VERBOSITY_MPIDEV_DETAIL,
                                       MPI_T_BIND_NO_OBJECT,
                                       MPIR_T_PVAR_FLAG_READONLY,
-                                      "RMA", "RMA:PKTHANDLER for Get-Accumulate response (in seconds)");
+                                      "RMA",
+                                      "RMA:PKTHANDLER for Get-Accumulate response (in seconds)");
 
     /* rma_rmapkt_cas_resp */
     MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
@@ -105,7 +106,8 @@ void MPIDI_CH3_RMA_Init_pkthandler_pvars(void)
                                       MPI_T_VERBOSITY_MPIDEV_DETAIL,
                                       MPI_T_BIND_NO_OBJECT,
                                       MPIR_T_PVAR_FLAG_READONLY,
-                                      "RMA", "RMA:PKTHANDLER for Compare-and-Swap response (in seconds)");
+                                      "RMA",
+                                      "RMA:PKTHANDLER for Compare-and-Swap response (in seconds)");
 
     /* rma_rmapkt_fop_resp */
     MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
@@ -114,7 +116,8 @@ void MPIDI_CH3_RMA_Init_pkthandler_pvars(void)
                                       MPI_T_VERBOSITY_MPIDEV_DETAIL,
                                       MPI_T_BIND_NO_OBJECT,
                                       MPIR_T_PVAR_FLAG_READONLY,
-                                      "RMA", "RMA:PKTHANDLER for Fetch-and-op response (in seconds)");
+                                      "RMA",
+                                      "RMA:PKTHANDLER for Fetch-and-op response (in seconds)");
 
     /* rma_rmapkt_lock */
     MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
@@ -205,9 +208,9 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPIU_Assert(put_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(put_pkt->target_win_handle, win_ptr);
 
-    mpi_errno = check_piggyback_lock(win_ptr, vc, pkt, buflen,
-                                     &acquire_lock_fail, &req);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    mpi_errno = check_piggyback_lock(win_ptr, vc, pkt, buflen, &acquire_lock_fail, &req);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     if (acquire_lock_fail) {
         (*rreqp) = req;
@@ -222,116 +225,118 @@ int MPIDI_CH3_PktHandler_Put(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         MPIU_Assert(MPIR_DATATYPE_IS_PREDEFINED(put_pkt->datatype));
 
         /* copy data from packet header to target buffer */
-        MPIU_Memcpy(put_pkt->addr, put_pkt->info.data, put_pkt->count*type_size);
+        MPIU_Memcpy(put_pkt->addr, put_pkt->info.data, put_pkt->count * type_size);
 
         /* trigger final action */
-        mpi_errno = finish_op_on_target(win_ptr, vc, FALSE /* has no response data */,
+        mpi_errno = finish_op_on_target(win_ptr, vc, FALSE /* has no response data */ ,
                                         put_pkt->flags, put_pkt->source_win_handle);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         *buflen = sizeof(MPIDI_CH3_Pkt_t);
         *rreqp = NULL;
     }
     else {
-    MPIU_Assert(pkt->type == MPIDI_CH3_PKT_PUT);
+        MPIU_Assert(pkt->type == MPIDI_CH3_PKT_PUT);
 
-    /* get start location of data and length of data */
-    data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
-    data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
+        /* get start location of data and length of data */
+        data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
+        data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
-    req = MPID_Request_create();
-    MPIU_Object_set_ref(req, 1);
+        req = MPID_Request_create();
+        MPIU_Object_set_ref(req, 1);
 
-    req->dev.user_buf = put_pkt->addr;
-    req->dev.user_count = put_pkt->count;
-    req->dev.target_win_handle = put_pkt->target_win_handle;
-    req->dev.source_win_handle = put_pkt->source_win_handle;
-    req->dev.flags = put_pkt->flags;
-    req->dev.OnFinal = MPIDI_CH3_ReqHandler_PutRecvComplete;
+        req->dev.user_buf = put_pkt->addr;
+        req->dev.user_count = put_pkt->count;
+        req->dev.target_win_handle = put_pkt->target_win_handle;
+        req->dev.source_win_handle = put_pkt->source_win_handle;
+        req->dev.flags = put_pkt->flags;
+        req->dev.OnFinal = MPIDI_CH3_ReqHandler_PutRecvComplete;
 
-    if (MPIR_DATATYPE_IS_PREDEFINED(put_pkt->datatype)) {
-        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PUT_RESP);
-        req->dev.datatype = put_pkt->datatype;
+        if (MPIR_DATATYPE_IS_PREDEFINED(put_pkt->datatype)) {
+            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PUT_RESP);
+            req->dev.datatype = put_pkt->datatype;
 
-        req->dev.recv_data_sz = type_size * put_pkt->count;
-        MPIU_Assert(req->dev.recv_data_sz > 0);
+            req->dev.recv_data_sz = type_size * put_pkt->count;
+            MPIU_Assert(req->dev.recv_data_sz > 0);
 
-        mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
-        MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
-                             "**ch3|postrecv %s", "MPIDI_CH3_PKT_PUT");
+            mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
+            MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                                 "**ch3|postrecv %s", "MPIDI_CH3_PKT_PUT");
 
-        /* return the number of bytes processed in this function */
-        *buflen = sizeof(MPIDI_CH3_Pkt_t) + data_len;
+            /* return the number of bytes processed in this function */
+            *buflen = sizeof(MPIDI_CH3_Pkt_t) + data_len;
 
-        if (complete) {
-            mpi_errno = MPIDI_CH3_ReqHandler_PutRecvComplete(vc, req, &complete);
-            if (mpi_errno)
-                MPIU_ERR_POP(mpi_errno);
             if (complete) {
-                *rreqp = NULL;
-                goto fn_exit;
+                mpi_errno = MPIDI_CH3_ReqHandler_PutRecvComplete(vc, req, &complete);
+                if (mpi_errno)
+                    MPIU_ERR_POP(mpi_errno);
+                if (complete) {
+                    *rreqp = NULL;
+                    goto fn_exit;
+                }
             }
         }
-    }
-    else {
-        /* derived datatype */
-        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PUT_RESP_DERIVED_DT);
-        req->dev.datatype = MPI_DATATYPE_NULL;
-
-        req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
-            MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
-        if (!req->dev.dtype_info) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                 "MPIDI_RMA_dtype_info");
-        }
+        else {
+            /* derived datatype */
+            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_PUT_RESP_DERIVED_DT);
+            req->dev.datatype = MPI_DATATYPE_NULL;
+
+            req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
+                MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
+            if (!req->dev.dtype_info) {
+                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
+                                     "MPIDI_RMA_dtype_info");
+            }
 
-        req->dev.dataloop = MPIU_Malloc(put_pkt->info.dataloop_size);
-        if (!req->dev.dataloop) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                 put_pkt->info.dataloop_size);
-        }
+            req->dev.dataloop = MPIU_Malloc(put_pkt->info.dataloop_size);
+            if (!req->dev.dataloop) {
+                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
+                                     put_pkt->info.dataloop_size);
+            }
 
-        /* if we received all of the dtype_info and dataloop, copy it
-         * now and call the handler, otherwise set the iov and let the
-         * channel copy it */
-        if (data_len >= sizeof(MPIDI_RMA_dtype_info) + put_pkt->info.dataloop_size) {
-            /* copy all of dtype_info and dataloop */
-            MPIU_Memcpy(req->dev.dtype_info, data_buf, sizeof(MPIDI_RMA_dtype_info));
-            MPIU_Memcpy(req->dev.dataloop, data_buf + sizeof(MPIDI_RMA_dtype_info),
-                        put_pkt->info.dataloop_size);
+            /* if we received all of the dtype_info and dataloop, copy it
+             * now and call the handler, otherwise set the iov and let the
+             * channel copy it */
+            if (data_len >= sizeof(MPIDI_RMA_dtype_info) + put_pkt->info.dataloop_size) {
+                /* copy all of dtype_info and dataloop */
+                MPIU_Memcpy(req->dev.dtype_info, data_buf, sizeof(MPIDI_RMA_dtype_info));
+                MPIU_Memcpy(req->dev.dataloop, data_buf + sizeof(MPIDI_RMA_dtype_info),
+                            put_pkt->info.dataloop_size);
+
+                *buflen =
+                    sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) +
+                    put_pkt->info.dataloop_size;
+
+                /* All dtype data has been received, call req handler */
+                mpi_errno = MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete(vc, req, &complete);
+                MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                                     "**ch3|postrecv %s", "MPIDI_CH3_PKT_PUT");
+                if (complete) {
+                    *rreqp = NULL;
+                    goto fn_exit;
+                }
+            }
+            else {
+                req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) req->dev.dtype_info);
+                req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
+                req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
+                req->dev.iov[1].MPID_IOV_LEN = put_pkt->info.dataloop_size;
+                req->dev.iov_count = 2;
 
-            *buflen =
-                sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) + put_pkt->info.dataloop_size;
+                *buflen = sizeof(MPIDI_CH3_Pkt_t);
 
-            /* All dtype data has been received, call req handler */
-            mpi_errno = MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete(vc, req, &complete);
-            MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
-                                 "**ch3|postrecv %s", "MPIDI_CH3_PKT_PUT");
-            if (complete) {
-                *rreqp = NULL;
-                goto fn_exit;
+                req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete;
             }
-        }
-        else {
-            req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) req->dev.dtype_info);
-            req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
-            req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
-            req->dev.iov[1].MPID_IOV_LEN = put_pkt->info.dataloop_size;
-            req->dev.iov_count = 2;
 
-            *buflen = sizeof(MPIDI_CH3_Pkt_t);
-
-            req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete;
         }
 
-    }
-
-    *rreqp = req;
+        *rreqp = req;
 
-    if (mpi_errno != MPI_SUCCESS) {
-        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
-                      "**ch3|postrecv %s", "MPIDI_CH3_PKT_PUT");
-    }
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                          "**ch3|postrecv %s", "MPIDI_CH3_PKT_PUT");
+        }
     }
 
 
@@ -371,9 +376,9 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPIU_Assert(get_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(get_pkt->target_win_handle, win_ptr);
 
-    mpi_errno = check_piggyback_lock(win_ptr, vc, pkt,
-                                     buflen, &acquire_lock_fail, &req);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    mpi_errno = check_piggyback_lock(win_ptr, vc, pkt, buflen, &acquire_lock_fail, &req);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     if (acquire_lock_fail) {
         (*rreqp) = req;
@@ -389,7 +394,7 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
     /* here we increment the Active Target counter to guarantee the GET-like
-       operation are completed when counter reaches zero. */
+     * operation are completed when counter reaches zero. */
     win_ptr->at_completion_counter++;
 
     if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
@@ -429,9 +434,10 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
         if (get_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP) {
             MPIU_Assign_trunc(len, get_pkt->count * type_size, size_t);
-            void *src = (void*)(get_pkt->addr), *dest = (void*)(get_resp_pkt->info.data);
+            void *src = (void *) (get_pkt->addr), *dest = (void *) (get_resp_pkt->info.data);
             mpi_errno = immed_copy(src, dest, len);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
 
             iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
             iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
@@ -440,7 +446,7 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         else {
             iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_resp_pkt;
             iov[0].MPID_IOV_LEN = sizeof(*get_resp_pkt);
-            iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)get_pkt->addr);
+            iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) get_pkt->addr);
             iov[1].MPID_IOV_LEN = get_pkt->count * type_size;
             iovcnt = 2;
         }
@@ -493,7 +499,8 @@ int MPIDI_CH3_PktHandler_Get(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
                         get_pkt->info.dataloop_size);
 
             *buflen =
-                sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) + get_pkt->info.dataloop_size;
+                sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) +
+                get_pkt->info.dataloop_size;
 
             /* All dtype data has been received, call req handler */
             mpi_errno = MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete(vc, req, &complete);
@@ -551,9 +558,9 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPIU_Assert(accum_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(accum_pkt->target_win_handle, win_ptr);
 
-    mpi_errno = check_piggyback_lock(win_ptr, vc, pkt, buflen,
-                                     &acquire_lock_fail, &req);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    mpi_errno = check_piggyback_lock(win_ptr, vc, pkt, buflen, &acquire_lock_fail, &req);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     if (acquire_lock_fail) {
         (*rreqp) = req;
@@ -575,119 +582,121 @@ int MPIDI_CH3_PktHandler_Accumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         }
 
         /* trigger final action */
-        mpi_errno = finish_op_on_target(win_ptr, vc, FALSE /* has no response data */,
+        mpi_errno = finish_op_on_target(win_ptr, vc, FALSE /* has no response data */ ,
                                         accum_pkt->flags, accum_pkt->source_win_handle);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         *buflen = sizeof(MPIDI_CH3_Pkt_t);
         *rreqp = NULL;
     }
     else {
-    MPIU_Assert(pkt->type == MPIDI_CH3_PKT_ACCUMULATE);
+        MPIU_Assert(pkt->type == MPIDI_CH3_PKT_ACCUMULATE);
 
-    req = MPID_Request_create();
-    MPIU_Object_set_ref(req, 1);
-    *rreqp = req;
+        req = MPID_Request_create();
+        MPIU_Object_set_ref(req, 1);
+        *rreqp = req;
 
-    req->dev.user_count = accum_pkt->count;
-    req->dev.op = accum_pkt->op;
-    req->dev.real_user_buf = accum_pkt->addr;
-    req->dev.target_win_handle = accum_pkt->target_win_handle;
-    req->dev.source_win_handle = accum_pkt->source_win_handle;
-    req->dev.flags = accum_pkt->flags;
+        req->dev.user_count = accum_pkt->count;
+        req->dev.op = accum_pkt->op;
+        req->dev.real_user_buf = accum_pkt->addr;
+        req->dev.target_win_handle = accum_pkt->target_win_handle;
+        req->dev.source_win_handle = accum_pkt->source_win_handle;
+        req->dev.flags = accum_pkt->flags;
 
-    req->dev.resp_request_handle = MPI_REQUEST_NULL;
-    req->dev.OnFinal = MPIDI_CH3_ReqHandler_AccumRecvComplete;
+        req->dev.resp_request_handle = MPI_REQUEST_NULL;
+        req->dev.OnFinal = MPIDI_CH3_ReqHandler_AccumRecvComplete;
 
-    /* get start location of data and length of data */
-    data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
-    data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
+        /* get start location of data and length of data */
+        data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
+        data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
-    if (MPIR_DATATYPE_IS_PREDEFINED(accum_pkt->datatype)) {
-        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RESP);
-        req->dev.datatype = accum_pkt->datatype;
+        if (MPIR_DATATYPE_IS_PREDEFINED(accum_pkt->datatype)) {
+            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RESP);
+            req->dev.datatype = accum_pkt->datatype;
 
-        MPIR_Type_get_true_extent_impl(accum_pkt->datatype, &true_lb, &true_extent);
-        MPID_Datatype_get_extent_macro(accum_pkt->datatype, extent);
+            MPIR_Type_get_true_extent_impl(accum_pkt->datatype, &true_lb, &true_extent);
+            MPID_Datatype_get_extent_macro(accum_pkt->datatype, extent);
 
-        /* Predefined types should always have zero lb */
-        MPIU_Assert(true_lb == 0);
+            /* Predefined types should always have zero lb */
+            MPIU_Assert(true_lb == 0);
 
-        tmp_buf = MPIU_Malloc(accum_pkt->count * (MPIR_MAX(extent, true_extent)));
-        if (!tmp_buf) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                 accum_pkt->count * MPIR_MAX(extent, true_extent));
-        }
+            tmp_buf = MPIU_Malloc(accum_pkt->count * (MPIR_MAX(extent, true_extent)));
+            if (!tmp_buf) {
+                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
+                                     accum_pkt->count * MPIR_MAX(extent, true_extent));
+            }
 
-        req->dev.user_buf = tmp_buf;
+            req->dev.user_buf = tmp_buf;
 
-        MPID_Datatype_get_size_macro(accum_pkt->datatype, type_size);
-        req->dev.recv_data_sz = type_size * accum_pkt->count;
-        MPIU_Assert(req->dev.recv_data_sz > 0);
+            MPID_Datatype_get_size_macro(accum_pkt->datatype, type_size);
+            req->dev.recv_data_sz = type_size * accum_pkt->count;
+            MPIU_Assert(req->dev.recv_data_sz > 0);
 
-        mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
-        MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
-                             "**ch3|postrecv %s", "MPIDI_CH3_PKT_ACCUMULATE");
+            mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
+            MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                                 "**ch3|postrecv %s", "MPIDI_CH3_PKT_ACCUMULATE");
 
-        /* return the number of bytes processed in this function */
-        *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
+            /* return the number of bytes processed in this function */
+            *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
 
-        if (complete) {
-            mpi_errno = MPIDI_CH3_ReqHandler_AccumRecvComplete(vc, req, &complete);
-            if (mpi_errno)
-                MPIU_ERR_POP(mpi_errno);
             if (complete) {
-                *rreqp = NULL;
-                goto fn_exit;
+                mpi_errno = MPIDI_CH3_ReqHandler_AccumRecvComplete(vc, req, &complete);
+                if (mpi_errno)
+                    MPIU_ERR_POP(mpi_errno);
+                if (complete) {
+                    *rreqp = NULL;
+                    goto fn_exit;
+                }
             }
         }
-    }
-    else {
-        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RESP_DERIVED_DT);
-        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete;
-        req->dev.datatype = MPI_DATATYPE_NULL;
-
-        req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
-            MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
-        if (!req->dev.dtype_info) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                 "MPIDI_RMA_dtype_info");
-        }
-
-        req->dev.dataloop = MPIU_Malloc(accum_pkt->info.dataloop_size);
-        if (!req->dev.dataloop) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                 accum_pkt->info.dataloop_size);
-        }
-
-        if (data_len >= sizeof(MPIDI_RMA_dtype_info) + accum_pkt->info.dataloop_size) {
-            /* copy all of dtype_info and dataloop */
-            MPIU_Memcpy(req->dev.dtype_info, data_buf, sizeof(MPIDI_RMA_dtype_info));
-            MPIU_Memcpy(req->dev.dataloop, data_buf + sizeof(MPIDI_RMA_dtype_info),
-                        accum_pkt->info.dataloop_size);
+        else {
+            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_ACCUM_RESP_DERIVED_DT);
+            req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete;
+            req->dev.datatype = MPI_DATATYPE_NULL;
+
+            req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
+                MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
+            if (!req->dev.dtype_info) {
+                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
+                                     "MPIDI_RMA_dtype_info");
+            }
 
-            *buflen =
-                sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) + accum_pkt->info.dataloop_size;
+            req->dev.dataloop = MPIU_Malloc(accum_pkt->info.dataloop_size);
+            if (!req->dev.dataloop) {
+                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
+                                     accum_pkt->info.dataloop_size);
+            }
 
-            /* All dtype data has been received, call req handler */
-            mpi_errno = MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(vc, req, &complete);
-            MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
-                                 "**ch3|postrecv %s", "MPIDI_CH3_ACCUMULATE");
-            if (complete) {
-                *rreqp = NULL;
-                goto fn_exit;
+            if (data_len >= sizeof(MPIDI_RMA_dtype_info) + accum_pkt->info.dataloop_size) {
+                /* copy all of dtype_info and dataloop */
+                MPIU_Memcpy(req->dev.dtype_info, data_buf, sizeof(MPIDI_RMA_dtype_info));
+                MPIU_Memcpy(req->dev.dataloop, data_buf + sizeof(MPIDI_RMA_dtype_info),
+                            accum_pkt->info.dataloop_size);
+
+                *buflen =
+                    sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) +
+                    accum_pkt->info.dataloop_size;
+
+                /* All dtype data has been received, call req handler */
+                mpi_errno = MPIDI_CH3_ReqHandler_AccumDerivedDTRecvComplete(vc, req, &complete);
+                MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                                     "**ch3|postrecv %s", "MPIDI_CH3_ACCUMULATE");
+                if (complete) {
+                    *rreqp = NULL;
+                    goto fn_exit;
+                }
+            }
+            else {
+                req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dtype_info;
+                req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
+                req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
+                req->dev.iov[1].MPID_IOV_LEN = accum_pkt->info.dataloop_size;
+                req->dev.iov_count = 2;
+                *buflen = sizeof(MPIDI_CH3_Pkt_t);
             }
-        }
-        else {
-            req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dtype_info;
-            req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
-            req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
-            req->dev.iov[1].MPID_IOV_LEN = accum_pkt->info.dataloop_size;
-            req->dev.iov_count = 2;
-            *buflen = sizeof(MPIDI_CH3_Pkt_t);
-        }
 
-    }
+        }
     }
 
     if (mpi_errno != MPI_SUCCESS) {
@@ -733,10 +742,9 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPIU_Assert(get_accum_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(get_accum_pkt->target_win_handle, win_ptr);
 
-    mpi_errno = check_piggyback_lock(win_ptr, vc, pkt,
-                                     buflen,
-                                     &acquire_lock_fail, &req);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    mpi_errno = check_piggyback_lock(win_ptr, vc, pkt, buflen, &acquire_lock_fail, &req);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     if (acquire_lock_fail) {
         (*rreqp) = req;
@@ -771,14 +779,14 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         if (!(get_accum_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP)) {
             tmp_buf = MPIU_Malloc(get_accum_pkt->count * type_size);
             if (!tmp_buf) {
-                MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %d",
+                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
                                      get_accum_pkt->count * type_size);
             }
             resp_req->dev.user_buf = tmp_buf;
         }
 
         /* here we increment the Active Target counter to guarantee the GET-like
-           operation are completed when counter reaches zero. */
+         * operation are completed when counter reaches zero. */
         win_ptr->at_completion_counter++;
 
         /* Calculate the length of reponse data, ensure that it fits into immed packet. */
@@ -827,7 +835,8 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         if (win_ptr->shm_allocated == TRUE)
             MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
 
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
 
         if (get_accum_resp_pkt->type == MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED) {
             iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
@@ -837,7 +846,7 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         else {
             iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) get_accum_resp_pkt;
             iov[0].MPID_IOV_LEN = sizeof(*get_accum_resp_pkt);
-            iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *)resp_req->dev.user_buf);
+            iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) ((char *) resp_req->dev.user_buf);
             iov[1].MPID_IOV_LEN = get_accum_pkt->count * type_size;
             iovcnt = 2;
         }
@@ -854,114 +863,115 @@ int MPIDI_CH3_PktHandler_GetAccumulate(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         /* --END ERROR HANDLING-- */
     }
     else {
-    MPIU_Assert(pkt->type == MPIDI_CH3_PKT_GET_ACCUM);
+        MPIU_Assert(pkt->type == MPIDI_CH3_PKT_GET_ACCUM);
 
-    req = MPID_Request_create();
-    MPIU_Object_set_ref(req, 1);
-    *rreqp = req;
+        req = MPID_Request_create();
+        MPIU_Object_set_ref(req, 1);
+        *rreqp = req;
 
-    req->dev.user_count = get_accum_pkt->count;
-    req->dev.op = get_accum_pkt->op;
-    req->dev.real_user_buf = get_accum_pkt->addr;
-    req->dev.target_win_handle = get_accum_pkt->target_win_handle;
-    req->dev.flags = get_accum_pkt->flags;
+        req->dev.user_count = get_accum_pkt->count;
+        req->dev.op = get_accum_pkt->op;
+        req->dev.real_user_buf = get_accum_pkt->addr;
+        req->dev.target_win_handle = get_accum_pkt->target_win_handle;
+        req->dev.flags = get_accum_pkt->flags;
 
-    req->dev.resp_request_handle = get_accum_pkt->request_handle;
-    req->dev.OnFinal = MPIDI_CH3_ReqHandler_GaccumRecvComplete;
+        req->dev.resp_request_handle = get_accum_pkt->request_handle;
+        req->dev.OnFinal = MPIDI_CH3_ReqHandler_GaccumRecvComplete;
 
-    /* get start location of data and length of data */
-    data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
-    data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
+        /* get start location of data and length of data */
+        data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
+        data_buf = (char *) pkt + sizeof(MPIDI_CH3_Pkt_t);
 
-    if (MPIR_DATATYPE_IS_PREDEFINED(get_accum_pkt->datatype)) {
-        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);
-        req->dev.datatype = get_accum_pkt->datatype;
+        if (MPIR_DATATYPE_IS_PREDEFINED(get_accum_pkt->datatype)) {
+            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP);
+            req->dev.datatype = get_accum_pkt->datatype;
 
-        MPIR_Type_get_true_extent_impl(get_accum_pkt->datatype, &true_lb, &true_extent);
-        MPID_Datatype_get_extent_macro(get_accum_pkt->datatype, extent);
+            MPIR_Type_get_true_extent_impl(get_accum_pkt->datatype, &true_lb, &true_extent);
+            MPID_Datatype_get_extent_macro(get_accum_pkt->datatype, extent);
 
-        /* Predefined types should always have zero lb */
-        MPIU_Assert(true_lb == 0);
+            /* Predefined types should always have zero lb */
+            MPIU_Assert(true_lb == 0);
 
-        tmp_buf = MPIU_Malloc(get_accum_pkt->count * (MPIR_MAX(extent, true_extent)));
-        if (!tmp_buf) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                 get_accum_pkt->count * MPIR_MAX(extent, true_extent));
-        }
+            tmp_buf = MPIU_Malloc(get_accum_pkt->count * (MPIR_MAX(extent, true_extent)));
+            if (!tmp_buf) {
+                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
+                                     get_accum_pkt->count * MPIR_MAX(extent, true_extent));
+            }
 
-        req->dev.user_buf = tmp_buf;
+            req->dev.user_buf = tmp_buf;
 
-        req->dev.recv_data_sz = type_size * get_accum_pkt->count;
-        MPIU_Assert(req->dev.recv_data_sz > 0);
+            req->dev.recv_data_sz = type_size * get_accum_pkt->count;
+            MPIU_Assert(req->dev.recv_data_sz > 0);
 
-        mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
-        MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
-                             "**ch3|postrecv %s", "MPIDI_CH3_PKT_ACCUMULATE");
+            mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
+            MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                                 "**ch3|postrecv %s", "MPIDI_CH3_PKT_ACCUMULATE");
 
-        /* return the number of bytes processed in this function */
-        *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
+            /* return the number of bytes processed in this function */
+            *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
 
-        if (complete) {
-            mpi_errno = MPIDI_CH3_ReqHandler_GaccumRecvComplete(vc, req, &complete);
-            if (mpi_errno)
-                MPIU_ERR_POP(mpi_errno);
             if (complete) {
-                *rreqp = NULL;
-                goto fn_exit;
+                mpi_errno = MPIDI_CH3_ReqHandler_GaccumRecvComplete(vc, req, &complete);
+                if (mpi_errno)
+                    MPIU_ERR_POP(mpi_errno);
+                if (complete) {
+                    *rreqp = NULL;
+                    goto fn_exit;
+                }
             }
         }
-    }
-    else {
-        MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP_DERIVED_DT);
-        req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete;
-        req->dev.datatype = MPI_DATATYPE_NULL;
-
-        req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
-            MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
-        if (!req->dev.dtype_info) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
-                                 "MPIDI_RMA_dtype_info");
-        }
-
-        req->dev.dataloop = MPIU_Malloc(get_accum_pkt->info.dataloop_size);
-        if (!req->dev.dataloop) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                 get_accum_pkt->info.dataloop_size);
-        }
-
-        if (data_len >= sizeof(MPIDI_RMA_dtype_info) + get_accum_pkt->info.dataloop_size) {
-            /* copy all of dtype_info and dataloop */
-            MPIU_Memcpy(req->dev.dtype_info, data_buf, sizeof(MPIDI_RMA_dtype_info));
-            MPIU_Memcpy(req->dev.dataloop, data_buf + sizeof(MPIDI_RMA_dtype_info),
-                        get_accum_pkt->info.dataloop_size);
+        else {
+            MPIDI_Request_set_type(req, MPIDI_REQUEST_TYPE_GET_ACCUM_RESP_DERIVED_DT);
+            req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete;
+            req->dev.datatype = MPI_DATATYPE_NULL;
+
+            req->dev.dtype_info = (MPIDI_RMA_dtype_info *)
+                MPIU_Malloc(sizeof(MPIDI_RMA_dtype_info));
+            if (!req->dev.dtype_info) {
+                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
+                                     "MPIDI_RMA_dtype_info");
+            }
 
-            *buflen =
-                sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) + get_accum_pkt->info.dataloop_size;
+            req->dev.dataloop = MPIU_Malloc(get_accum_pkt->info.dataloop_size);
+            if (!req->dev.dataloop) {
+                MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
+                                     get_accum_pkt->info.dataloop_size);
+            }
 
-            /* All dtype data has been received, call req handler */
-            mpi_errno = MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(vc, req, &complete);
-            MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
-                                 "**ch3|postrecv %s", "MPIDI_CH3_ACCUMULATE");
-            if (complete) {
-                *rreqp = NULL;
-                goto fn_exit;
+            if (data_len >= sizeof(MPIDI_RMA_dtype_info) + get_accum_pkt->info.dataloop_size) {
+                /* copy all of dtype_info and dataloop */
+                MPIU_Memcpy(req->dev.dtype_info, data_buf, sizeof(MPIDI_RMA_dtype_info));
+                MPIU_Memcpy(req->dev.dataloop, data_buf + sizeof(MPIDI_RMA_dtype_info),
+                            get_accum_pkt->info.dataloop_size);
+
+                *buflen =
+                    sizeof(MPIDI_CH3_Pkt_t) + sizeof(MPIDI_RMA_dtype_info) +
+                    get_accum_pkt->info.dataloop_size;
+
+                /* All dtype data has been received, call req handler */
+                mpi_errno = MPIDI_CH3_ReqHandler_GaccumDerivedDTRecvComplete(vc, req, &complete);
+                MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                                     "**ch3|postrecv %s", "MPIDI_CH3_ACCUMULATE");
+                if (complete) {
+                    *rreqp = NULL;
+                    goto fn_exit;
+                }
+            }
+            else {
+                req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dtype_info;
+                req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
+                req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
+                req->dev.iov[1].MPID_IOV_LEN = get_accum_pkt->info.dataloop_size;
+                req->dev.iov_count = 2;
+                *buflen = sizeof(MPIDI_CH3_Pkt_t);
             }
-        }
-        else {
-            req->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dtype_info;
-            req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_RMA_dtype_info);
-            req->dev.iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) req->dev.dataloop;
-            req->dev.iov[1].MPID_IOV_LEN = get_accum_pkt->info.dataloop_size;
-            req->dev.iov_count = 2;
-            *buflen = sizeof(MPIDI_CH3_Pkt_t);
-        }
 
-    }
+        }
 
-    if (mpi_errno != MPI_SUCCESS) {
-        MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
-                             "**ch3|postrecv %s", "MPIDI_CH3_PKT_ACCUMULATE");
-    }
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                                 "**ch3|postrecv %s", "MPIDI_CH3_PKT_ACCUMULATE");
+        }
     }
 
   fn_exit:
@@ -1001,11 +1011,11 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     MPIU_Assert(cas_pkt->target_win_handle != MPI_WIN_NULL);
     MPID_Win_get_ptr(cas_pkt->target_win_handle, win_ptr);
 
-    mpi_errno = check_piggyback_lock(win_ptr, vc, pkt, buflen,
-                                     &acquire_lock_fail, &rreq);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
-    MPIU_Assert(rreq == NULL); /* CAS should not have request because all data
-                                  can fit in packet header */
+    mpi_errno = check_piggyback_lock(win_ptr, vc, pkt, buflen, &acquire_lock_fail, &rreq);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
+    MPIU_Assert(rreq == NULL);  /* CAS should not have request because all data
+                                 * can fit in packet header */
 
     if (acquire_lock_fail) {
         (*rreqp) = rreq;
@@ -1055,13 +1065,13 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     if (req != NULL) {
         if (!MPID_Request_is_complete(req)) {
             /* sending process is not completed, set proper OnDataAvail
-               (it is initialized to NULL by lower layer) */
+             * (it is initialized to NULL by lower layer) */
             req->dev.target_win_handle = cas_pkt->target_win_handle;
             req->dev.flags = cas_pkt->flags;
             req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_CASSendComplete;
 
             /* here we increment the Active Target counter to guarantee the GET-like
-               operation are completed when counter reaches zero. */
+             * operation are completed when counter reaches zero. */
             win_ptr->at_completion_counter++;
 
             MPID_Request_release(req);
@@ -1071,9 +1081,10 @@ int MPIDI_CH3_PktHandler_CAS(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
             MPID_Request_release(req);
     }
 
-    mpi_errno = finish_op_on_target(win_ptr, vc, TRUE /* has response data */,
+    mpi_errno = finish_op_on_target(win_ptr, vc, TRUE /* has response data */ ,
                                     cas_pkt->flags, MPI_WIN_NULL);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
   fn_exit:
     MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_cas);
@@ -1112,17 +1123,18 @@ int MPIDI_CH3_PktHandler_CASResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
 
     /* decrement ack_counter on this target */
     if (cas_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED) {
-        mpi_errno = handle_lock_ack(win_ptr, target_rank,
-                                          cas_resp_pkt->flags);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = handle_lock_ack(win_ptr, target_rank, cas_resp_pkt->flags);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
 
-        mpi_errno = adjust_op_piggybacked_with_lock(win_ptr, target_rank,
-                                                    cas_resp_pkt->flags);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = adjust_op_piggybacked_with_lock(win_ptr, target_rank, cas_resp_pkt->flags);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
     }
     if (cas_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
         mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     MPID_Datatype_get_size_macro(req->dev.datatype, len);
@@ -1168,12 +1180,12 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPID_Win_get_ptr(fop_pkt->target_win_handle, win_ptr);
 
-    mpi_errno = check_piggyback_lock(win_ptr, vc, pkt, buflen,
-                                     &acquire_lock_fail, &rreq);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    mpi_errno = check_piggyback_lock(win_ptr, vc, pkt, buflen, &acquire_lock_fail, &rreq);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
-    MPIU_Assert(rreq == NULL); /* FOP should not have request because all data
-                                  can fit in packet header */
+    MPIU_Assert(rreq == NULL);  /* FOP should not have request because all data
+                                 * can fit in packet header */
     if (acquire_lock_fail) {
         (*rreqp) = rreq;
         goto fn_exit;
@@ -1186,71 +1198,73 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     if (pkt->type == MPIDI_CH3_PKT_FOP_IMMED) {
 
-    MPIU_Assert(fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP);
+        MPIU_Assert(fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP);
 
-    MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP_IMMED);
-    fop_resp_pkt->request_handle = fop_pkt->request_handle;
-    fop_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
-    fop_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
-    if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
-        fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE)
-        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
-    if ((fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
-        (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
-        fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
+        MPIDI_Pkt_init(fop_resp_pkt, MPIDI_CH3_PKT_FOP_RESP_IMMED);
+        fop_resp_pkt->request_handle = fop_pkt->request_handle;
+        fop_resp_pkt->target_rank = win_ptr->comm_ptr->rank;
+        fop_resp_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
+        if (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_SHARED ||
+            fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_EXCLUSIVE)
+            fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED;
+        if ((fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH) ||
+            (fop_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK))
+            fop_resp_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK;
 
-    if (win_ptr->shm_allocated == TRUE)
-        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-
-    /* copy data to resp pkt header */
-    void *src = fop_pkt->addr, *dest = fop_resp_pkt->info.data;
-    mpi_errno = immed_copy(src, dest, type_size);
-    if (mpi_errno != MPI_SUCCESS) {
         if (win_ptr->shm_allocated == TRUE)
-            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-        MPIU_ERR_POP(mpi_errno);
-    }
+            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 
-    /* Apply the op */
-    if (fop_pkt->op != MPI_NO_OP) {
-        mpi_errno = do_accumulate_op(fop_pkt->info.data, fop_pkt->addr,
-                                     1, fop_pkt->datatype, fop_pkt->op);
-    }
+        /* copy data to resp pkt header */
+        void *src = fop_pkt->addr, *dest = fop_resp_pkt->info.data;
+        mpi_errno = immed_copy(src, dest, type_size);
+        if (mpi_errno != MPI_SUCCESS) {
+            if (win_ptr->shm_allocated == TRUE)
+                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+            MPIU_ERR_POP(mpi_errno);
+        }
 
-    if (win_ptr->shm_allocated == TRUE)
-        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+        /* Apply the op */
+        if (fop_pkt->op != MPI_NO_OP) {
+            mpi_errno = do_accumulate_op(fop_pkt->info.data, fop_pkt->addr,
+                                         1, fop_pkt->datatype, fop_pkt->op);
+        }
 
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (win_ptr->shm_allocated == TRUE)
+            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
 
-    /* send back the original data */
-    MPIU_THREAD_CS_ENTER(CH3COMM,vc);
-    mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_resp_pkt, sizeof(*fop_resp_pkt), &resp_req);
-    MPIU_THREAD_CS_EXIT(CH3COMM,vc);
-    MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
-    if (resp_req != NULL) {
-        if (!MPID_Request_is_complete(resp_req)) {
-            /* sending process is not completed, set proper OnDataAvail
-               (it is initialized to NULL by lower layer) */
-            resp_req->dev.target_win_handle = fop_pkt->target_win_handle;
-            resp_req->dev.flags = fop_pkt->flags;
-            resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_FOPSendComplete;
+        /* send back the original data */
+        MPIU_THREAD_CS_ENTER(CH3COMM, vc);
+        mpi_errno = MPIDI_CH3_iStartMsg(vc, fop_resp_pkt, sizeof(*fop_resp_pkt), &resp_req);
+        MPIU_THREAD_CS_EXIT(CH3COMM, vc);
+        MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rmamsg");
 
-            /* here we increment the Active Target counter to guarantee the GET-like
-               operation are completed when counter reaches zero. */
-            win_ptr->at_completion_counter++;
+        if (resp_req != NULL) {
+            if (!MPID_Request_is_complete(resp_req)) {
+                /* sending process is not completed, set proper OnDataAvail
+                 * (it is initialized to NULL by lower layer) */
+                resp_req->dev.target_win_handle = fop_pkt->target_win_handle;
+                resp_req->dev.flags = fop_pkt->flags;
+                resp_req->dev.OnDataAvail = MPIDI_CH3_ReqHandler_FOPSendComplete;
 
-            MPID_Request_release(resp_req);
-            goto fn_exit;
-        }
-        else {
-            MPID_Request_release(resp_req);
+                /* here we increment the Active Target counter to guarantee the GET-like
+                 * operation are completed when counter reaches zero. */
+                win_ptr->at_completion_counter++;
+
+                MPID_Request_release(resp_req);
+                goto fn_exit;
+            }
+            else {
+                MPID_Request_release(resp_req);
+            }
         }
-    }
 
-    mpi_errno = finish_op_on_target(win_ptr, vc, TRUE /* has response data */,
-                                    fop_pkt->flags, MPI_WIN_NULL);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = finish_op_on_target(win_ptr, vc, TRUE /* has response data */ ,
+                                        fop_pkt->flags, MPI_WIN_NULL);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
     else {
         MPIU_Assert(pkt->type == MPIDI_CH3_PKT_FOP);
@@ -1285,8 +1299,7 @@ int MPIDI_CH3_PktHandler_FOP(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
         req->dev.user_buf = MPIU_Malloc(extent);
         if (!req->dev.user_buf) {
-            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d",
-                                 extent);
+            MPIU_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %d", extent);
         }
 
         req->dev.recv_data_sz = type_size;
@@ -1351,17 +1364,18 @@ int MPIDI_CH3_PktHandler_FOPResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
 
     /* decrement ack_counter */
     if (fop_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED) {
-        mpi_errno = handle_lock_ack(win_ptr, target_rank,
-                                          fop_resp_pkt->flags);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = handle_lock_ack(win_ptr, target_rank, fop_resp_pkt->flags);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
 
-        mpi_errno = adjust_op_piggybacked_with_lock(win_ptr, target_rank,
-                                                    fop_resp_pkt->flags);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = adjust_op_piggybacked_with_lock(win_ptr, target_rank, fop_resp_pkt->flags);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
     }
     if (fop_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
         mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
@@ -1384,16 +1398,16 @@ int MPIDI_CH3_PktHandler_FOPResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
         MPIU_Assert(fop_resp_pkt->type == MPIDI_CH3_PKT_FOP_RESP);
 
         mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
-        MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv", "**ch3|postrecv %s",
-                             "MPIDI_CH3_PKT_FOP_RESP");
+        MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                             "**ch3|postrecv %s", "MPIDI_CH3_PKT_FOP_RESP");
 
         /* return the number of bytes processed in this function */
         *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
     }
 
     if (complete) {
-    MPIDI_CH3U_Request_complete(req);
-    *rreqp = NULL;
+        MPIDI_CH3U_Request_complete(req);
+        *rreqp = NULL;
     }
 
   fn_exit:
@@ -1436,17 +1450,19 @@ int MPIDI_CH3_PktHandler_Get_AccumResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     /* decrement ack_counter on target */
     if (get_accum_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED) {
-        mpi_errno = handle_lock_ack(win_ptr, target_rank,
-                                          get_accum_resp_pkt->flags);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = handle_lock_ack(win_ptr, target_rank, get_accum_resp_pkt->flags);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
 
         mpi_errno = adjust_op_piggybacked_with_lock(win_ptr, target_rank,
                                                     get_accum_resp_pkt->flags);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
     }
     if (get_accum_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
         mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
@@ -1464,23 +1480,24 @@ int MPIDI_CH3_PktHandler_Get_AccumResp(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         complete = 1;
     }
     else {
-    MPIU_Assert(pkt->type == MPIDI_CH3_PKT_GET_ACCUM_RESP);
+        MPIU_Assert(pkt->type == MPIDI_CH3_PKT_GET_ACCUM_RESP);
 
-    mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
-    MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
-                         "**ch3|postrecv %s", "MPIDI_CH3_PKT_GET_ACCUM_RESP");
+        mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
+        MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                             "**ch3|postrecv %s", "MPIDI_CH3_PKT_GET_ACCUM_RESP");
 
-    /* return the number of bytes processed in this function */
-    *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
+        /* return the number of bytes processed in this function */
+        *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
     }
     if (complete) {
         /* Request-based RMA defines final actions for completing user request. */
-        int (*reqFn)(MPIDI_VC_t *, MPID_Request *, int *);
+        int (*reqFn) (MPIDI_VC_t *, MPID_Request *, int *);
         reqFn = req->dev.OnFinal;
 
         if (reqFn) {
             mpi_errno = reqFn(vc, req, &complete);
-        } else {
+        }
+        else {
             MPIDI_CH3U_Request_complete(req);
         }
         *rreqp = NULL;
@@ -1530,13 +1547,15 @@ int MPIDI_CH3_PktHandler_Lock(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         mpi_errno = MPIDI_CH3I_Send_lock_ack_pkt(vc, win_ptr, MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED,
                                                  lock_pkt->source_win_handle,
                                                  lock_pkt->request_handle);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     else {
         MPID_Request *req = NULL;
         mpi_errno = enqueue_lock_origin(win_ptr, vc, pkt, buflen, &req);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
         MPIU_Assert(req == NULL);
     }
 
@@ -1578,17 +1597,18 @@ int MPIDI_CH3_PktHandler_GetResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
 
     /* decrement ack_counter on target */
     if (get_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED) {
-        mpi_errno = handle_lock_ack(win_ptr, target_rank,
-                                          get_resp_pkt->flags);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = handle_lock_ack(win_ptr, target_rank, get_resp_pkt->flags);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
 
-        mpi_errno = adjust_op_piggybacked_with_lock(win_ptr, target_rank,
-                                                    get_resp_pkt->flags);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = adjust_op_piggybacked_with_lock(win_ptr, target_rank, get_resp_pkt->flags);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
     }
     if (get_resp_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
         mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     data_len = *buflen - sizeof(MPIDI_CH3_Pkt_t);
@@ -1607,24 +1627,25 @@ int MPIDI_CH3_PktHandler_GetResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
         complete = 1;
     }
     else {
-    MPIU_Assert(get_resp_pkt->type == MPIDI_CH3_PKT_GET_RESP);
+        MPIU_Assert(get_resp_pkt->type == MPIDI_CH3_PKT_GET_RESP);
 
-    mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
-    MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv", "**ch3|postrecv %s",
-                         "MPIDI_CH3_PKT_GET_RESP");
+        mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len, &complete);
+        MPIU_ERR_CHKANDJUMP1(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|postrecv",
+                             "**ch3|postrecv %s", "MPIDI_CH3_PKT_GET_RESP");
 
-    /* return the number of bytes processed in this function */
-    *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
+        /* return the number of bytes processed in this function */
+        *buflen = data_len + sizeof(MPIDI_CH3_Pkt_t);
     }
 
     if (complete) {
         /* Request-based RMA defines final actions for completing user request. */
-        int (*reqFn)(MPIDI_VC_t *, MPID_Request *, int *);
+        int (*reqFn) (MPIDI_VC_t *, MPID_Request *, int *);
         reqFn = req->dev.OnFinal;
 
         if (reqFn) {
             mpi_errno = reqFn(vc, req, &complete);
-        } else {
+        }
+        else {
             MPIDI_CH3U_Request_complete(req);
         }
 
@@ -1644,7 +1665,7 @@ int MPIDI_CH3_PktHandler_GetResp(MPIDI_VC_t * vc ATTRIBUTE((unused)),
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_CH3_PktHandler_LockAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
-                                     MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
+                                 MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
 {
     MPIDI_CH3_Pkt_lock_ack_t *lock_ack_pkt = &pkt->lock_ack;
     MPID_Win *win_ptr = NULL;
@@ -1661,7 +1682,7 @@ int MPIDI_CH3_PktHandler_LockAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     *buflen = sizeof(MPIDI_CH3_Pkt_t);
 
     if (lock_ack_pkt->source_win_handle != MPI_WIN_NULL) {
-    MPID_Win_get_ptr(lock_ack_pkt->source_win_handle, win_ptr);
+        MPID_Win_get_ptr(lock_ack_pkt->source_win_handle, win_ptr);
     }
     else {
         MPIU_Assert(lock_ack_pkt->request_handle != MPI_REQUEST_NULL);
@@ -1672,18 +1693,18 @@ int MPIDI_CH3_PktHandler_LockAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         MPID_Win_get_ptr(req_ptr->dev.source_win_handle, win_ptr);
     }
 
-    mpi_errno = handle_lock_ack(win_ptr, target_rank,
-                                      lock_ack_pkt->flags);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    mpi_errno = handle_lock_ack(win_ptr, target_rank, lock_ack_pkt->flags);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
 
     *rreqp = NULL;
     MPIDI_CH3_Progress_signal_completion();
 
     MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_lock_ack);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKACK);
- fn_exit:
+  fn_exit:
     return MPI_SUCCESS;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -1706,7 +1727,7 @@ int MPIDI_CH3_PktHandler_LockOpAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
     *buflen = sizeof(MPIDI_CH3_Pkt_t);
 
     if (lock_op_ack_pkt->source_win_handle != MPI_WIN_NULL) {
-    MPID_Win_get_ptr(lock_op_ack_pkt->source_win_handle, win_ptr);
+        MPID_Win_get_ptr(lock_op_ack_pkt->source_win_handle, win_ptr);
     }
     else {
         MPIU_Assert(lock_op_ack_pkt->request_handle != MPI_REQUEST_NULL);
@@ -1717,27 +1738,28 @@ int MPIDI_CH3_PktHandler_LockOpAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
         MPID_Win_get_ptr(req_ptr->dev.source_win_handle, win_ptr);
     }
 
-    mpi_errno = handle_lock_ack(win_ptr, target_rank,
-                                      lock_op_ack_pkt->flags);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    mpi_errno = handle_lock_ack(win_ptr, target_rank, lock_op_ack_pkt->flags);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
 
-    mpi_errno = adjust_op_piggybacked_with_lock(win_ptr, target_rank,
-                                                lock_op_ack_pkt->flags);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    mpi_errno = adjust_op_piggybacked_with_lock(win_ptr, target_rank, lock_op_ack_pkt->flags);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     if (flags & MPIDI_CH3_PKT_FLAG_RMA_FLUSH_ACK) {
         MPIU_Assert(flags & MPIDI_CH3_PKT_FLAG_RMA_LOCK_GRANTED);
         mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     *rreqp = NULL;
     MPIDI_CH3_Progress_signal_completion();
 
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_LOCKOPACK);
- fn_exit:
+  fn_exit:
     return MPI_SUCCESS;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -1746,7 +1768,7 @@ int MPIDI_CH3_PktHandler_LockOpAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
 int MPIDI_CH3_PktHandler_FlushAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
-                                   MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
+                                  MPIDI_msg_sz_t * buflen, MPID_Request ** rreqp)
 {
     MPIDI_CH3_Pkt_flush_ack_t *flush_ack_pkt = &pkt->flush_ack;
     MPID_Win *win_ptr = NULL;
@@ -1766,16 +1788,17 @@ int MPIDI_CH3_PktHandler_FlushAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     /* decrement ack_counter on target */
     mpi_errno = MPIDI_CH3I_RMA_Handle_flush_ack(win_ptr, target_rank);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
 
     *rreqp = NULL;
     MPIDI_CH3_Progress_signal_completion();
 
     MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_flush_ack);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_FLUSHACK);
- fn_exit:
+  fn_exit:
     return MPI_SUCCESS;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -1806,11 +1829,11 @@ int MPIDI_CH3_PktHandler_DecrAtCnt(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     *rreqp = NULL;
     MPIDI_CH3_Progress_signal_completion();
 
- fn_exit:
+  fn_exit:
     MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_decr_at_cnt);
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PKTHANDLER_DECRATCNT);
     return mpi_errno;
-   fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -1841,9 +1864,9 @@ int MPIDI_CH3_PktHandler_Unlock(MPIDI_VC_t * vc ATTRIBUTE((unused)),
     MPIU_ERR_CHKANDJUMP(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**ch3|rma_msg");
 
     if (!(unlock_pkt->flags & MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_NO_ACK)) {
-        mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr,
-                                                  unlock_pkt->source_win_handle);
-        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, unlock_pkt->source_win_handle);
+        if (mpi_errno)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     MPIDI_CH3_Progress_signal_completion();
@@ -1881,9 +1904,9 @@ int MPIDI_CH3_PktHandler_Flush(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt,
 
     MPID_Win_get_ptr(flush_pkt->target_win_handle, win_ptr);
 
-    mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr,
-                                              flush_pkt->source_win_handle);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    mpi_errno = MPIDI_CH3I_Send_flush_ack_pkt(vc, win_ptr, flush_pkt->source_win_handle);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
   fn_exit:
     MPIR_T_PVAR_TIMER_END(RMA, rma_rmapkt_flush);
diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index a28392a..94de1c1 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -319,8 +319,8 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
         }
         else {
             /* It is possible that there is a IBARRIER in MPI_WIN_FENCE with
-               MODE_NOPRECEDE not being completed, we let the progress engine
-               to delete its request when it is completed. */
+             * MODE_NOPRECEDE not being completed, we let the progress engine
+             * to delete its request when it is completed. */
             if (win_ptr->fence_sync_req != MPI_REQUEST_NULL) {
                 MPID_Request *req_ptr;
                 MPID_Request_get_ptr(win_ptr->fence_sync_req, req_ptr);
@@ -333,16 +333,18 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
                 MPID_Comm *node_comm_ptr = win_ptr->comm_ptr->node_comm;
 
                 mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
-                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
                 MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
                 /* Mark that we triggered the progress engine
-                   in this function call. */
+                 * in this function call. */
                 progress_engine_triggered = 1;
             }
 
             mpi_errno = MPIR_Ibarrier_impl(win_ptr->comm_ptr, &(win_ptr->fence_sync_req));
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
 
             /* Set window access state properly. */
             win_ptr->states.access_state = MPIDI_RMA_FENCE_ISSUED;
@@ -359,7 +361,7 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
                 MPIU_ERR_POP(mpi_errno);
 
             /* Mark that we triggered the progress engine
-               in this function call. */
+             * in this function call. */
             progress_engine_triggered = 1;
         }
     }
@@ -381,35 +383,37 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
 
     /* Issue out all operations. */
     mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     /* Wait for remote completion. */
     do {
-        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr,
-                                                   &local_completed,
-                                                   &remote_completed);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed, &remote_completed);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
         if (!remote_completed) {
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
 
             /* Mark that we triggered the progress engine
-               in this function call. */
+             * in this function call. */
             progress_engine_triggered = 1;
         }
     } while (!remote_completed);
 
     /* Cleanup all targets on window. */
     mpi_errno = MPIDI_CH3I_RMA_Cleanup_targets_win(win_ptr);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     mpi_errno = MPIR_Barrier_impl(win_ptr->comm_ptr, &errflag);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
     MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
     /* Mark that we triggered the progress engine
-       in this function call. */
+     * in this function call. */
     progress_engine_triggered = 1;
 
     /* Set window access state properly. */
@@ -420,7 +424,7 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
         win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
     }
 
- finish_fence:
+  finish_fence:
     /* Make sure that all targets are freed. */
     MPIU_Assert(win_ptr->non_empty_slots == 0);
 
@@ -438,12 +442,12 @@ int MPIDI_Win_fence(int assert, MPID_Win * win_ptr)
     if (!(assert & MPI_MODE_NOPRECEDE)) {
         if (!progress_engine_triggered) {
             /* In some cases (e.g. target is myself, or process on SHM),
-               this function call does not go through the progress engine.
-               Therefore, it is possible that this process never process
-               events coming from other processes. This may cause deadlock in
-               applications where the program execution on this process depends
-               on the happening of events from other processes. Here we poke
-               the progress engine once to avoid such issue.  */
+             * this function call does not go through the progress engine.
+             * Therefore, it is possible that this process never process
+             * events coming from other processes. This may cause deadlock in
+             * applications where the program execution on this process depends
+             * on the happening of events from other processes. Here we poke
+             * the progress engine once to avoid such issue.  */
             mpi_errno = poke_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
@@ -479,9 +483,9 @@ int MPIDI_Win_post(MPID_Group * post_grp_ptr, int assert, MPID_Win * win_ptr)
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_POST);
 
     /* Note that here we cannot distinguish if this exposure epoch is overlapped
-       with an exposure epoch of FENCE (which is not allowed), since FENCE may be
-       ended up with not unsetting the window state. We can only detect if this
-       exposure epoch is overlapped with another exposure epoch of PSCW. */
+     * with an exposure epoch of FENCE (which is not allowed), since FENCE may be
+     * ended up with not unsetting the window state. We can only detect if this
+     * exposure epoch is overlapped with another exposure epoch of PSCW. */
     MPIU_ERR_CHKANDJUMP(win_ptr->states.exposure_state != MPIDI_RMA_NONE,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
@@ -511,7 +515,8 @@ int MPIDI_Win_post(MPID_Group * post_grp_ptr, int assert, MPID_Win * win_ptr)
         MPIU_CHKLMEM_MALLOC(post_ranks_in_win_grp, int *,
                             post_grp_size * sizeof(int), mpi_errno, "post_ranks_in_win_grp");
         mpi_errno = fill_ranks_in_win_grp(win_ptr, post_grp_ptr, post_ranks_in_win_grp);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
 
         MPIU_CHKLMEM_MALLOC(req, MPI_Request *, post_grp_size * sizeof(MPI_Request),
                             mpi_errno, "req");
@@ -526,7 +531,8 @@ int MPIDI_Win_post(MPID_Group * post_grp_ptr, int assert, MPID_Win * win_ptr)
                 MPID_Request *req_ptr;
                 mpi_errno = MPID_Isend(&i, 0, MPI_INT, dst, SYNC_POST_TAG, win_comm_ptr,
                                        MPID_CONTEXT_INTRA_PT2PT, &req_ptr);
-                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
                 req[i] = req_ptr->handle;
             }
             else {
@@ -575,10 +581,10 @@ int MPIDI_Win_start(MPID_Group * group_ptr, int assert, MPID_Win * win_ptr)
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_START);
 
     /* Note that here we cannot distinguish if this access epoch is overlapped
-       with an access epoch of FENCE (which is not allowed), since FENCE may be
-       ended up with not unsetting the window state. We can only detect if this
-       access epoch is overlapped with another access epoch of PSCW or Passive
-       Target. */
+     * with an access epoch of FENCE (which is not allowed), since FENCE may be
+     * ended up with not unsetting the window state. We can only detect if this
+     * access epoch is overlapped with another access epoch of PSCW or Passive
+     * Target. */
     MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_NONE &&
                         win_ptr->states.access_state != MPIDI_RMA_FENCE_ISSUED &&
                         win_ptr->states.access_state != MPIDI_RMA_FENCE_GRANTED,
@@ -591,7 +597,8 @@ int MPIDI_Win_start(MPID_Group * group_ptr, int assert, MPID_Win * win_ptr)
                         mpi_errno, "win_ptr->start_ranks_in_win_grp");
 
     mpi_errno = fill_ranks_in_win_grp(win_ptr, group_ptr, win_ptr->start_ranks_in_win_grp);
-    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno)
+        MPIU_ERR_POP(mpi_errno);
 
     if ((assert & MPI_MODE_NOCHECK) == 0) {
         int i, intra_cnt;
@@ -610,8 +617,7 @@ int MPIDI_Win_start(MPID_Group * group_ptr, int assert, MPID_Win * win_ptr)
         if (win_ptr->shm_allocated == TRUE) {
             int node_comm_size = comm_ptr->node_comm->local_size;
             MPIU_CHKLMEM_MALLOC(intra_start_req, MPI_Request *,
-                                node_comm_size * sizeof(MPI_Request),
-                                mpi_errno, "intra_start_req");
+                                node_comm_size * sizeof(MPI_Request), mpi_errno, "intra_start_req");
             MPIU_CHKLMEM_MALLOC(intra_start_status, MPI_Status *,
                                 node_comm_size * sizeof(MPI_Status),
                                 mpi_errno, "intra_start_status");
@@ -629,10 +635,10 @@ int MPIDI_Win_start(MPID_Group * group_ptr, int assert, MPID_Win * win_ptr)
 
                 mpi_errno = MPID_Irecv(NULL, 0, MPI_INT, src, SYNC_POST_TAG,
                                        comm_ptr, MPID_CONTEXT_INTRA_PT2PT, &req_ptr);
-                if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+                if (mpi_errno != MPI_SUCCESS)
+                    MPIU_ERR_POP(mpi_errno);
 
-                if (win_ptr->shm_allocated == TRUE &&
-                    orig_vc->node_id == target_vc->node_id) {
+                if (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id) {
                     intra_start_req[intra_cnt++] = req_ptr->handle;
                     win_ptr->start_req[i] = MPI_REQUEST_NULL;
                 }
@@ -663,7 +669,7 @@ int MPIDI_Win_start(MPID_Group * group_ptr, int assert, MPID_Win * win_ptr)
         }
     }
 
- finish_start:
+  finish_start:
     /* Set window access state properly. */
     win_ptr->states.access_state = MPIDI_RMA_PSCW_ISSUED;
     num_active_issued_win++;
@@ -678,11 +684,11 @@ int MPIDI_Win_start(MPID_Group * group_ptr, int assert, MPID_Win * win_ptr)
         OPA_read_write_barrier();
     }
 
- fn_exit:
+  fn_exit:
     MPIU_CHKLMEM_FREEALL();
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_START);
     return mpi_errno;
- fn_fail:
+  fn_fail:
     MPIU_CHKPMEM_REAP();
     goto fn_exit;
 }
@@ -723,7 +729,7 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
                 MPIU_ERR_POP(mpi_errno);
 
             /* Mark that we triggered the progress engine
-               in this function call. */
+             * in this function call. */
             progress_engine_triggered = 1;
         }
     }
@@ -756,35 +762,38 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
         else {
             /* FIXME: do we need to wait for remote completion? */
             mpi_errno = send_decr_at_cnt_msg(dst, win_ptr);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
         }
     }
 
     /* issue out all operations */
     mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     /* wait until all slots are empty */
     do {
-        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
-                                                   &remote_completed);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed, &remote_completed);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
         if (!remote_completed) {
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
 
             /* Mark that we triggered the progress engine
-               in this function call. */
+             * in this function call. */
             progress_engine_triggered = 1;
         }
     } while (!remote_completed);
 
     /* Cleanup all targets on this window. */
     mpi_errno = MPIDI_CH3I_RMA_Cleanup_targets_win(win_ptr);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
- finish_complete:
+  finish_complete:
     /* Set window access state properly. */
     win_ptr->states.access_state = MPIDI_RMA_NONE;
 
@@ -803,12 +812,12 @@ int MPIDI_Win_complete(MPID_Win * win_ptr)
 
     if (!progress_engine_triggered) {
         /* In some cases (e.g. target is myself, or process on SHM),
-           this function call does not go through the progress engine.
-           Therefore, it is possible that this process never process
-           events coming from other processes. This may cause deadlock in
-           applications where the program execution on this process depends
-           on the happening of events from other processes. Here we poke
-           the progress engine once to avoid such issue.  */
+         * this function call does not go through the progress engine.
+         * Therefore, it is possible that this process never process
+         * events coming from other processes. This may cause deadlock in
+         * applications where the program execution on this process depends
+         * on the happening of events from other processes. Here we poke
+         * the progress engine once to avoid such issue.  */
         mpi_errno = poke_progress_engine();
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
@@ -847,22 +856,22 @@ int MPIDI_Win_wait(MPID_Win * win_ptr)
             MPIU_ERR_POP(mpi_errno);
 
         /* Mark that we triggered the progress engine
-           in this function call. */
+         * in this function call. */
         progress_engine_triggered = 1;
     }
 
- finish_wait:
+  finish_wait:
     /* Set window exposure state properly. */
     win_ptr->states.exposure_state = MPIDI_RMA_NONE;
 
     if (!progress_engine_triggered) {
         /* In some cases (e.g. target is myself, or process on SHM),
-           this function call does not go through the progress engine.
-           Therefore, it is possible that this process never process
-           events coming from other processes. This may cause deadlock in
-           applications where the program execution on this process depends
-           on the happening of events from other processes. Here we poke
-           the progress engine once to avoid such issue.  */
+         * this function call does not go through the progress engine.
+         * Therefore, it is possible that this process never process
+         * events coming from other processes. This may cause deadlock in
+         * applications where the program execution on this process depends
+         * on the happening of events from other processes. Here we poke
+         * the progress engine once to avoid such issue.  */
         mpi_errno = poke_progress_engine();
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
@@ -899,7 +908,7 @@ int MPIDI_Win_test(MPID_Win * win_ptr, int *flag)
 
     mpi_errno = MPID_Progress_test();
     if (mpi_errno != MPI_SUCCESS) {
-	MPIU_ERR_POP(mpi_errno);
+        MPIU_ERR_POP(mpi_errno);
     }
 
     *flag = (win_ptr->at_completion_counter) ? 0 : 1;
@@ -946,10 +955,10 @@ int MPIDI_Win_lock(int lock_type, int dest, int assert, MPID_Win * win_ptr)
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_LOCK);
 
     /* Note that here we cannot distinguish if this access epoch is overlapped
-       with an access epoch of FENCE (which is not allowed), since FENCE may be
-       ended up with not unsetting the window state. We can only detect if this
-       access epoch is overlapped with another access epoch of PSCW or Passive
-       Target. */
+     * with an access epoch of FENCE (which is not allowed), since FENCE may be
+     * ended up with not unsetting the window state. We can only detect if this
+     * access epoch is overlapped with another access epoch of PSCW or Passive
+     * Target. */
     if (win_ptr->lock_epoch_count == 0) {
         MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_NONE &&
                             win_ptr->states.access_state != MPIDI_RMA_FENCE_ISSUED &&
@@ -964,7 +973,8 @@ int MPIDI_Win_lock(int lock_type, int dest, int assert, MPID_Win * win_ptr)
     if (dest != MPI_PROC_NULL) {
         /* check if we lock the same target window more than once. */
         mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, dest, &target);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
         MPIU_ERR_CHKANDJUMP(target != NULL, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
     }
 
@@ -989,10 +999,11 @@ int MPIDI_Win_lock(int lock_type, int dest, int assert, MPID_Win * win_ptr)
 
     /* Create a new target. */
     mpi_errno = MPIDI_CH3I_Win_create_target(win_ptr, dest, &target);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     /* Store lock_state (CALLED/ISSUED/GRANTED), lock_type (SHARED/EXCLUSIVE),
-       lock_mode (MODE_NOCHECK). */
+     * lock_mode (MODE_NOCHECK). */
     if (assert & MPI_MODE_NOCHECK)
         target->access_state = MPIDI_RMA_LOCK_GRANTED;
     else
@@ -1001,7 +1012,7 @@ int MPIDI_Win_lock(int lock_type, int dest, int assert, MPID_Win * win_ptr)
     target->lock_mode = assert;
 
     /* If Destination is myself or process on SHM, acquire the lock,
-       wait until lock is granted. */
+     * wait until lock is granted. */
     if (!(assert & MPI_MODE_NOCHECK) && (dest == rank || shm_target)) {
         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, dest, &made_progress);
         if (mpi_errno != MPI_SUCCESS)
@@ -1014,7 +1025,7 @@ int MPIDI_Win_lock(int lock_type, int dest, int assert, MPID_Win * win_ptr)
         }
     }
 
- finish_lock:
+  finish_lock:
     if (win_ptr->lock_epoch_count == 1) {
         /* BEGINNING synchronization: the following counter should be zero. */
         MPIU_Assert(win_ptr->accumulated_ops_cnt == 0);
@@ -1038,7 +1049,7 @@ int MPIDI_Win_lock(int lock_type, int dest, int assert, MPID_Win * win_ptr)
 #define FUNCNAME MPIDI_Win_unlock
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_Win_unlock(int dest, MPID_Win *win_ptr)
+int MPIDI_Win_unlock(int dest, MPID_Win * win_ptr)
 {
     int made_progress = 0;
     int local_completed = 0, remote_completed = 0;
@@ -1084,16 +1095,14 @@ int MPIDI_Win_unlock(int dest, MPID_Win *win_ptr)
     }
 
     /* Issue out all operations. */
-    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, dest,
-                                                    &made_progress);
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, dest, &made_progress);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
     /* Wait for remote completion. */
     do {
         mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, target,
-                                                      &local_completed,
-                                                      &remote_completed);
+                                                      &local_completed, &remote_completed);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
         if (!remote_completed) {
@@ -1102,12 +1111,12 @@ int MPIDI_Win_unlock(int dest, MPID_Win *win_ptr)
                 MPIU_ERR_POP(mpi_errno);
 
             /* Mark that we triggered the progress engine
-               in this function call. */
+             * in this function call. */
             progress_engine_triggered = 1;
         }
     } while (!remote_completed);
 
- finish_unlock:
+  finish_unlock:
     win_ptr->lock_epoch_count--;
     if (win_ptr->lock_epoch_count == 0) {
         /* Set window access state properly. */
@@ -1125,17 +1134,18 @@ int MPIDI_Win_unlock(int dest, MPID_Win *win_ptr)
 
         /* Cleanup the target. */
         mpi_errno = MPIDI_CH3I_RMA_Cleanup_single_target(win_ptr, target);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
     }
 
     if (!progress_engine_triggered) {
         /* In some cases (e.g. target is myself, or process on SHM),
-           this function call does not go through the progress engine.
-           Therefore, it is possible that this process never process
-           events coming from other processes. This may cause deadlock in
-           applications where the program execution on this process depends
-           on the happening of events from other processes. Here we poke
-           the progress engine once to avoid such issue.  */
+         * this function call does not go through the progress engine.
+         * Therefore, it is possible that this process never process
+         * events coming from other processes. This may cause deadlock in
+         * applications where the program execution on this process depends
+         * on the happening of events from other processes. Here we poke
+         * the progress engine once to avoid such issue.  */
         mpi_errno = poke_progress_engine();
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
@@ -1155,7 +1165,7 @@ int MPIDI_Win_unlock(int dest, MPID_Win *win_ptr)
 #define FUNCNAME MPIDI_Win_flush
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_Win_flush(int dest, MPID_Win *win_ptr)
+int MPIDI_Win_flush(int dest, MPID_Win * win_ptr)
 {
     int made_progress = 0;
     int local_completed = 0, remote_completed = 0;
@@ -1206,16 +1216,14 @@ int MPIDI_Win_flush(int dest, MPID_Win *win_ptr)
     }
 
     /* Issue out all operations. */
-    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, dest,
-                                                    &made_progress);
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, dest, &made_progress);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
     /* Wait for remote completion. */
     do {
         mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, target,
-                                                      &local_completed,
-                                                      &remote_completed);
+                                                      &local_completed, &remote_completed);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
         if (!remote_completed) {
@@ -1224,12 +1232,12 @@ int MPIDI_Win_flush(int dest, MPID_Win *win_ptr)
                 MPIU_ERR_POP(mpi_errno);
 
             /* Mark that we triggered the progress engine
-               in this function call. */
+             * in this function call. */
             progress_engine_triggered = 1;
         }
     } while (!remote_completed);
 
- finish_flush:
+  finish_flush:
     if (target != NULL) {
         /* ENDING synchronization: correctly decrement the following counters. */
         win_ptr->accumulated_ops_cnt -= target->accumulated_ops_cnt;
@@ -1238,12 +1246,12 @@ int MPIDI_Win_flush(int dest, MPID_Win *win_ptr)
 
     if (!progress_engine_triggered) {
         /* In some cases (e.g. target is myself, or process on SHM),
-           this function call does not go through the progress engine.
-           Therefore, it is possible that this process never process
-           events coming from other processes. This may cause deadlock in
-           applications where the program execution on this process depends
-           on the happening of events from other processes. Here we poke
-           the progress engine once to avoid such issue.  */
+         * this function call does not go through the progress engine.
+         * Therefore, it is possible that this process never process
+         * events coming from other processes. This may cause deadlock in
+         * applications where the program execution on this process depends
+         * on the happening of events from other processes. Here we poke
+         * the progress engine once to avoid such issue.  */
         mpi_errno = poke_progress_engine();
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
@@ -1320,16 +1328,14 @@ int MPIDI_Win_flush_local(int dest, MPID_Win * win_ptr)
     }
 
     /* Issue out all operations. */
-    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, dest,
-                                                    &made_progress);
+    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, dest, &made_progress);
     if (mpi_errno != MPI_SUCCESS)
         MPIU_ERR_POP(mpi_errno);
 
     /* Wait for local completion. */
     do {
         mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, target,
-                                                      &local_completed,
-                                                      &remote_completed);
+                                                      &local_completed, &remote_completed);
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
         if ((target->disable_flush_local && !remote_completed) ||
@@ -1339,13 +1345,13 @@ int MPIDI_Win_flush_local(int dest, MPID_Win * win_ptr)
                 MPIU_ERR_POP(mpi_errno);
 
             /* Mark that we triggered the progress engine
-               in this function call. */
+             * in this function call. */
             progress_engine_triggered = 1;
         }
     } while ((target->disable_flush_local && !remote_completed) ||
              (!target->disable_flush_local && !local_completed));
 
- finish_flush_local:
+  finish_flush_local:
     if (target != NULL) {
         /* reset disable_flush_local flag in target to 0 */
         target->disable_flush_local = 0;
@@ -1357,12 +1363,12 @@ int MPIDI_Win_flush_local(int dest, MPID_Win * win_ptr)
 
     if (!progress_engine_triggered) {
         /* In some cases (e.g. target is myself, or process on SHM),
-           this function call does not go through the progress engine.
-           Therefore, it is possible that this process never process
-           events coming from other processes. This may cause deadlock in
-           applications where the program execution on this process depends
-           on the happening of events from other processes. Here we poke
-           the progress engine once to avoid such issue.  */
+         * this function call does not go through the progress engine.
+         * Therefore, it is possible that this process never process
+         * events coming from other processes. This may cause deadlock in
+         * applications where the program execution on this process depends
+         * on the happening of events from other processes. Here we poke
+         * the progress engine once to avoid such issue.  */
         mpi_errno = poke_progress_engine();
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
@@ -1391,10 +1397,10 @@ int MPIDI_Win_lock_all(int assert, MPID_Win * win_ptr)
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_LOCK_ALL);
 
     /* Note that here we cannot distinguish if this access epoch is overlapped
-       with an access epoch of FENCE (which is not allowed), since FENCE may be
-       ended up with not unsetting the window state. We can only detect if this
-       access epoch is overlapped with another access epoch of PSCW or Passive
-       Target. */
+     * with an access epoch of FENCE (which is not allowed), since FENCE may be
+     * ended up with not unsetting the window state. We can only detect if this
+     * access epoch is overlapped with another access epoch of PSCW or Passive
+     * Target. */
     MPIU_ERR_CHKANDJUMP(win_ptr->states.access_state != MPIDI_RMA_NONE &&
                         win_ptr->states.access_state != MPIDI_RMA_FENCE_ISSUED &&
                         win_ptr->states.access_state != MPIDI_RMA_FENCE_GRANTED,
@@ -1412,7 +1418,7 @@ int MPIDI_Win_lock_all(int assert, MPID_Win * win_ptr)
     MPIU_Assert(win_ptr->outstanding_locks == 0);
 
     /* Acquire the lock on myself and the lock on processes on SHM.
-       No need to create a target for them. */
+     * No need to create a target for them. */
     if (!(win_ptr->lock_all_assert & MPI_MODE_NOCHECK)) {
         win_ptr->outstanding_locks++;
         mpi_errno = acquire_local_lock(win_ptr, MPI_LOCK_SHARED);
@@ -1443,7 +1449,7 @@ int MPIDI_Win_lock_all(int assert, MPID_Win * win_ptr)
         }
     }
 
- finish_lock_all:
+  finish_lock_all:
     /* BEGINNING synchronization: the following counter should be zero. */
     MPIU_Assert(win_ptr->accumulated_ops_cnt == 0);
 
@@ -1471,7 +1477,7 @@ int MPIDI_Win_lock_all(int assert, MPID_Win * win_ptr)
 int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
 {
     int i, made_progress = 0;
-    int local_completed = 0,remote_completed = 0;
+    int local_completed = 0, remote_completed = 0;
     int rank = win_ptr->comm_ptr->rank;
     MPIDI_RMA_Target_t *curr_target = NULL;
     int progress_engine_triggered = 0;
@@ -1501,7 +1507,8 @@ int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
             MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
             MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
             for (i = 0; i < win_ptr->comm_ptr->local_size; i++) {
-                if (i == rank) continue;
+                if (i == rank)
+                    continue;
                 MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
                 if (orig_vc->node_id == target_vc->node_id) {
                     mpi_errno = send_unlock_msg(i, win_ptr, MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_NO_ACK);
@@ -1575,25 +1582,26 @@ int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
 
     /* Wait for remote completion. */
     do {
-        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
-                                                   &remote_completed);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed, &remote_completed);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
         if (!remote_completed) {
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
 
             /* Mark that we triggered the progress engine
-               in this function call. */
+             * in this function call. */
             progress_engine_triggered = 1;
         }
     } while (!remote_completed);
 
     /* Cleanup all targets on this window. */
     mpi_errno = MPIDI_CH3I_RMA_Cleanup_targets_win(win_ptr);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
- finish_unlock_all:
+  finish_unlock_all:
     /* Set window access state properly. */
     win_ptr->states.access_state = MPIDI_RMA_NONE;
     num_passive_win--;
@@ -1612,12 +1620,12 @@ int MPIDI_Win_unlock_all(MPID_Win * win_ptr)
 
     if (!progress_engine_triggered) {
         /* In some cases (e.g. target is myself, or process on SHM),
-           this function call does not go through the progress engine.
-           Therefore, it is possible that this process never process
-           events coming from other processes. This may cause deadlock in
-           applications where the program execution on this process depends
-           on the happening of events from other processes. Here we poke
-           the progress engine once to avoid such issue.  */
+         * this function call does not go through the progress engine.
+         * Therefore, it is possible that this process never process
+         * events coming from other processes. This may cause deadlock in
+         * applications where the program execution on this process depends
+         * on the happening of events from other processes. Here we poke
+         * the progress engine once to avoid such issue.  */
         mpi_errno = poke_progress_engine();
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
@@ -1683,21 +1691,21 @@ int MPIDI_Win_flush_all(MPID_Win * win_ptr)
 
     /* Wait for remote completion. */
     do {
-        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed,
-                                                   &remote_completed);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_win(win_ptr, &local_completed, &remote_completed);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
         if (!remote_completed) {
             mpi_errno = wait_progress_engine();
             if (mpi_errno != MPI_SUCCESS)
                 MPIU_ERR_POP(mpi_errno);
 
             /* Mark that we triggered the progress engine
-               in this function call. */
+             * in this function call. */
             progress_engine_triggered = 1;
         }
     } while (!remote_completed);
 
- finish_flush_all:
+  finish_flush_all:
     /* ENDING synchronization: correctly decrement the following counter. */
     win_ptr->accumulated_ops_cnt = 0;
 
@@ -1705,12 +1713,12 @@ int MPIDI_Win_flush_all(MPID_Win * win_ptr)
 
     if (!progress_engine_triggered) {
         /* In some cases (e.g. target is myself, or process on SHM),
-           this function call does not go through the progress engine.
-           Therefore, it is possible that this process never process
-           events coming from other processes. This may cause deadlock in
-           applications where the program execution on this process depends
-           on the happening of events from other processes. Here we poke
-           the progress engine once to avoid such issue.  */
+         * this function call does not go through the progress engine.
+         * Therefore, it is possible that this process never process
+         * events coming from other processes. This may cause deadlock in
+         * applications where the program execution on this process depends
+         * on the happening of events from other processes. Here we poke
+         * the progress engine once to avoid such issue.  */
         mpi_errno = poke_progress_engine();
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
@@ -1782,7 +1790,8 @@ int MPIDI_Win_flush_local_all(MPID_Win * win_ptr)
 
     /* issue out all operations. */
     mpi_errno = MPIDI_CH3I_RMA_Make_progress_win(win_ptr, &made_progress);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     /* wait for remote completion for those targets that disable flush_local,
      * and wait for local completion for other targets */
@@ -1791,8 +1800,7 @@ int MPIDI_Win_flush_local_all(MPID_Win * win_ptr)
             curr_target = win_ptr->slots[i].target_list;
             while (curr_target != NULL) {
                 mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target,
-                                                              &local_completed,
-                                                              &remote_completed);
+                                                              &local_completed, &remote_completed);
                 if (mpi_errno != MPI_SUCCESS)
                     MPIU_ERR_POP(mpi_errno);
 
@@ -1817,7 +1825,7 @@ int MPIDI_Win_flush_local_all(MPID_Win * win_ptr)
                 MPIU_ERR_POP(mpi_errno);
 
             /* Mark that we triggered the progress engine
-               in this function call. */
+             * in this function call. */
             progress_engine_triggered = 1;
         }
     } while (remote_completed_cnt < disable_flush_local_cnt ||
@@ -1838,12 +1846,12 @@ int MPIDI_Win_flush_local_all(MPID_Win * win_ptr)
 
     if (!progress_engine_triggered) {
         /* In some cases (e.g. target is myself, or process on SHM),
-           this function call does not go through the progress engine.
-           Therefore, it is possible that this process never process
-           events coming from other processes. This may cause deadlock in
-           applications where the program execution on this process depends
-           on the happening of events from other processes. Here we poke
-           the progress engine once to avoid such issue.  */
+         * this function call does not go through the progress engine.
+         * Therefore, it is possible that this process never process
+         * events coming from other processes. This may cause deadlock in
+         * applications where the program execution on this process depends
+         * on the happening of events from other processes. Here we poke
+         * the progress engine once to avoid such issue.  */
         mpi_errno = poke_progress_engine();
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
diff --git a/src/mpid/ch3/src/ch3u_win_fns.c b/src/mpid/ch3/src/ch3u_win_fns.c
index 166915b..a2aa76a 100644
--- a/src/mpid/ch3/src/ch3u_win_fns.c
+++ b/src/mpid/ch3/src/ch3u_win_fns.c
@@ -14,17 +14,17 @@ MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_wincreate_allgather);
 #define FUNCNAME MPIDI_Win_fns_init
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_Win_fns_init(MPIDI_CH3U_Win_fns_t *win_fns)
+int MPIDI_Win_fns_init(MPIDI_CH3U_Win_fns_t * win_fns)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FNS_INIT);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FNS_INIT);
 
-    win_fns->create             = MPIDI_CH3U_Win_create;
-    win_fns->allocate           = MPIDI_CH3U_Win_allocate;
-    win_fns->allocate_shared    = MPIDI_CH3U_Win_allocate;
-    win_fns->create_dynamic     = MPIDI_CH3U_Win_create_dynamic;
+    win_fns->create = MPIDI_CH3U_Win_create;
+    win_fns->allocate = MPIDI_CH3U_Win_allocate;
+    win_fns->allocate_shared = MPIDI_CH3U_Win_allocate;
+    win_fns->create_dynamic = MPIDI_CH3U_Win_create_dynamic;
 
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FNS_INIT);
 
@@ -36,10 +36,10 @@ int MPIDI_Win_fns_init(MPIDI_CH3U_Win_fns_t *win_fns)
 #define FUNCNAME MPIDI_CH3U_Win_create_gather
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3U_Win_create_gather( void *base, MPI_Aint size, int disp_unit,
-                                  MPID_Info *info, MPID_Comm *comm_ptr, MPID_Win **win_ptr )
+int MPIDI_CH3U_Win_create_gather(void *base, MPI_Aint size, int disp_unit,
+                                 MPID_Info * info, MPID_Comm * comm_ptr, MPID_Win ** win_ptr)
 {
-    int mpi_errno=MPI_SUCCESS, i, k, comm_size, rank;
+    int mpi_errno = MPI_SUCCESS, i, k, comm_size, rank;
     MPI_Aint *tmp_buf;
     mpir_errflag_t errflag = MPIR_ERR_NONE;
     MPIU_CHKPMEM_DECL(5);
@@ -49,60 +49,58 @@ int MPIDI_CH3U_Win_create_gather( void *base, MPI_Aint size, int disp_unit,
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE_GATHER);
 
     comm_size = (*win_ptr)->comm_ptr->local_size;
-    rank      = (*win_ptr)->comm_ptr->rank;
+    rank = (*win_ptr)->comm_ptr->rank;
 
     MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
     /* allocate memory for the base addresses, disp_units, and
-       completion counters of all processes */
+     * completion counters of all processes */
     MPIU_CHKPMEM_MALLOC((*win_ptr)->base_addrs, void **,
-                        comm_size*sizeof(void *),
-                        mpi_errno, "(*win_ptr)->base_addrs");
+                        comm_size * sizeof(void *), mpi_errno, "(*win_ptr)->base_addrs");
 
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->sizes, MPI_Aint *, comm_size*sizeof(MPI_Aint),
+    MPIU_CHKPMEM_MALLOC((*win_ptr)->sizes, MPI_Aint *, comm_size * sizeof(MPI_Aint),
                         mpi_errno, "(*win_ptr)->sizes");
 
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->disp_units, int *, comm_size*sizeof(int),
+    MPIU_CHKPMEM_MALLOC((*win_ptr)->disp_units, int *, comm_size * sizeof(int),
                         mpi_errno, "(*win_ptr)->disp_units");
 
     MPIU_CHKPMEM_MALLOC((*win_ptr)->all_win_handles, MPI_Win *,
-                        comm_size*sizeof(MPI_Win),
-                        mpi_errno, "(*win_ptr)->all_win_handles");
+                        comm_size * sizeof(MPI_Win), mpi_errno, "(*win_ptr)->all_win_handles");
 
     /* get the addresses of the windows, window objects, and completion
-       counters of all processes.  allocate temp. buffer for communication */
-    MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4*comm_size*sizeof(MPI_Aint),
+     * counters of all processes.  allocate temp. buffer for communication */
+    MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * comm_size * sizeof(MPI_Aint),
                         mpi_errno, "tmp_buf");
 
     /* FIXME: This needs to be fixed for heterogeneous systems */
     /* FIXME: If we wanted to validate the transfer as within range at the
-       origin, we'd also need the window size. */
-    tmp_buf[4*rank]   = MPIU_PtrToAint(base);
-    tmp_buf[4*rank+1] = size;
-    tmp_buf[4*rank+2] = (MPI_Aint) disp_unit;
-    tmp_buf[4*rank+3] = (MPI_Aint) (*win_ptr)->handle;
+     * origin, we'd also need the window size. */
+    tmp_buf[4 * rank] = MPIU_PtrToAint(base);
+    tmp_buf[4 * rank + 1] = size;
+    tmp_buf[4 * rank + 2] = (MPI_Aint) disp_unit;
+    tmp_buf[4 * rank + 3] = (MPI_Aint) (*win_ptr)->handle;
 
     mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
-                                    tmp_buf, 4, MPI_AINT,
-                                    (*win_ptr)->comm_ptr, &errflag);
+                                    tmp_buf, 4, MPI_AINT, (*win_ptr)->comm_ptr, &errflag);
     MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
-    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+    if (mpi_errno) {
+        MPIU_ERR_POP(mpi_errno);
+    }
     MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
     k = 0;
-    for (i=0; i<comm_size; i++)
-    {
+    for (i = 0; i < comm_size; i++) {
         (*win_ptr)->base_addrs[i] = MPIU_AintToPtr(tmp_buf[k++]);
         (*win_ptr)->sizes[i] = tmp_buf[k++];
         (*win_ptr)->disp_units[i] = (int) tmp_buf[k++];
         (*win_ptr)->all_win_handles[i] = (MPI_Win) tmp_buf[k++];
     }
 
-fn_exit:
+  fn_exit:
     MPIU_CHKLMEM_FREEALL();
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_CREATE_GATHER);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
-fn_fail:
+  fn_fail:
     MPIU_CHKPMEM_REAP();
     goto fn_exit;
     /* --END ERROR HANDLING-- */
@@ -113,8 +111,8 @@ fn_fail:
 #define FUNCNAME MPIDI_CH3U_Win_create
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3U_Win_create(void *base, MPI_Aint size, int disp_unit, MPID_Info *info,
-                         MPID_Comm *comm_ptr, MPID_Win **win_ptr )
+int MPIDI_CH3U_Win_create(void *base, MPI_Aint size, int disp_unit, MPID_Info * info,
+                          MPID_Comm * comm_ptr, MPID_Win ** win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_CREATE);
@@ -122,22 +120,24 @@ int MPIDI_CH3U_Win_create(void *base, MPI_Aint size, int disp_unit, MPID_Info *i
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE);
 
     mpi_errno = MPIDI_CH3U_Win_create_gather(base, size, disp_unit, info, comm_ptr, win_ptr);
-    if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIU_ERR_POP(mpi_errno);
+    }
 
-    if ((*win_ptr)->info_args.alloc_shm == TRUE
-            && MPIDI_CH3U_Win_fns.detect_shm != NULL) {
+    if ((*win_ptr)->info_args.alloc_shm == TRUE && MPIDI_CH3U_Win_fns.detect_shm != NULL) {
         /* Detect if shared buffers are specified for the processes in the
          * current node. If so, enable shm RMA.*/
         mpi_errno = MPIDI_CH3U_Win_fns.detect_shm(win_ptr);
-        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+        if (mpi_errno != MPI_SUCCESS)
+            MPIU_ERR_POP(mpi_errno);
         goto fn_exit;
     }
 
-fn_exit:
+  fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_CREATE);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
-fn_fail:
+  fn_fail:
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -147,7 +147,7 @@ fn_fail:
 #define FUNCNAME MPIDI_CH3U_Win_create_dynamic
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3U_Win_create_dynamic(MPID_Info *info, MPID_Comm *comm_ptr, MPID_Win **win_ptr )
+int MPIDI_CH3U_Win_create_dynamic(MPID_Info * info, MPID_Comm * comm_ptr, MPID_Win ** win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_CREATE_DYNAMIC);
@@ -155,13 +155,15 @@ int MPIDI_CH3U_Win_create_dynamic(MPID_Info *info, MPID_Comm *comm_ptr, MPID_Win
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE_DYNAMIC);
 
     mpi_errno = MPIDI_CH3U_Win_create_gather(MPI_BOTTOM, 0, 1, info, comm_ptr, win_ptr);
-    if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIU_ERR_POP(mpi_errno);
+    }
 
-fn_exit:
+  fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_CREATE_DYNAMIC);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
-fn_fail:
+  fn_fail:
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -171,7 +173,7 @@ fn_fail:
 #define FUNCNAME MPIDI_Win_attach
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_Win_attach(MPID_Win *win, void *base, MPI_Aint size)
+int MPIDI_Win_attach(MPID_Win * win, void *base, MPI_Aint size)
 {
     int mpi_errno = MPI_SUCCESS;
 
@@ -180,11 +182,11 @@ int MPIDI_Win_attach(MPID_Win *win, void *base, MPI_Aint size)
 
     /* no op, all of memory is exposed */
 
- fn_exit:
+  fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_ATTACH);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
- fn_fail:
+  fn_fail:
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -194,7 +196,7 @@ int MPIDI_Win_attach(MPID_Win *win, void *base, MPI_Aint size)
 #define FUNCNAME MPIDI_Win_detach
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_Win_detach(MPID_Win *win, const void *base)
+int MPIDI_Win_detach(MPID_Win * win, const void *base)
 {
     int mpi_errno = MPI_SUCCESS;
 
@@ -203,11 +205,11 @@ int MPIDI_Win_detach(MPID_Win *win, const void *base)
 
     /* no op, all of memory is exposed */
 
- fn_exit:
+  fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_DETACH);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
- fn_fail:
+  fn_fail:
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -217,8 +219,8 @@ int MPIDI_Win_detach(MPID_Win *win, const void *base)
 #define FUNCNAME MPIDI_CH3U_Win_allocate
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3U_Win_allocate(MPI_Aint size, int disp_unit, MPID_Info *info,
-                            MPID_Comm *comm_ptr, void *baseptr, MPID_Win **win_ptr)
+int MPIDI_CH3U_Win_allocate(MPI_Aint size, int disp_unit, MPID_Info * info,
+                            MPID_Comm * comm_ptr, void *baseptr, MPID_Win ** win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE);
@@ -227,19 +229,22 @@ int MPIDI_CH3U_Win_allocate(MPI_Aint size, int disp_unit, MPID_Info *info,
 
     if ((*win_ptr)->info_args.alloc_shm == TRUE) {
         if (MPIDI_CH3U_Win_fns.allocate_shm != NULL) {
-            mpi_errno = MPIDI_CH3U_Win_fns.allocate_shm(size, disp_unit, info, comm_ptr, baseptr, win_ptr);
-            if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+            mpi_errno =
+                MPIDI_CH3U_Win_fns.allocate_shm(size, disp_unit, info, comm_ptr, baseptr, win_ptr);
+            if (mpi_errno != MPI_SUCCESS)
+                MPIU_ERR_POP(mpi_errno);
             goto fn_exit;
         }
     }
 
     mpi_errno = MPIDI_CH3U_Win_allocate_no_shm(size, disp_unit, info, comm_ptr, baseptr, win_ptr);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
- fn_exit:
+  fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE);
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -248,8 +253,8 @@ int MPIDI_CH3U_Win_allocate(MPI_Aint size, int disp_unit, MPID_Info *info,
 #define FUNCNAME MPIDI_CH3U_Win_allocate_no_shm
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_CH3U_Win_allocate_no_shm(MPI_Aint size, int disp_unit, MPID_Info *info,
-                                   MPID_Comm *comm_ptr, void *baseptr, MPID_Win **win_ptr )
+int MPIDI_CH3U_Win_allocate_no_shm(MPI_Aint size, int disp_unit, MPID_Info * info,
+                                   MPID_Comm * comm_ptr, void *baseptr, MPID_Win ** win_ptr)
 {
     void **base_pp = (void **) baseptr;
     int mpi_errno = MPI_SUCCESS;
@@ -260,22 +265,26 @@ int MPIDI_CH3U_Win_allocate_no_shm(MPI_Aint size, int disp_unit, MPID_Info *info
 
     if (size > 0) {
         MPIU_CHKPMEM_MALLOC(*base_pp, void *, size, mpi_errno, "(*win_ptr)->base");
-    } else if (size == 0) {
+    }
+    else if (size == 0) {
         *base_pp = NULL;
-    } else {
+    }
+    else {
         MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_SIZE, "**rmasize");
     }
 
     (*win_ptr)->base = *base_pp;
 
     mpi_errno = MPIDI_CH3U_Win_create_gather(*base_pp, size, disp_unit, info, comm_ptr, win_ptr);
-    if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIU_ERR_POP(mpi_errno);
+    }
 
-fn_exit:
+  fn_exit:
     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE_NO_SHM);
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
-fn_fail:
+  fn_fail:
     MPIU_CHKPMEM_REAP();
     goto fn_exit;
     /* --END ERROR HANDLING-- */
@@ -286,7 +295,7 @@ fn_fail:
 #define FUNCNAME MPIDI_Win_set_info
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_Win_set_info(MPID_Win *win, MPID_Info *info)
+int MPIDI_Win_set_info(MPID_Win * win, MPID_Info * info)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_SET_INFO);
@@ -299,9 +308,8 @@ int MPIDI_Win_set_info(MPID_Win *win, MPID_Info *info)
 
     if (info != NULL) {
         int info_flag = 0;
-        char info_value[MPI_MAX_INFO_VAL+1];
-        MPIR_Info_get_impl(info, "no_locks", MPI_MAX_INFO_VAL,
-                           info_value, &info_flag);
+        char info_value[MPI_MAX_INFO_VAL + 1];
+        MPIR_Info_get_impl(info, "no_locks", MPI_MAX_INFO_VAL, info_value, &info_flag);
         if (info_flag) {
             if (!strncmp(info_value, "true", strlen("true")))
                 win->info_args.no_locks = 1;
@@ -322,9 +330,8 @@ int MPIDI_Win_set_info(MPID_Win *win, MPID_Info *info)
 
     if (info != NULL) {
         int info_flag = 0;
-        char info_value[MPI_MAX_INFO_VAL+1];
-        MPIR_Info_get_impl(info, "alloc_shm", MPI_MAX_INFO_VAL,
-                           info_value, &info_flag);
+        char info_value[MPI_MAX_INFO_VAL + 1];
+        MPIR_Info_get_impl(info, "alloc_shm", MPI_MAX_INFO_VAL, info_value, &info_flag);
         if (info_flag) {
             if (!strncmp(info_value, "true", sizeof("true")))
                 win->info_args.alloc_shm = TRUE;
@@ -344,7 +351,7 @@ int MPIDI_Win_set_info(MPID_Win *win, MPID_Info *info)
         win->info_args.alloc_shared_noncontig = 1;
     if (info != NULL) {
         int info_flag = 0;
-        char info_value[MPI_MAX_INFO_VAL+1];
+        char info_value[MPI_MAX_INFO_VAL + 1];
         MPIR_Info_get_impl(info, "alloc_shared_noncontig", MPI_MAX_INFO_VAL,
                            info_value, &info_flag);
         if (info_flag) {
@@ -355,10 +362,10 @@ int MPIDI_Win_set_info(MPID_Win *win, MPID_Info *info)
         }
     }
 
- fn_exit:
+  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_WIN_SET_INFO);
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
 
@@ -367,7 +374,7 @@ int MPIDI_Win_set_info(MPID_Win *win, MPID_Info *info)
 #define FUNCNAME MPIDI_Win_get_info
 #undef FCNAME
 #define FCNAME MPIDI_QUOTE(FUNCNAME)
-int MPIDI_Win_get_info(MPID_Win *win, MPID_Info **info_used)
+int MPIDI_Win_get_info(MPID_Win * win, MPID_Info ** info_used)
 {
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_GET_INFO);
@@ -376,7 +383,9 @@ int MPIDI_Win_get_info(MPID_Win *win, MPID_Info **info_used)
 
     /* Allocate an empty info object */
     mpi_errno = MPIU_Info_alloc(info_used);
-    if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIU_ERR_POP(mpi_errno);
+    }
 
     /* Populate the predefined info keys */
     if (win->info_args.no_locks)
@@ -384,23 +393,27 @@ int MPIDI_Win_get_info(MPID_Win *win, MPID_Info **info_used)
     else
         mpi_errno = MPIR_Info_set_impl(*info_used, "no_locks", "false");
 
-    if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIU_ERR_POP(mpi_errno);
+    }
 
     {
 #define BUFSIZE 32
         char buf[BUFSIZE];
         int c = 0;
         if (win->info_args.accumulate_ordering & MPIDI_ACC_ORDER_RAR)
-            c += snprintf(buf+c, BUFSIZE-c, "%srar", (c > 0) ? "," : "");
+            c += snprintf(buf + c, BUFSIZE - c, "%srar", (c > 0) ? "," : "");
         if (win->info_args.accumulate_ordering & MPIDI_ACC_ORDER_RAW)
-            c += snprintf(buf+c, BUFSIZE-c, "%sraw", (c > 0) ? "," : "");
+            c += snprintf(buf + c, BUFSIZE - c, "%sraw", (c > 0) ? "," : "");
         if (win->info_args.accumulate_ordering & MPIDI_ACC_ORDER_WAR)
-            c += snprintf(buf+c, BUFSIZE-c, "%swar", (c > 0) ? "," : "");
+            c += snprintf(buf + c, BUFSIZE - c, "%swar", (c > 0) ? "," : "");
         if (win->info_args.accumulate_ordering & MPIDI_ACC_ORDER_WAW)
-            c += snprintf(buf+c, BUFSIZE-c, "%swaw", (c > 0) ? "," : "");
+            c += snprintf(buf + c, BUFSIZE - c, "%swaw", (c > 0) ? "," : "");
 
         MPIR_Info_set_impl(*info_used, "accumulate_ordering", buf);
-        if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIU_ERR_POP(mpi_errno);
+        }
 #undef BUFSIZE
     }
 
@@ -409,14 +422,18 @@ int MPIDI_Win_get_info(MPID_Win *win, MPID_Info **info_used)
     else
         mpi_errno = MPIR_Info_set_impl(*info_used, "accumulate_ops", "same_op_no_op");
 
-    if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIU_ERR_POP(mpi_errno);
+    }
 
     if (win->info_args.alloc_shm == TRUE)
         mpi_errno = MPIR_Info_set_impl(*info_used, "alloc_shm", "true");
     else
         mpi_errno = MPIR_Info_set_impl(*info_used, "alloc_shm", "false");
 
-    if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
+    if (mpi_errno != MPI_SUCCESS) {
+        MPIU_ERR_POP(mpi_errno);
+    }
 
     if (win->create_flavor == MPI_WIN_FLAVOR_SHARED) {
         if (win->info_args.alloc_shared_noncontig)
@@ -424,7 +441,9 @@ int MPIDI_Win_get_info(MPID_Win *win, MPID_Info **info_used)
         else
             mpi_errno = MPIR_Info_set_impl(*info_used, "alloc_shared_noncontig", "false");
 
-        if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIU_ERR_POP(mpi_errno);
+        }
     }
     else if (win->create_flavor == MPI_WIN_FLAVOR_ALLOCATE) {
         if (win->info_args.same_size)
@@ -432,12 +451,14 @@ int MPIDI_Win_get_info(MPID_Win *win, MPID_Info **info_used)
         else
             mpi_errno = MPIR_Info_set_impl(*info_used, "same_size", "false");
 
-        if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
+        if (mpi_errno != MPI_SUCCESS) {
+            MPIU_ERR_POP(mpi_errno);
+        }
     }
 
- fn_exit:
+  fn_exit:
     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_WIN_GET_INFO);
     return mpi_errno;
- fn_fail:
+  fn_fail:
     goto fn_exit;
 }
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index 0c1a646..28db10d 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -43,7 +43,7 @@ MPIU_THREADSAFE_INIT_DECL(initRMAoptions);
 
 MPIDI_RMA_Win_list_t *MPIDI_RMA_Win_list = NULL, *MPIDI_RMA_Win_list_tail = NULL;
 
-static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model, MPID_Info *info,
+static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model, MPID_Info * info,
                     MPID_Comm * comm_ptr, MPID_Win ** win_ptr);
 
 
@@ -139,7 +139,8 @@ int MPID_Win_allocate(MPI_Aint size, int disp_unit, MPID_Info * info,
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPID_WIN_ALLOCATE);
 
     mpi_errno =
-        win_init(size, disp_unit, MPI_WIN_FLAVOR_ALLOCATE, MPI_WIN_UNIFIED, info, comm_ptr, win_ptr);
+        win_init(size, disp_unit, MPI_WIN_FLAVOR_ALLOCATE, MPI_WIN_UNIFIED, info, comm_ptr,
+                 win_ptr);
     if (mpi_errno != MPI_SUCCESS) {
         MPIU_ERR_POP(mpi_errno);
     }
@@ -261,7 +262,7 @@ int MPID_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Info * info, MPI
 #define FUNCNAME win_init
 #undef FCNAME
 #define FCNAME MPIU_QUOTE(FUNCNAME)
-static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model, MPID_Info *info,
+static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model, MPID_Info * info,
                     MPID_Comm * comm_ptr, MPID_Win ** win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
@@ -341,7 +342,8 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
 
     /* Set info_args on window based on info provided by user */
     mpi_errno = (*win_ptr)->RMAFns.Win_set_info((*win_ptr), info);
-    if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    if (mpi_errno != MPI_SUCCESS)
+        MPIU_ERR_POP(mpi_errno);
 
     MPIU_CHKPMEM_MALLOC((*win_ptr)->op_pool_start, MPIDI_RMA_Op_t *,
                         sizeof(MPIDI_RMA_Op_t) * MPIR_CVAR_CH3_RMA_OP_WIN_POOL_SIZE, mpi_errno,
@@ -350,18 +352,21 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->op_pool_tail = NULL;
     for (i = 0; i < MPIR_CVAR_CH3_RMA_OP_WIN_POOL_SIZE; i++) {
         (*win_ptr)->op_pool_start[i].pool_type = MPIDI_RMA_POOL_WIN;
-        MPL_LL_APPEND((*win_ptr)->op_pool, (*win_ptr)->op_pool_tail, &((*win_ptr)->op_pool_start[i]));
+        MPL_LL_APPEND((*win_ptr)->op_pool, (*win_ptr)->op_pool_tail,
+                      &((*win_ptr)->op_pool_start[i]));
     }
 
-    win_target_pool_size = MPIR_MIN(MPIR_CVAR_CH3_RMA_TARGET_WIN_POOL_SIZE, MPIR_Comm_size(win_comm_ptr));
+    win_target_pool_size =
+        MPIR_MIN(MPIR_CVAR_CH3_RMA_TARGET_WIN_POOL_SIZE, MPIR_Comm_size(win_comm_ptr));
     MPIU_CHKPMEM_MALLOC((*win_ptr)->target_pool_start, MPIDI_RMA_Target_t *,
-                        sizeof(MPIDI_RMA_Target_t) * win_target_pool_size,
-                        mpi_errno, "RMA target pool");
+                        sizeof(MPIDI_RMA_Target_t) * win_target_pool_size, mpi_errno,
+                        "RMA target pool");
     (*win_ptr)->target_pool = NULL;
     (*win_ptr)->target_pool_tail = NULL;
     for (i = 0; i < win_target_pool_size; i++) {
         (*win_ptr)->target_pool_start[i].pool_type = MPIDI_RMA_POOL_WIN;
-        MPL_LL_APPEND((*win_ptr)->target_pool, (*win_ptr)->target_pool_tail, &((*win_ptr)->target_pool_start[i]));
+        MPL_LL_APPEND((*win_ptr)->target_pool, (*win_ptr)->target_pool_tail,
+                      &((*win_ptr)->target_pool_start[i]));
     }
 
     (*win_ptr)->num_slots = MPIR_MIN(MPIR_CVAR_CH3_RMA_SLOTS_SIZE, MPIR_Comm_size(win_comm_ptr));
@@ -373,15 +378,16 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     }
 
     if (!(*win_ptr)->info_args.no_locks) {
-    MPIU_CHKPMEM_MALLOC((*win_ptr)->lock_entry_pool_start, MPIDI_RMA_Lock_entry_t *,
-                        sizeof(MPIDI_RMA_Lock_entry_t) * MPIR_CVAR_CH3_RMA_LOCK_ENTRY_WIN_POOL_SIZE,
-                        mpi_errno, "RMA lock entry pool");
-    (*win_ptr)->lock_entry_pool = NULL;
-    (*win_ptr)->lock_entry_pool_tail = NULL;
-    for (i = 0; i < MPIR_CVAR_CH3_RMA_LOCK_ENTRY_WIN_POOL_SIZE; i++) {
-        MPL_LL_APPEND((*win_ptr)->lock_entry_pool, (*win_ptr)->lock_entry_pool_tail,
-                      &((*win_ptr)->lock_entry_pool_start[i]));
-    }
+        MPIU_CHKPMEM_MALLOC((*win_ptr)->lock_entry_pool_start, MPIDI_RMA_Lock_entry_t *,
+                            sizeof(MPIDI_RMA_Lock_entry_t) *
+                            MPIR_CVAR_CH3_RMA_LOCK_ENTRY_WIN_POOL_SIZE, mpi_errno,
+                            "RMA lock entry pool");
+        (*win_ptr)->lock_entry_pool = NULL;
+        (*win_ptr)->lock_entry_pool_tail = NULL;
+        for (i = 0; i < MPIR_CVAR_CH3_RMA_LOCK_ENTRY_WIN_POOL_SIZE; i++) {
+            MPL_LL_APPEND((*win_ptr)->lock_entry_pool, (*win_ptr)->lock_entry_pool_tail,
+                          &((*win_ptr)->lock_entry_pool_start[i]));
+        }
     }
 
     /* enqueue window into the global list */
diff --git a/src/mpid/ch3/src/mpidi_printf.c b/src/mpid/ch3/src/mpidi_printf.c
index 8b19ae6..6e7e17b 100644
--- a/src/mpid/ch3/src/mpidi_printf.c
+++ b/src/mpid/ch3/src/mpidi_printf.c
@@ -11,11 +11,11 @@
 /* style: allow:vprintf:1 sig:0 */
 /* style: allow:printf:2 sig:0 */
 
-/* FIXME: What are these routines for?  Who uses them?  Why are they different 
+/* FIXME: What are these routines for?  Who uses them?  Why are they different
    from the src/util/dbg routines? */
 
 /*
- * Note on thread safety.  These routines originally used 
+ * Note on thread safety.  These routines originally used
  * MPID_Common_thread_lock/unlock, but that lock was not defined or used
  * consistently with the global mutex approach (now defined as
  * SINGLE_CS_ENTER/EXIT).  As these debugging routines should also
@@ -25,52 +25,48 @@
 
 /* --BEGIN DEBUG-- */
 #undef MPIDI_dbg_printf
-void MPIDI_dbg_printf(int level, char * func, char * fmt, ...)
+void MPIDI_dbg_printf(int level, char *func, char *fmt, ...)
 {
-    /* FIXME: This "unreferenced_arg" is an example of a problem with the 
-       API (unneeded level argument) or the code (failure to check the 
-       level argument).  Inserting these "unreference_arg" macros erroneously
-       suggests that the code is correct with this ununsed argument, and thus
-       commits the grave harm of obscuring a real problem */
+    /* FIXME: This "unreferenced_arg" is an example of a problem with the
+     * API (unneeded level argument) or the code (failure to check the
+     * level argument).  Inserting these "unreference_arg" macros erroneously
+     * suggests that the code is correct with this ununsed argument, and thus
+     * commits the grave harm of obscuring a real problem */
     MPIU_UNREFERENCED_ARG(level);
     {
-	va_list list;
+        va_list list;
 
-	if (MPIR_Process.comm_world)
-	{
-	    MPIU_dbglog_printf("[%d] %s(): ", MPIR_Process.comm_world->rank, func);
-	}
-	else
-	{
-	    MPIU_dbglog_printf("[-1] %s(): ", func);
-	}
-	va_start(list, fmt);
-	MPIU_dbglog_vprintf(fmt, list);
-	va_end(list);
-	MPIU_dbglog_printf("\n");
-	fflush(stdout);
+        if (MPIR_Process.comm_world) {
+            MPIU_dbglog_printf("[%d] %s(): ", MPIR_Process.comm_world->rank, func);
+        }
+        else {
+            MPIU_dbglog_printf("[-1] %s(): ", func);
+        }
+        va_start(list, fmt);
+        MPIU_dbglog_vprintf(fmt, list);
+        va_end(list);
+        MPIU_dbglog_printf("\n");
+        fflush(stdout);
     }
 }
 
 #undef MPIDI_err_printf
-void MPIDI_err_printf(char * func, char * fmt, ...)
+void MPIDI_err_printf(char *func, char *fmt, ...)
 {
     {
-	va_list list;
+        va_list list;
 
-	if (MPIR_Process.comm_world)
-	{
-	    printf("[%d] ERROR - %s(): ", MPIR_Process.comm_world->rank, func);
-	}
-	else
-	{
-	    printf("[-1] ERROR - %s(): ", func);
-	}
-	va_start(list, fmt);
-	vprintf(fmt, list);
-	va_end(list);
-	printf("\n");
-	fflush(stdout);
+        if (MPIR_Process.comm_world) {
+            printf("[%d] ERROR - %s(): ", MPIR_Process.comm_world->rank, func);
+        }
+        else {
+            printf("[-1] ERROR - %s(): ", func);
+        }
+        va_start(list, fmt);
+        vprintf(fmt, list);
+        va_end(list);
+        printf("\n");
+        fflush(stdout);
     }
 }
 
@@ -81,7 +77,7 @@ void MPIDI_err_printf(char * func, char * fmt, ...)
    packet type, could be used.
    Also, these routines should not use MPIU_DBG_PRINTF, instead they should
    us a simple fprintf with a style allowance (so that the style checker
-   won't flag the use as a possible problem).  
+   won't flag the use as a possible problem).
 
    This should switch to using a table of functions
 
@@ -90,274 +86,248 @@ void MPIDI_err_printf(char * func, char * fmt, ...)
 */
 
 #ifdef MPICH_DBG_OUTPUT
-void MPIDI_DBG_Print_packet(MPIDI_CH3_Pkt_t *pkt)
+void MPIDI_DBG_Print_packet(MPIDI_CH3_Pkt_t * pkt)
 {
     {
-	MPIU_DBG_PRINTF(("MPIDI_CH3_Pkt_t:\n"));
-	switch(pkt->type)
-	{
-	    case MPIDI_CH3_PKT_EAGER_SEND:
-		MPIDI_CH3_PktPrint_EagerSend( stdout, pkt );
-		break;
-	    case MPIDI_CH3_PKT_READY_SEND:
-		MPIDI_CH3_PktPrint_ReadySend( stdout, pkt );
-		break;
+        MPIU_DBG_PRINTF(("MPIDI_CH3_Pkt_t:\n"));
+        switch (pkt->type) {
+        case MPIDI_CH3_PKT_EAGER_SEND:
+            MPIDI_CH3_PktPrint_EagerSend(stdout, pkt);
+            break;
+        case MPIDI_CH3_PKT_READY_SEND:
+            MPIDI_CH3_PktPrint_ReadySend(stdout, pkt);
+            break;
 
-	    case MPIDI_CH3_PKT_EAGER_SYNC_SEND:
-		MPIDI_CH3_PktPrint_EagerSyncSend( stdout, pkt );
-		break;
-	    case MPIDI_CH3_PKT_EAGER_SYNC_ACK:
-		MPIDI_CH3_PktPrint_EagerSyncAck( stdout, pkt );
-		break;
+        case MPIDI_CH3_PKT_EAGER_SYNC_SEND:
+            MPIDI_CH3_PktPrint_EagerSyncSend(stdout, pkt);
+            break;
+        case MPIDI_CH3_PKT_EAGER_SYNC_ACK:
+            MPIDI_CH3_PktPrint_EagerSyncAck(stdout, pkt);
+            break;
 
-	    case MPIDI_CH3_PKT_RNDV_REQ_TO_SEND:
-		MPIDI_CH3_PktPrint_RndvReqToSend( stdout, pkt );
-		break;
-	    case MPIDI_CH3_PKT_RNDV_CLR_TO_SEND:
-		MPIDI_CH3_PktPrint_RndvClrToSend( stdout, pkt );
-		break;
-	    case MPIDI_CH3_PKT_RNDV_SEND:
-		MPIDI_CH3_PktPrint_RndvSend( stdout, pkt );
-		break;
+        case MPIDI_CH3_PKT_RNDV_REQ_TO_SEND:
+            MPIDI_CH3_PktPrint_RndvReqToSend(stdout, pkt);
+            break;
+        case MPIDI_CH3_PKT_RNDV_CLR_TO_SEND:
+            MPIDI_CH3_PktPrint_RndvClrToSend(stdout, pkt);
+            break;
+        case MPIDI_CH3_PKT_RNDV_SEND:
+            MPIDI_CH3_PktPrint_RndvSend(stdout, pkt);
+            break;
 
-	    case MPIDI_CH3_PKT_CANCEL_SEND_REQ:
-		MPIDI_CH3_PktPrint_CancelSendReq( stdout, pkt );
-		break;
-	    case MPIDI_CH3_PKT_CANCEL_SEND_RESP:
-		MPIDI_CH3_PktPrint_CancelSendResp( stdout, pkt );
-		break;
+        case MPIDI_CH3_PKT_CANCEL_SEND_REQ:
+            MPIDI_CH3_PktPrint_CancelSendReq(stdout, pkt);
+            break;
+        case MPIDI_CH3_PKT_CANCEL_SEND_RESP:
+            MPIDI_CH3_PktPrint_CancelSendResp(stdout, pkt);
+            break;
 
-		/* FIXME: Move these RMA descriptions into the RMA code files */
-	    case MPIDI_CH3_PKT_PUT:
-		MPIDI_CH3_PktPrint_Put( stdout, pkt );
-		break;
-	    case MPIDI_CH3_PKT_GET:
-		MPIDI_CH3_PktPrint_Get( stdout, pkt );
-		break;
-	    case MPIDI_CH3_PKT_GET_RESP:
-		MPIDI_CH3_PktPrint_GetResp( stdout, pkt );
-		break;
-	    case MPIDI_CH3_PKT_ACCUMULATE:
-		MPIDI_CH3_PktPrint_Accumulate( stdout, pkt );
-		break;
-	    case MPIDI_CH3_PKT_LOCK:
-		MPIDI_CH3_PktPrint_Lock( stdout, pkt );
-		break;
-	    case MPIDI_CH3_PKT_FLUSH_ACK:
-		MPIDI_CH3_PktPrint_FlushAck( stdout, pkt );
-		break;
-	    case MPIDI_CH3_PKT_LOCK_ACK:
-		MPIDI_CH3_PktPrint_LockAck( stdout, pkt );
-		break;
-		/*
-	    case MPIDI_CH3_PKT_SHARED_LOCK_OPS_DONE:
-		MPIU_DBG_PRINTF((" type ......... MPIDI_CH3_PKT_SHARED_LOCK_OPS_DONE\n"));
-		MPIU_DBG_PRINTF((" source ....... 0x%08X\n", pkt->shared_lock_ops_done.source_win_handle));
-		break;
-		*/
-	    case MPIDI_CH3_PKT_FLOW_CNTL_UPDATE:
-		MPIU_DBG_PRINTF((" FLOW_CNTRL_UPDATE\n"));
-		break;
+            /* FIXME: Move these RMA descriptions into the RMA code files */
+        case MPIDI_CH3_PKT_PUT:
+            MPIDI_CH3_PktPrint_Put(stdout, pkt);
+            break;
+        case MPIDI_CH3_PKT_GET:
+            MPIDI_CH3_PktPrint_Get(stdout, pkt);
+            break;
+        case MPIDI_CH3_PKT_GET_RESP:
+            MPIDI_CH3_PktPrint_GetResp(stdout, pkt);
+            break;
+        case MPIDI_CH3_PKT_ACCUMULATE:
+            MPIDI_CH3_PktPrint_Accumulate(stdout, pkt);
+            break;
+        case MPIDI_CH3_PKT_LOCK:
+            MPIDI_CH3_PktPrint_Lock(stdout, pkt);
+            break;
+        case MPIDI_CH3_PKT_FLUSH_ACK:
+            MPIDI_CH3_PktPrint_FlushAck(stdout, pkt);
+            break;
+        case MPIDI_CH3_PKT_LOCK_ACK:
+            MPIDI_CH3_PktPrint_LockAck(stdout, pkt);
+            break;
+            /*
+             * case MPIDI_CH3_PKT_SHARED_LOCK_OPS_DONE:
+             * MPIU_DBG_PRINTF((" type ......... MPIDI_CH3_PKT_SHARED_LOCK_OPS_DONE\n"));
+             * MPIU_DBG_PRINTF((" source ....... 0x%08X\n", pkt->shared_lock_ops_done.source_win_handle));
+             * break;
+             */
+        case MPIDI_CH3_PKT_FLOW_CNTL_UPDATE:
+            MPIU_DBG_PRINTF((" FLOW_CNTRL_UPDATE\n"));
+            break;
 
-	    case MPIDI_CH3_PKT_CLOSE:
-		MPIDI_CH3_PktPrint_Close( stdout, pkt );
-		break;
-	    
-	    default:
-		MPIU_DBG_PRINTF((" INVALID PACKET\n"));
-		MPIU_DBG_PRINTF((" unknown type ... %d\n", pkt->type));
-		MPIU_DBG_PRINTF(("  type .......... EAGER_SEND\n"));
-		MPIU_DBG_PRINTF(("   sender_reqid . 0x%08X\n", pkt->eager_send.sender_req_id));
-		MPIU_DBG_PRINTF(("   context_id ... %d\n", pkt->eager_send.match.parts.context_id));
-		MPIU_DBG_PRINTF(("   data_sz ...... %d\n", pkt->eager_send.data_sz));
-		MPIU_DBG_PRINTF(("   tag .......... %d\n", pkt->eager_send.match.parts.tag));
-		MPIU_DBG_PRINTF(("   rank ......... %d\n", pkt->eager_send.match.parts.rank));
+        case MPIDI_CH3_PKT_CLOSE:
+            MPIDI_CH3_PktPrint_Close(stdout, pkt);
+            break;
+
+        default:
+            MPIU_DBG_PRINTF((" INVALID PACKET\n"));
+            MPIU_DBG_PRINTF((" unknown type ... %d\n", pkt->type));
+            MPIU_DBG_PRINTF(("  type .......... EAGER_SEND\n"));
+            MPIU_DBG_PRINTF(("   sender_reqid . 0x%08X\n", pkt->eager_send.sender_req_id));
+            MPIU_DBG_PRINTF(("   context_id ... %d\n", pkt->eager_send.match.parts.context_id));
+            MPIU_DBG_PRINTF(("   data_sz ...... %d\n", pkt->eager_send.data_sz));
+            MPIU_DBG_PRINTF(("   tag .......... %d\n", pkt->eager_send.match.parts.tag));
+            MPIU_DBG_PRINTF(("   rank ......... %d\n", pkt->eager_send.match.parts.rank));
 #ifdef MPID_USE_SEQUENCE_NUMBERS
-		MPIU_DBG_PRINTF(("   seqnum ....... %d\n", pkt->eager_send.seqnum));
+            MPIU_DBG_PRINTF(("   seqnum ....... %d\n", pkt->eager_send.seqnum));
 #endif
-		MPIU_DBG_PRINTF(("  type .......... REQ_TO_SEND\n"));
-		MPIU_DBG_PRINTF(("   sender_reqid . 0x%08X\n", pkt->rndv_req_to_send.sender_req_id));
-		MPIU_DBG_PRINTF(("   context_id ... %d\n", pkt->rndv_req_to_send.match.parts.context_id));
-		MPIU_DBG_PRINTF(("   data_sz ...... %d\n", pkt->rndv_req_to_send.data_sz));
-		MPIU_DBG_PRINTF(("   tag .......... %d\n", pkt->rndv_req_to_send.match.parts.tag));
-		MPIU_DBG_PRINTF(("   rank ......... %d\n", pkt->rndv_req_to_send.match.parts.rank));
+            MPIU_DBG_PRINTF(("  type .......... REQ_TO_SEND\n"));
+            MPIU_DBG_PRINTF(("   sender_reqid . 0x%08X\n", pkt->rndv_req_to_send.sender_req_id));
+            MPIU_DBG_PRINTF(("   context_id ... %d\n",
+                             pkt->rndv_req_to_send.match.parts.context_id));
+            MPIU_DBG_PRINTF(("   data_sz ...... %d\n", pkt->rndv_req_to_send.data_sz));
+            MPIU_DBG_PRINTF(("   tag .......... %d\n", pkt->rndv_req_to_send.match.parts.tag));
+            MPIU_DBG_PRINTF(("   rank ......... %d\n", pkt->rndv_req_to_send.match.parts.rank));
 #ifdef MPID_USE_SEQUENCE_NUMBERS
-		MPIU_DBG_PRINTF(("   seqnum ....... %d\n", pkt->rndv_req_to_send.seqnum));
+            MPIU_DBG_PRINTF(("   seqnum ....... %d\n", pkt->rndv_req_to_send.seqnum));
 #endif
-		MPIU_DBG_PRINTF(("  type .......... CLR_TO_SEND\n"));
-		MPIU_DBG_PRINTF(("   sender_reqid . 0x%08X\n", pkt->rndv_clr_to_send.sender_req_id));
-		MPIU_DBG_PRINTF(("   recvr_reqid .. 0x%08X\n", pkt->rndv_clr_to_send.receiver_req_id));
-		MPIU_DBG_PRINTF(("  type .......... RNDV_SEND\n"));
-		MPIU_DBG_PRINTF(("   recvr_reqid .. 0x%08X\n", pkt->rndv_send.receiver_req_id));
-		MPIU_DBG_PRINTF(("  type .......... CANCEL_SEND\n"));
-		MPIU_DBG_PRINTF(("   context_id ... %d\n", pkt->cancel_send_req.match.parts.context_id));
-		MPIU_DBG_PRINTF(("   tag .......... %d\n", pkt->cancel_send_req.match.parts.tag));
-		MPIU_DBG_PRINTF(("   rank ......... %d\n", pkt->cancel_send_req.match.parts.rank));
-		MPIU_DBG_PRINTF(("   sender_reqid . 0x%08X\n", pkt->cancel_send_req.sender_req_id));
-		MPIU_DBG_PRINTF(("  type .......... CANCEL_SEND_RESP\n"));
-		MPIU_DBG_PRINTF(("   sender_reqid . 0x%08X\n", pkt->cancel_send_resp.sender_req_id));
-		MPIU_DBG_PRINTF(("   ack .......... %d\n", pkt->cancel_send_resp.ack));
-		break;
-	}
+            MPIU_DBG_PRINTF(("  type .......... CLR_TO_SEND\n"));
+            MPIU_DBG_PRINTF(("   sender_reqid . 0x%08X\n", pkt->rndv_clr_to_send.sender_req_id));
+            MPIU_DBG_PRINTF(("   recvr_reqid .. 0x%08X\n", pkt->rndv_clr_to_send.receiver_req_id));
+            MPIU_DBG_PRINTF(("  type .......... RNDV_SEND\n"));
+            MPIU_DBG_PRINTF(("   recvr_reqid .. 0x%08X\n", pkt->rndv_send.receiver_req_id));
+            MPIU_DBG_PRINTF(("  type .......... CANCEL_SEND\n"));
+            MPIU_DBG_PRINTF(("   context_id ... %d\n",
+                             pkt->cancel_send_req.match.parts.context_id));
+            MPIU_DBG_PRINTF(("   tag .......... %d\n", pkt->cancel_send_req.match.parts.tag));
+            MPIU_DBG_PRINTF(("   rank ......... %d\n", pkt->cancel_send_req.match.parts.rank));
+            MPIU_DBG_PRINTF(("   sender_reqid . 0x%08X\n", pkt->cancel_send_req.sender_req_id));
+            MPIU_DBG_PRINTF(("  type .......... CANCEL_SEND_RESP\n"));
+            MPIU_DBG_PRINTF(("   sender_reqid . 0x%08X\n", pkt->cancel_send_resp.sender_req_id));
+            MPIU_DBG_PRINTF(("   ack .......... %d\n", pkt->cancel_send_resp.ack));
+            break;
+        }
     }
 }
 #endif
 
 
-const char * MPIDI_VC_GetStateString(int state)
+const char *MPIDI_VC_GetStateString(int state)
 {
-    switch (state)
-    {
-	case MPIDI_VC_STATE_INACTIVE:
-	    return "MPIDI_VC_STATE_INACTIVE";
-	case MPIDI_VC_STATE_INACTIVE_CLOSED:
-	    return "MPIDI_VC_STATE_INACTIVE_CLOSED";
-	case MPIDI_VC_STATE_ACTIVE:
-	    return "MPIDI_VC_STATE_ACTIVE";
-	case MPIDI_VC_STATE_LOCAL_CLOSE:
-	    return "MPIDI_VC_STATE_LOCAL_CLOSE";
-	case MPIDI_VC_STATE_REMOTE_CLOSE:
-	    return "MPIDI_VC_STATE_REMOTE_CLOSE";
-	case MPIDI_VC_STATE_CLOSE_ACKED:
-	    return "MPIDI_VC_STATE_CLOSE_ACKED";
-	case MPIDI_VC_STATE_CLOSED:
-	    return "MPIDI_VC_STATE_CLOSED";
-        case MPIDI_VC_STATE_MORIBUND:
-	    return "MPIDI_VC_STATE_MORIBUND";
-	default:
-	    return "unknown";
+    switch (state) {
+    case MPIDI_VC_STATE_INACTIVE:
+        return "MPIDI_VC_STATE_INACTIVE";
+    case MPIDI_VC_STATE_INACTIVE_CLOSED:
+        return "MPIDI_VC_STATE_INACTIVE_CLOSED";
+    case MPIDI_VC_STATE_ACTIVE:
+        return "MPIDI_VC_STATE_ACTIVE";
+    case MPIDI_VC_STATE_LOCAL_CLOSE:
+        return "MPIDI_VC_STATE_LOCAL_CLOSE";
+    case MPIDI_VC_STATE_REMOTE_CLOSE:
+        return "MPIDI_VC_STATE_REMOTE_CLOSE";
+    case MPIDI_VC_STATE_CLOSE_ACKED:
+        return "MPIDI_VC_STATE_CLOSE_ACKED";
+    case MPIDI_VC_STATE_CLOSED:
+        return "MPIDI_VC_STATE_CLOSED";
+    case MPIDI_VC_STATE_MORIBUND:
+        return "MPIDI_VC_STATE_MORIBUND";
+    default:
+        return "unknown";
     }
 }
 
 /* This routine is not thread safe and should only be used while
    debugging.  It is used to encode a brief description of a message
    packet into a string to make it easy to include in the message log
-   output (with no newlines to simplify extracting info from the log file) 
+   output (with no newlines to simplify extracting info from the log file)
 */
-const char *MPIDI_Pkt_GetDescString( MPIDI_CH3_Pkt_t *pkt ) 
+const char *MPIDI_Pkt_GetDescString(MPIDI_CH3_Pkt_t * pkt)
 {
     static char pktmsg[256];
 
     /* For data messages, the string (...) is (context,tag,rank,size) */
-    switch(pkt->type) {
+    switch (pkt->type) {
     case MPIDI_CH3_PKT_EAGER_SEND:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "EAGER_SEND - (%d,%d,%d,)" MPIDI_MSG_SZ_FMT, 
-		       pkt->eager_send.match.parts.context_id,
-		       (int)pkt->eager_send.match.parts.tag, 
-		       pkt->eager_send.match.parts.rank, 
-		       pkt->eager_send.data_sz );
-	break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg),
+                      "EAGER_SEND - (%d,%d,%d,)" MPIDI_MSG_SZ_FMT,
+                      pkt->eager_send.match.parts.context_id,
+                      (int) pkt->eager_send.match.parts.tag,
+                      pkt->eager_send.match.parts.rank, pkt->eager_send.data_sz);
+        break;
     case MPIDI_CH3_PKT_EAGER_SYNC_SEND:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "EAGER_SYNC_SEND - (%d,%d,%d,)" MPIDI_MSG_SZ_FMT " req=%d", 
-		       pkt->eager_sync_send.match.parts.context_id,
-		       (int)pkt->eager_sync_send.match.parts.tag, 
-		       pkt->eager_sync_send.match.parts.rank, 
-		       pkt->eager_sync_send.data_sz,
-		       pkt->eager_sync_send.sender_req_id );
-		break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg),
+                      "EAGER_SYNC_SEND - (%d,%d,%d,)" MPIDI_MSG_SZ_FMT " req=%d",
+                      pkt->eager_sync_send.match.parts.context_id,
+                      (int) pkt->eager_sync_send.match.parts.tag,
+                      pkt->eager_sync_send.match.parts.rank,
+                      pkt->eager_sync_send.data_sz, pkt->eager_sync_send.sender_req_id);
+        break;
     case MPIDI_CH3_PKT_EAGER_SYNC_ACK:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "EAGER_SYNC_ACK - req=%d", 
-		       pkt->eager_sync_ack.sender_req_id );
-	break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg),
+                      "EAGER_SYNC_ACK - req=%d", pkt->eager_sync_ack.sender_req_id);
+        break;
     case MPIDI_CH3_PKT_READY_SEND:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "READY_SEND - (%d,%d,%d,)" MPIDI_MSG_SZ_FMT, 
-		       pkt->ready_send.match.parts.context_id,
-		       (int)pkt->ready_send.match.parts.tag, 
-		       pkt->ready_send.match.parts.rank, 
-		       pkt->ready_send.data_sz );
-	break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg),
+                      "READY_SEND - (%d,%d,%d,)" MPIDI_MSG_SZ_FMT,
+                      pkt->ready_send.match.parts.context_id,
+                      (int) pkt->ready_send.match.parts.tag,
+                      pkt->ready_send.match.parts.rank, pkt->ready_send.data_sz);
+        break;
     case MPIDI_CH3_PKT_RNDV_REQ_TO_SEND:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "RNDV_REQ_TO_SEND - (%d,%d,%d,)" MPIDI_MSG_SZ_FMT " req=%d", 
-		       pkt->rndv_req_to_send.match.parts.context_id,
-		       (int)pkt->rndv_req_to_send.match.parts.tag, 
-		       pkt->rndv_req_to_send.match.parts.rank, 
-		       pkt->rndv_req_to_send.data_sz,
-		       pkt->rndv_req_to_send.sender_req_id );
-	break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg),
+                      "RNDV_REQ_TO_SEND - (%d,%d,%d,)" MPIDI_MSG_SZ_FMT " req=%d",
+                      pkt->rndv_req_to_send.match.parts.context_id,
+                      (int) pkt->rndv_req_to_send.match.parts.tag,
+                      pkt->rndv_req_to_send.match.parts.rank,
+                      pkt->rndv_req_to_send.data_sz, pkt->rndv_req_to_send.sender_req_id);
+        break;
     case MPIDI_CH3_PKT_RNDV_CLR_TO_SEND:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "RNDV_CLRTO_SEND - req=%d, recv req=%d", 
-		       pkt->rndv_clr_to_send.sender_req_id,
-		       pkt->rndv_clr_to_send.receiver_req_id );
-		break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg),
+                      "RNDV_CLRTO_SEND - req=%d, recv req=%d",
+                      pkt->rndv_clr_to_send.sender_req_id, pkt->rndv_clr_to_send.receiver_req_id);
+        break;
     case MPIDI_CH3_PKT_RNDV_SEND:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "RNDV_SEND - recv req=%d", 
-		       pkt->rndv_send.receiver_req_id );
-	break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg),
+                      "RNDV_SEND - recv req=%d", pkt->rndv_send.receiver_req_id);
+        break;
     case MPIDI_CH3_PKT_CANCEL_SEND_REQ:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "CANCEL_SEND_REQ - req=%d", 
-		       pkt->cancel_send_req.sender_req_id );
-	break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg),
+                      "CANCEL_SEND_REQ - req=%d", pkt->cancel_send_req.sender_req_id);
+        break;
     case MPIDI_CH3_PKT_CANCEL_SEND_RESP:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "CANCEL_SEND_RESP - req=%d ack=%d", 
-		       pkt->cancel_send_resp.sender_req_id, 
-		       pkt->cancel_send_resp.ack );
-	break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg),
+                      "CANCEL_SEND_RESP - req=%d ack=%d",
+                      pkt->cancel_send_resp.sender_req_id, pkt->cancel_send_resp.ack);
+        break;
     case MPIDI_CH3_PKT_PUT:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "PUT - (%p,%d,0x%08X)", 
-		       pkt->put.addr, 
-		       pkt->put.count,
-		       pkt->put.target_win_handle );
-		break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg),
+                      "PUT - (%p,%d,0x%08X)",
+                      pkt->put.addr, pkt->put.count, pkt->put.target_win_handle);
+        break;
     case MPIDI_CH3_PKT_GET:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "GET - (%p,%d,0x%08X) req=%d", 
-		       pkt->get.addr, 
-		       pkt->get.count,
-		       pkt->get.target_win_handle,
-		       pkt->get.request_handle );
-	break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg),
+                      "GET - (%p,%d,0x%08X) req=%d",
+                      pkt->get.addr,
+                      pkt->get.count, pkt->get.target_win_handle, pkt->get.request_handle);
+        break;
     case MPIDI_CH3_PKT_GET_RESP:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "GET_RESP - req=%d", 
-		       pkt->get_resp.request_handle );
-	break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg), "GET_RESP - req=%d", pkt->get_resp.request_handle);
+        break;
     case MPIDI_CH3_PKT_ACCUMULATE:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "ACCUMULATE - (%p,%d,0x%08X)", 
-		       pkt->accum.addr,
-		       pkt->accum.count, 
-		       pkt->accum.target_win_handle );
-	break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg),
+                      "ACCUMULATE - (%p,%d,0x%08X)",
+                      pkt->accum.addr, pkt->accum.count, pkt->accum.target_win_handle);
+        break;
     case MPIDI_CH3_PKT_LOCK:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "LOCK - %d", 
-		       pkt->lock.target_win_handle );
-	break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg), "LOCK - %d", pkt->lock.target_win_handle);
+        break;
     case MPIDI_CH3_PKT_FLUSH_ACK:
-	/* There is no rma_done packet type */
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "RMA_DONE - 0x%08X", 
-		       pkt->flush_ack.source_win_handle );
-	break;
+        /* There is no rma_done packet type */
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg),
+                      "RMA_DONE - 0x%08X", pkt->flush_ack.source_win_handle);
+        break;
     case MPIDI_CH3_PKT_LOCK_ACK:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "LOCK_ACK - 0x%08X",
-		       pkt->lock_ack.source_win_handle );
-		break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg), "LOCK_ACK - 0x%08X", pkt->lock_ack.source_win_handle);
+        break;
     case MPIDI_CH3_PKT_FLOW_CNTL_UPDATE:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "FLOW_CNTL_UPDATE" );
-	break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg), "FLOW_CNTL_UPDATE");
+        break;
     case MPIDI_CH3_PKT_CLOSE:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "CLOSE ack=%d", 
-		       pkt->close.ack );
-	break;
-	    
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg), "CLOSE ack=%d", pkt->close.ack);
+        break;
+
     default:
-	MPIU_Snprintf( pktmsg, sizeof(pktmsg), 
-		       "INVALID PACKET type=%d", pkt->type );
-	break;
+        MPIU_Snprintf(pktmsg, sizeof(pktmsg), "INVALID PACKET type=%d", pkt->type);
+        break;
     }
 
     return pktmsg;
diff --git a/src/mpid/ch3/src/mpidi_rma.c b/src/mpid/ch3/src/mpidi_rma.c
index 408070d..d55b7a9 100644
--- a/src/mpid/ch3/src/mpidi_rma.c
+++ b/src/mpid/ch3/src/mpidi_rma.c
@@ -91,8 +91,10 @@ cvars:
 */
 
 
-MPIDI_RMA_Op_t *global_rma_op_pool = NULL, *global_rma_op_pool_tail = NULL, *global_rma_op_pool_start = NULL;
-MPIDI_RMA_Target_t *global_rma_target_pool = NULL, *global_rma_target_pool_tail = NULL, *global_rma_target_pool_start = NULL;
+MPIDI_RMA_Op_t *global_rma_op_pool = NULL, *global_rma_op_pool_tail =
+    NULL, *global_rma_op_pool_start = NULL;
+MPIDI_RMA_Target_t *global_rma_target_pool = NULL, *global_rma_target_pool_tail =
+    NULL, *global_rma_target_pool_start = NULL;
 MPIDI_RMA_Pkt_orderings_t *MPIDI_RMA_Pkt_orderings = NULL;
 
 #undef FUNCNAME
@@ -122,14 +124,14 @@ int MPIDI_RMA_init(void)
                         mpi_errno, "RMA target pool");
     for (i = 0; i < MPIR_CVAR_CH3_RMA_TARGET_GLOBAL_POOL_SIZE; i++) {
         global_rma_target_pool_start[i].pool_type = MPIDI_RMA_POOL_GLOBAL;
-        MPL_LL_APPEND(global_rma_target_pool, global_rma_target_pool_tail, &(global_rma_target_pool_start[i]));
+        MPL_LL_APPEND(global_rma_target_pool, global_rma_target_pool_tail,
+                      &(global_rma_target_pool_start[i]));
     }
 
     MPIU_CHKPMEM_MALLOC(MPIDI_RMA_Pkt_orderings, struct MPIDI_RMA_Pkt_orderings *,
-                        sizeof(struct MPIDI_RMA_Pkt_orderings),
-                        mpi_errno, "RMA packet orderings");
+                        sizeof(struct MPIDI_RMA_Pkt_orderings), mpi_errno, "RMA packet orderings");
     /* FIXME: here we should let channel to set ordering flags. For now we just set them
-       in CH3 layer. */
+     * in CH3 layer. */
     MPIDI_RMA_Pkt_orderings->flush_remote = 1;
 
   fn_exit:
@@ -176,8 +178,8 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FREE);
 
     /* it is possible that there is a IBARRIER in MPI_WIN_FENCE with
-       MODE_NOPRECEDE not being completed, we let the progress engine
-       to delete its request when it is completed. */
+     * MODE_NOPRECEDE not being completed, we let the progress engine
+     * to delete its request when it is completed. */
     if ((*win_ptr)->fence_sync_req != MPI_REQUEST_NULL) {
         MPID_Request *req_ptr;
         MPID_Request_get_ptr((*win_ptr)->fence_sync_req, req_ptr);
@@ -194,19 +196,18 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
     /* 1. Here we must wait until all passive locks are released on this target,
-       because for some UNLOCK messages, we do not send ACK back to origin,
-       we must wait until lock is released so that we can free window.
-       2. We also need to wait until AT completion counter being zero, because
-       this counter is increment everytime we meet a GET-like operation, it is
-       possible that when target entering Win_free, passive epoch is not finished
-       yet and there are still GETs doing on this target.
-       3. We also need to wait until lock queue becomes empty. It is possible
-       that some lock requests is still waiting in the queue when target is
-       entering Win_free. */
+     * because for some UNLOCK messages, we do not send ACK back to origin,
+     * we must wait until lock is released so that we can free window.
+     * 2. We also need to wait until AT completion counter being zero, because
+     * this counter is increment everytime we meet a GET-like operation, it is
+     * possible that when target entering Win_free, passive epoch is not finished
+     * yet and there are still GETs doing on this target.
+     * 3. We also need to wait until lock queue becomes empty. It is possible
+     * that some lock requests is still waiting in the queue when target is
+     * entering Win_free. */
     while ((*win_ptr)->current_lock_type != MPID_LOCK_NONE ||
            (*win_ptr)->at_completion_counter != 0 ||
-           (*win_ptr)->lock_queue != NULL ||
-           (*win_ptr)->current_lock_data_bytes != 0) {
+           (*win_ptr)->lock_queue != NULL || (*win_ptr)->current_lock_data_bytes != 0) {
         mpi_errno = wait_progress_engine();
         if (mpi_errno != MPI_SUCCESS)
             MPIU_ERR_POP(mpi_errno);
@@ -214,7 +215,7 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
 
     if (!(*win_ptr)->shm_allocated) {
         /* when SHM is allocated, we already did a global barrier in
-           MPIDI_CH3_SHM_Win_free, so we do not need to do it again here. */
+         * MPIDI_CH3_SHM_Win_free, so we do not need to do it again here. */
         mpi_errno = MPIR_Barrier_impl((*win_ptr)->comm_ptr, &errflag);
         if (mpi_errno)
             MPIU_ERR_POP(mpi_errno);
@@ -240,7 +241,7 @@ int MPIDI_Win_free(MPID_Win ** win_ptr)
     MPIU_Free((*win_ptr)->target_pool_start);
     MPIU_Free((*win_ptr)->slots);
     if (!(*win_ptr)->info_args.no_locks) {
-    MPIU_Free((*win_ptr)->lock_entry_pool_start);
+        MPIU_Free((*win_ptr)->lock_entry_pool_start);
     }
     MPIU_Assert((*win_ptr)->current_lock_data_bytes == 0);
 
diff --git a/test/mpi/rma/atomic_rmw_cas.c b/test/mpi/rma/atomic_rmw_cas.c
index 2b9a711..7db1b53 100644
--- a/test/mpi/rma/atomic_rmw_cas.c
+++ b/test/mpi/rma/atomic_rmw_cas.c
@@ -25,7 +25,8 @@
 #define LOOP_SIZE 10000
 #define CHECK_TAG 123
 
-int main (int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     int rank, size, i, j, k;
     int errors = 0;
     int origin_shm, origin_am, dest;
@@ -54,10 +55,9 @@ int main (int argc, char *argv[]) {
         MPI_Alloc_mem(sizeof(int), MPI_INFO_NULL, &compare_buf);
     }
 
-    MPI_Win_allocate(sizeof(int), sizeof(int), MPI_INFO_NULL,
-                     MPI_COMM_WORLD, &target_buf, &win);
+    MPI_Win_allocate(sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &target_buf, &win);
 
-    for (k = 0; k < LOOP_SIZE; k++)  {
+    for (k = 0; k < LOOP_SIZE; k++) {
 
         /* init buffers */
         if (rank == origin_shm) {
@@ -96,14 +96,18 @@ int main (int argc, char *argv[]) {
             MPI_Alloc_mem(sizeof(int) * 3, MPI_INFO_NULL, &check_buf);
             MPI_Gather(target_buf, 1, MPI_INT, check_buf, 1, MPI_INT, dest, MPI_COMM_WORLD);
 
-            if (!(check_buf[dest] == 0 && check_buf[origin_shm] == 0 && check_buf[origin_am] == 1) &&
-                !(check_buf[dest] == 1 && check_buf[origin_shm] == 0 && check_buf[origin_am] == 0)) {
+            if (!(check_buf[dest] == 0 && check_buf[origin_shm] == 0 && check_buf[origin_am] == 1)
+                && !(check_buf[dest] == 1 && check_buf[origin_shm] == 0 &&
+                     check_buf[origin_am] == 0)) {
 
-                printf("Wrong results: target result = %d, origin_shm result = %d, origin_am result = %d\n",
-                       check_buf[dest], check_buf[origin_shm], check_buf[origin_am]);
+                printf
+                    ("Wrong results: target result = %d, origin_shm result = %d, origin_am result = %d\n",
+                     check_buf[dest], check_buf[origin_shm], check_buf[origin_am]);
 
-                printf("Expected results (1): target result = 1, origin_shm result = 0, origin_am result = 0\n");
-                printf("Expected results (2): target result = 0, origin_shm result = 0, origin_am result = 1\n");
+                printf
+                    ("Expected results (1): target result = 1, origin_shm result = 0, origin_am result = 0\n");
+                printf
+                    ("Expected results (2): target result = 0, origin_shm result = 0, origin_am result = 1\n");
 
                 errors++;
             }
@@ -120,7 +124,7 @@ int main (int argc, char *argv[]) {
         MPI_Free_mem(compare_buf);
     }
 
- exit_test:
+  exit_test:
     if (rank == dest && errors == 0)
         printf(" No Errors\n");
 
diff --git a/test/mpi/rma/atomic_rmw_fop.c b/test/mpi/rma/atomic_rmw_fop.c
index 873efe8..f30318c 100644
--- a/test/mpi/rma/atomic_rmw_fop.c
+++ b/test/mpi/rma/atomic_rmw_fop.c
@@ -23,7 +23,8 @@
 #define LOOP_SIZE 15
 #define CHECK_TAG 123
 
-int main (int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     int rank, size, i, j, k;
     int errors = 0, all_errors = 0;
     int origin_shm, origin_am, dest;
@@ -46,8 +47,10 @@ int main (int argc, char *argv[]) {
     origin_shm = 0;
     origin_am = 1;
 
-    if (rank == origin_am) my_buf_size = AM_BUF_SIZE;
-    else if (rank == origin_shm) my_buf_size = SHM_BUF_SIZE;
+    if (rank == origin_am)
+        my_buf_size = AM_BUF_SIZE;
+    else if (rank == origin_shm)
+        my_buf_size = SHM_BUF_SIZE;
 
     if (rank != dest) {
         MPI_Alloc_mem(sizeof(int) * my_buf_size, MPI_INFO_NULL, &orig_buf);
@@ -57,15 +60,20 @@ int main (int argc, char *argv[]) {
     MPI_Win_allocate(sizeof(int) * WIN_BUF_SIZE, sizeof(int), MPI_INFO_NULL,
                      MPI_COMM_WORLD, &target_buf, &win);
 
-    for (k = 0; k < LOOP_SIZE; k++)  {
+    for (k = 0; k < LOOP_SIZE; k++) {
 
         /* init buffers */
         if (rank != dest) {
-            for (i = 0; i < my_buf_size; i++) {orig_buf[i] = 1; result_buf[i] = 0;}
+            for (i = 0; i < my_buf_size; i++) {
+                orig_buf[i] = 1;
+                result_buf[i] = 0;
+            }
         }
         else {
             MPI_Win_lock(MPI_LOCK_SHARED, rank, 0, win);
-            for (i = 0; i < WIN_BUF_SIZE; i++) {target_buf[i] = 0;}
+            for (i = 0; i < WIN_BUF_SIZE; i++) {
+                target_buf[i] = 0;
+            }
             MPI_Win_unlock(rank, win);
         }
 
@@ -90,12 +98,14 @@ int main (int argc, char *argv[]) {
             }
             else if (rank == origin_shm) {
                 MPI_Alloc_mem(sizeof(int) * AM_BUF_SIZE, MPI_INFO_NULL, &check_buf);
-                MPI_Recv(check_buf, AM_BUF_SIZE, MPI_INT, origin_am, CHECK_TAG, MPI_COMM_WORLD, &status);
+                MPI_Recv(check_buf, AM_BUF_SIZE, MPI_INT, origin_am, CHECK_TAG, MPI_COMM_WORLD,
+                         &status);
                 for (i = 0; i < AM_BUF_SIZE; i++) {
                     for (j = 0; j < SHM_BUF_SIZE; j++) {
                         if (check_buf[i] == result_buf[j]) {
-                            printf("LOOP=%d, rank=%d, FOP, both check_buf[%d] and result_buf[%d] equal to %d, expected to be different. \n",
-                                   k, rank, i, j, check_buf[i]);
+                            printf
+                                ("LOOP=%d, rank=%d, FOP, both check_buf[%d] and result_buf[%d] equal to %d, expected to be different. \n",
+                                 k, rank, i, j, check_buf[i]);
                             errors++;
                         }
                     }
@@ -107,7 +117,7 @@ int main (int argc, char *argv[]) {
             /* check results on P1 */
             if (target_buf[0] != AM_BUF_SIZE + SHM_BUF_SIZE) {
                 printf("LOOP=%d, rank=%d, FOP, target_buf[0] = %d, expected %d. \n",
-                       k, rank, target_buf[0], AM_BUF_SIZE+SHM_BUF_SIZE);
+                       k, rank, target_buf[0], AM_BUF_SIZE + SHM_BUF_SIZE);
                 errors++;
             }
         }
@@ -120,7 +130,7 @@ int main (int argc, char *argv[]) {
         MPI_Free_mem(result_buf);
     }
 
- exit_test:
+  exit_test:
     MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
 
     if (rank == 0 && all_errors == 0)
diff --git a/test/mpi/rma/atomic_rmw_gacc.c b/test/mpi/rma/atomic_rmw_gacc.c
index 8bd7582..5a59ec2 100644
--- a/test/mpi/rma/atomic_rmw_gacc.c
+++ b/test/mpi/rma/atomic_rmw_gacc.c
@@ -31,24 +31,29 @@ int rank, size;
 int dest, origin_shm, origin_am;
 int *orig_buf = NULL, *result_buf = NULL, *target_buf = NULL, *check_buf = NULL;
 
-void checkResults(int loop_k, int *errors) {
+void checkResults(int loop_k, int *errors)
+{
     int i, j, m;
     MPI_Status status;
 
     if (rank != dest) {
         /* check results on P0 and P2 (origin) */
         if (rank == origin_am) {
-            MPI_Send(result_buf, AM_BUF_NUM * OP_COUNT, MPI_INT, origin_shm, CHECK_TAG, MPI_COMM_WORLD);
+            MPI_Send(result_buf, AM_BUF_NUM * OP_COUNT, MPI_INT, origin_shm, CHECK_TAG,
+                     MPI_COMM_WORLD);
         }
         else if (rank == origin_shm) {
             MPI_Alloc_mem(sizeof(int) * AM_BUF_NUM * OP_COUNT, MPI_INFO_NULL, &check_buf);
-            MPI_Recv(check_buf, AM_BUF_NUM * OP_COUNT, MPI_INT, origin_am, CHECK_TAG, MPI_COMM_WORLD, &status);
+            MPI_Recv(check_buf, AM_BUF_NUM * OP_COUNT, MPI_INT, origin_am, CHECK_TAG,
+                     MPI_COMM_WORLD, &status);
             for (i = 0; i < AM_BUF_NUM; i++) {
                 for (j = 0; j < SHM_BUF_NUM; j++) {
                     for (m = 0; m < OP_COUNT; m++) {
-                        if (check_buf[i*OP_COUNT+m] == result_buf[j*OP_COUNT+m]) {
-                            printf("LOOP=%d, rank=%d, FOP, both check_buf[%d] and result_buf[%d] equal to %d, expected to be different. \n",
-                                   loop_k, rank, i*OP_COUNT+m, j*OP_COUNT+m, check_buf[i*OP_COUNT+m]);
+                        if (check_buf[i * OP_COUNT + m] == result_buf[j * OP_COUNT + m]) {
+                            printf
+                                ("LOOP=%d, rank=%d, FOP, both check_buf[%d] and result_buf[%d] equal to %d, expected to be different. \n",
+                                 loop_k, rank, i * OP_COUNT + m, j * OP_COUNT + m,
+                                 check_buf[i * OP_COUNT + m]);
                             (*errors)++;
                         }
                     }
@@ -62,14 +67,15 @@ void checkResults(int loop_k, int *errors) {
         for (i = 0; i < OP_COUNT; i++) {
             if (target_buf[i] != AM_BUF_NUM + SHM_BUF_NUM) {
                 printf("LOOP=%d, rank=%d, FOP, target_buf[%d] = %d, expected %d. \n",
-                       loop_k, rank, i, target_buf[i], AM_BUF_NUM+SHM_BUF_NUM);
+                       loop_k, rank, i, target_buf[i], AM_BUF_NUM + SHM_BUF_NUM);
                 (*errors)++;
             }
         }
     }
 }
 
-int main (int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     int i, j, k;
     int errors = 0, all_errors = 0;
     int my_buf_num;
@@ -95,8 +101,10 @@ int main (int argc, char *argv[]) {
     origin_shm = 0;
     origin_am = 1;
 
-    if (rank == origin_am) my_buf_num = AM_BUF_NUM;
-    else if (rank == origin_shm) my_buf_num = SHM_BUF_NUM;
+    if (rank == origin_am)
+        my_buf_num = AM_BUF_NUM;
+    else if (rank == origin_shm)
+        my_buf_num = SHM_BUF_NUM;
 
     if (rank != dest) {
         MPI_Alloc_mem(sizeof(int) * my_buf_num * OP_COUNT, MPI_INFO_NULL, &orig_buf);
@@ -106,17 +114,22 @@ int main (int argc, char *argv[]) {
     MPI_Win_allocate(sizeof(int) * WIN_BUF_NUM * OP_COUNT, sizeof(int), MPI_INFO_NULL,
                      MPI_COMM_WORLD, &target_buf, &win);
 
-    for (k = 0; k < LOOP_SIZE; k++)  {
+    for (k = 0; k < LOOP_SIZE; k++) {
 
         /* ====== Part 1: test basic datatypes ======== */
 
         /* init buffers */
         if (rank != dest) {
-            for (i = 0; i < my_buf_num * OP_COUNT; i++) {orig_buf[i] = 1; result_buf[i] = 0;}
+            for (i = 0; i < my_buf_num * OP_COUNT; i++) {
+                orig_buf[i] = 1;
+                result_buf[i] = 0;
+            }
         }
         else {
             MPI_Win_lock(MPI_LOCK_SHARED, rank, 0, win);
-            for (i = 0; i < WIN_BUF_NUM * OP_COUNT; i++) {target_buf[i] = 0;}
+            for (i = 0; i < WIN_BUF_NUM * OP_COUNT; i++) {
+                target_buf[i] = 0;
+            }
             MPI_Win_unlock(rank, win);
         }
 
@@ -125,8 +138,8 @@ int main (int argc, char *argv[]) {
         MPI_Win_lock_all(0, win);
         if (rank != dest) {
             for (i = 0; i < my_buf_num; i++) {
-                MPI_Get_accumulate(&(orig_buf[i*OP_COUNT]), OP_COUNT, MPI_INT,
-                                   &(result_buf[i*OP_COUNT]), OP_COUNT, MPI_INT,
+                MPI_Get_accumulate(&(orig_buf[i * OP_COUNT]), OP_COUNT, MPI_INT,
+                                   &(result_buf[i * OP_COUNT]), OP_COUNT, MPI_INT,
                                    dest, 0, OP_COUNT, MPI_INT, MPI_SUM, win);
                 MPI_Win_flush(dest, win);
             }
@@ -141,11 +154,16 @@ int main (int argc, char *argv[]) {
 
         /* init buffers */
         if (rank != dest) {
-            for (i = 0; i < my_buf_num * OP_COUNT; i++) {orig_buf[i] = 1; result_buf[i] = 0;}
+            for (i = 0; i < my_buf_num * OP_COUNT; i++) {
+                orig_buf[i] = 1;
+                result_buf[i] = 0;
+            }
         }
         else {
             MPI_Win_lock(MPI_LOCK_SHARED, rank, 0, win);
-            for (i = 0; i < WIN_BUF_NUM * OP_COUNT; i++) {target_buf[i] = 0;}
+            for (i = 0; i < WIN_BUF_NUM * OP_COUNT; i++) {
+                target_buf[i] = 0;
+            }
             MPI_Win_unlock(rank, win);
         }
 
@@ -154,8 +172,8 @@ int main (int argc, char *argv[]) {
         MPI_Win_lock_all(0, win);
         if (rank != dest) {
             for (i = 0; i < my_buf_num; i++) {
-                MPI_Get_accumulate(&(orig_buf[i*OP_COUNT]), 1, origin_dtp,
-                                   &(result_buf[i*OP_COUNT]), 1, origin_dtp,
+                MPI_Get_accumulate(&(orig_buf[i * OP_COUNT]), 1, origin_dtp,
+                                   &(result_buf[i * OP_COUNT]), 1, origin_dtp,
                                    dest, 0, 1, target_dtp, MPI_SUM, win);
                 MPI_Win_flush(dest, win);
             }
@@ -170,11 +188,16 @@ int main (int argc, char *argv[]) {
 
         /* init buffers */
         if (rank != dest) {
-            for (i = 0; i < my_buf_num * OP_COUNT; i++) {orig_buf[i] = 1; result_buf[i] = 0;}
+            for (i = 0; i < my_buf_num * OP_COUNT; i++) {
+                orig_buf[i] = 1;
+                result_buf[i] = 0;
+            }
         }
         else {
             MPI_Win_lock(MPI_LOCK_SHARED, rank, 0, win);
-            for (i = 0; i < WIN_BUF_NUM * OP_COUNT; i++) {target_buf[i] = 0;}
+            for (i = 0; i < WIN_BUF_NUM * OP_COUNT; i++) {
+                target_buf[i] = 0;
+            }
             MPI_Win_unlock(rank, win);
         }
 
@@ -183,8 +206,8 @@ int main (int argc, char *argv[]) {
         MPI_Win_lock_all(0, win);
         if (rank != dest) {
             for (i = 0; i < my_buf_num; i++) {
-                MPI_Get_accumulate(&(orig_buf[i*OP_COUNT]), OP_COUNT, MPI_INT,
-                                   &(result_buf[i*OP_COUNT]), OP_COUNT, MPI_INT,
+                MPI_Get_accumulate(&(orig_buf[i * OP_COUNT]), OP_COUNT, MPI_INT,
+                                   &(result_buf[i * OP_COUNT]), OP_COUNT, MPI_INT,
                                    dest, 0, 1, target_dtp, MPI_SUM, win);
                 MPI_Win_flush(dest, win);
             }
@@ -199,11 +222,16 @@ int main (int argc, char *argv[]) {
 
         /* init buffers */
         if (rank != dest) {
-            for (i = 0; i < my_buf_num * OP_COUNT; i++) {orig_buf[i] = 1; result_buf[i] = 0;}
+            for (i = 0; i < my_buf_num * OP_COUNT; i++) {
+                orig_buf[i] = 1;
+                result_buf[i] = 0;
+            }
         }
         else {
             MPI_Win_lock(MPI_LOCK_SHARED, rank, 0, win);
-            for (i = 0; i < WIN_BUF_NUM * OP_COUNT; i++) {target_buf[i] = 0;}
+            for (i = 0; i < WIN_BUF_NUM * OP_COUNT; i++) {
+                target_buf[i] = 0;
+            }
             MPI_Win_unlock(rank, win);
         }
 
@@ -212,8 +240,8 @@ int main (int argc, char *argv[]) {
         MPI_Win_lock_all(0, win);
         if (rank != dest) {
             for (i = 0; i < my_buf_num; i++) {
-                MPI_Get_accumulate(&(orig_buf[i*OP_COUNT]), 1, origin_dtp,
-                                   &(result_buf[i*OP_COUNT]), 1, origin_dtp,
+                MPI_Get_accumulate(&(orig_buf[i * OP_COUNT]), 1, origin_dtp,
+                                   &(result_buf[i * OP_COUNT]), 1, origin_dtp,
                                    dest, 0, OP_COUNT, MPI_INT, MPI_SUM, win);
                 MPI_Win_flush(dest, win);
             }
@@ -235,7 +263,7 @@ int main (int argc, char *argv[]) {
     MPI_Type_free(&origin_dtp);
     MPI_Type_free(&target_dtp);
 
- exit_test:
+  exit_test:
     MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
 
     if (rank == 0 && all_errors == 0)

-----------------------------------------------------------------------

Summary of changes:
 src/binding/fortran/use_mpi/create_f90_util.c      |    6 +-
 src/mpi/datatype/get_elements_x.c                  |   16 +-
 .../ch3/channels/nemesis/include/mpid_nem_inline.h |    1 -
 .../ch3/channels/nemesis/include/mpid_nem_pre.h    |    5 +-
 src/mpid/ch3/channels/nemesis/netmod/ib/ib_send.c  |    5 +-
 .../ch3/channels/nemesis/netmod/mxm/mxm_send.c     |    7 +-
 .../channels/nemesis/netmod/newmad/newmad_send.c   |    7 +-
 src/mpid/ch3/channels/nemesis/netmod/ofi/ofi_msg.c |    8 +-
 .../ch3/channels/nemesis/netmod/portals4/ptl_nm.c  |   18 +-
 src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c    |   86 +-
 src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c    |  649 +++++++----
 src/mpid/ch3/channels/sock/src/ch3_win_fns.c       |   18 +
 src/mpid/ch3/include/mpid_rma_issue.h              | 1091 +++++++++++++----
 src/mpid/ch3/include/mpid_rma_lockqueue.h          |   12 +-
 src/mpid/ch3/include/mpid_rma_oplist.h             |  163 ++-
 src/mpid/ch3/include/mpid_rma_shm.h                |  372 +++---
 src/mpid/ch3/include/mpid_rma_types.h              |   23 +-
 src/mpid/ch3/include/mpidimpl.h                    |   47 +-
 src/mpid/ch3/include/mpidpkt.h                     |   94 +-
 src/mpid/ch3/include/mpidpre.h                     |   17 +-
 src/mpid/ch3/include/mpidrma.h                     |  481 +++++---
 src/mpid/ch3/src/ch3u_handle_op_req.c              |    4 +-
 src/mpid/ch3/src/ch3u_handle_recv_req.c            | 1308 +++++++++++---------
 src/mpid/ch3/src/ch3u_handle_send_req.c            |  220 ++--
 src/mpid/ch3/src/ch3u_request.c                    |   11 +-
 src/mpid/ch3/src/ch3u_rma_oplist.c                 |  282 +++--
 src/mpid/ch3/src/ch3u_rma_ops.c                    |  333 ++++--
 src/mpid/ch3/src/ch3u_rma_pkthandler.c             | 1088 +++++++++--------
 src/mpid/ch3/src/ch3u_rma_sync.c                   |  314 +++---
 src/mpid/ch3/src/ch3u_win_fns.c                    |  222 ++--
 src/mpid/ch3/src/mpid_init.c                       |    2 +
 src/mpid/ch3/src/mpid_rma.c                        |   56 +-
 src/mpid/ch3/src/mpidi_printf.c                    |  488 ++++----
 src/mpid/ch3/src/mpidi_rma.c                       |   64 +-
 src/mpid/common/datatype/mpid_dataloop.h           |    4 +-
 src/mpid/common/datatype/mpid_datatype.h           |   33 +-
 src/mpid/common/datatype/mpid_type_blockindexed.c  |   16 +-
 src/mpid/common/datatype/mpid_type_contiguous.c    |   16 +-
 .../common/datatype/mpid_type_create_pairtype.c    |   34 +-
 .../common/datatype/mpid_type_create_resized.c     |   12 +-
 src/mpid/common/datatype/mpid_type_debug.c         |    6 +-
 src/mpid/common/datatype/mpid_type_dup.c           |    6 +-
 src/mpid/common/datatype/mpid_type_indexed.c       |   20 +-
 src/mpid/common/datatype/mpid_type_struct.c        |   14 +-
 src/mpid/common/datatype/mpid_type_vector.c        |   16 +-
 src/mpid/common/datatype/mpid_type_zerolen.c       |    6 +-
 src/mpid/common/hcoll/hcoll_rte.c                  |    6 +-
 test/mpi/rma/Makefile.am                           |    3 +-
 test/mpi/rma/acc-pairtype.c                        |   87 ++
 test/mpi/rma/atomic_rmw_cas.c                      |   26 +-
 test/mpi/rma/atomic_rmw_fop.c                      |   32 +-
 test/mpi/rma/atomic_rmw_gacc.c                     |   84 +-
 test/mpi/rma/testlist.in                           |    1 +
 53 files changed, 4707 insertions(+), 3233 deletions(-)
 create mode 100644 test/mpi/rma/acc-pairtype.c


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list