[mpich-commits] [mpich] MPICH primary repository branch, master, updated. v3.0.4-403-gcd5986f

mysql vizuser noreply at mpich.org
Sun Jul 28 16:53:29 CDT 2013


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "MPICH primary repository".

The branch, master has been updated
       via  cd5986fc1cfaa68fb4abb6c6245c82e319be77ee (commit)
       via  f038a277715f77c3b746215c6ac527cb1ddfebad (commit)
       via  f0e27d1d65541d4a69a1152cd318a72e8c038bfa (commit)
       via  62cd51ca10890e27bcb13491d876c9d610b52876 (commit)
       via  5467e5004174421ae581dd08253adfe5c86a0000 (commit)
       via  447c1d5585135a41fd6341dcb9760ba945261f11 (commit)
       via  34dd7c1ad295d223119dbad3d032e38fa7f94c8e (commit)
       via  55cb0f4c1cdc5c1e7017d487bc2f13871dcf766e (commit)
       via  b668bc1ed3168790d8361ff14e1976581349cb81 (commit)
       via  384d96b7b7f193963a178e67ebb64d750751f3cc (commit)
       via  44d9c98dc934631af208c62113703c135bd86064 (commit)
       via  382bdfb2a2d35025b906033bc63fea98e3c551e3 (commit)
       via  e397148092dc2377b0671ebb22238a1a3456ea90 (commit)
       via  5c6b971af446d9ffaafdb35da449c78b82b92d8e (commit)
      from  d698823cf1db9bcacb69b1456953c06ec074a460 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.mpich.org/mpich.git/commitdiff/cd5986fc1cfaa68fb4abb6c6245c82e319be77ee

commit cd5986fc1cfaa68fb4abb6c6245c82e319be77ee
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Wed Jul 24 22:47:45 2013 -0500

    Set alloc_shared_noncontig to 1 by default if alloc_shm is passed to MPI_Win_allocate.
    
    Because shared memory region is not exposed to the user but just an runtime
    optimization, it is not useful to allocate a contiguous memory region.
    
    If user set alloc_shared_noncontig to "false", alloc_shared_noncontig will be set to 0.
    
    Signed-off-by: Pavan Balaji <balaji at mcs.anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
index 6b4fdce..c867138 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
@@ -62,15 +62,22 @@ static int MPIDI_CH3I_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Inf
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHARED);
 
+    /* If create flavor is MPI_WIN_FLAVOR_ALLOCATE, alloc_shared_noncontig is set to 1 by default. */
+    if ((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE)
+        (*win_ptr)->info_args.alloc_shared_noncontig = 1;
+
     /* Check if we are allowed to allocate space non-contiguously */
     if (info != NULL) {
         int alloc_shared_nctg_flag = 0;
         char alloc_shared_nctg_value[MPI_MAX_INFO_VAL+1];
         MPIR_Info_get_impl(info, "alloc_shared_noncontig", MPI_MAX_INFO_VAL,
                            alloc_shared_nctg_value, &alloc_shared_nctg_flag);
-        if ((alloc_shared_nctg_flag == 1) && (!strncmp(alloc_shared_nctg_value, "true",
-                                                       strlen("true"))))
-            (*win_ptr)->info_args.alloc_shared_noncontig = 1;
+        if ((alloc_shared_nctg_flag == 1)) {
+            if (!strncmp(alloc_shared_nctg_value, "true", strlen("true")))
+                (*win_ptr)->info_args.alloc_shared_noncontig = 1;
+            if (!strncmp(alloc_shared_nctg_value, "false", strlen("false")))
+                (*win_ptr)->info_args.alloc_shared_noncontig = 0;
+        }
     }
 
     /* see if we can allocate all windows contiguously */

http://git.mpich.org/mpich.git/commitdiff/f038a277715f77c3b746215c6ac527cb1ddfebad

commit f038a277715f77c3b746215c6ac527cb1ddfebad
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Jul 22 13:02:16 2013 -0500

    add alloc_mem test in win_info.c test.
    
    Signed-off-by: Pavan Balaji <balaji at mcs.anl.gov>

diff --git a/test/mpi/rma/win_info.c b/test/mpi/rma/win_info.c
index 44286a9..3a2222d 100644
--- a/test/mpi/rma/win_info.c
+++ b/test/mpi/rma/win_info.c
@@ -57,6 +57,9 @@ int main(int argc, char **argv) {
     MPI_Info_get(info_out, "same_size", MPI_MAX_INFO_VAL, buf, &flag);
     if (flag && VERBOSE) printf("%d: same_size = %s\n", rank, buf);
 
+    MPI_Info_get(info_out, "alloc_shm", MPI_MAX_INFO_VAL, buf, &flag);
+    if (flag && VERBOSE) printf("%d: alloc_shm = %s\n", rank, buf);
+
     MPI_Info_free(&info_in);
     MPI_Info_free(&info_out);
     MPI_Win_free(&win);

http://git.mpich.org/mpich.git/commitdiff/f0e27d1d65541d4a69a1152cd318a72e8c038bfa

commit f0e27d1d65541d4a69a1152cd318a72e8c038bfa
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Jul 22 13:01:49 2013 -0500

    add alloc_mem test in req_example.c test.
    
    Signed-off-by: Pavan Balaji <balaji at mcs.anl.gov>

diff --git a/test/mpi/rma/Makefile.am b/test/mpi/rma/Makefile.am
index 2ccccb0..8c88221 100644
--- a/test/mpi/rma/Makefile.am
+++ b/test/mpi/rma/Makefile.am
@@ -111,6 +111,7 @@ noinst_PROGRAMS =          \
     flush                  \
     reqops                 \
     req_example            \
+    req_example_shm        \
     win_info               \
     pscw_ordering          \
     mutex_bench            \
@@ -160,6 +161,9 @@ get_accumulate_long_derived_SOURCES   = get_accumulate.c
 get_accumulate_double_derived_SOURCES = get_accumulate.c
 get_accumulate_int_derived_SOURCES      = get_accumulate.c
 
+req_example_shm_CPPFLAGS         = -DUSE_WIN_ALLOC_SHM $(AM_CPPFLAGS)
+req_example_shm_SOURCES          = req_example.c
+
 mutex_bench_SOURCES              = mutex_bench.c mcs-mutex.c mcs-mutex.h
 mutex_bench_shared_CPPFLAGS      = -DUSE_WIN_SHARED $(AM_CPPFLAGS)
 mutex_bench_shared_SOURCES       = mutex_bench.c mcs-mutex.c mcs-mutex.h
diff --git a/test/mpi/rma/req_example.c b/test/mpi/rma/req_example.c
index 571325c..5788972 100644
--- a/test/mpi/rma/req_example.c
+++ b/test/mpi/rma/req_example.c
@@ -42,6 +42,7 @@ int main( int argc, char *argv[] )
     MPI_Request get_req;
     double *baseptr;
     double data[M][N]; /* M buffers of length N */
+    MPI_Info win_info;
 
     MPI_Init(&argc, &argv);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@@ -49,7 +50,15 @@ int main( int argc, char *argv[] )
 
     assert(M < NSTEPS);
 
-    MPI_Win_allocate(NSTEPS*N*sizeof(double), sizeof(double), MPI_INFO_NULL,
+    MPI_Info_create(&win_info);
+
+#ifdef USE_WIN_ALLOC_SHM
+    MPI_Info_set(win_info, "alloc_shm", "true");
+#else
+    MPI_Info_set(win_info, "alloc_shm", "false");
+#endif
+
+    MPI_Win_allocate(NSTEPS*N*sizeof(double), sizeof(double), win_info,
                      MPI_COMM_WORLD, &baseptr, &win);
 
     MPI_Win_lock_all(0, win);
@@ -80,6 +89,8 @@ int main( int argc, char *argv[] )
 
     MPI_Win_free(&win);
 
+    MPI_Info_free(&win_info);
+
     MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
 
     if (rank == 0 && all_errors == 0)
diff --git a/test/mpi/rma/testlist b/test/mpi/rma/testlist
index 522466c..d69d4ed 100644
--- a/test/mpi/rma/testlist
+++ b/test/mpi/rma/testlist
@@ -91,6 +91,7 @@ get_accumulate_short_derived 4 mpiversion=3.0
 flush 4 mpiversion=3.0
 reqops 4 mpiversion=3.0
 req_example 4 mpiversion=3.0
+req_example_shm 4 mpiversion=3.0
 win_info 4 mpiversion=3.0
 linked_list_lockall 4 mpiversion=3.0
 pscw_ordering 4 mpiversion=3.0

http://git.mpich.org/mpich.git/commitdiff/62cd51ca10890e27bcb13491d876c9d610b52876

commit 62cd51ca10890e27bcb13491d876c9d610b52876
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Jul 22 13:01:20 2013 -0500

    add alloc_shm test in mcs-mutex.c test.
    
    Signed-off-by: Pavan Balaji <balaji at mcs.anl.gov>

diff --git a/test/mpi/rma/Makefile.am b/test/mpi/rma/Makefile.am
index 1c33a1b..2ccccb0 100644
--- a/test/mpi/rma/Makefile.am
+++ b/test/mpi/rma/Makefile.am
@@ -115,6 +115,7 @@ noinst_PROGRAMS =          \
     pscw_ordering          \
     mutex_bench            \
     mutex_bench_shared     \
+    mutex_bench_shm        \
     rma-contig             \
     badrma
 
@@ -162,6 +163,8 @@ get_accumulate_int_derived_SOURCES      = get_accumulate.c
 mutex_bench_SOURCES              = mutex_bench.c mcs-mutex.c mcs-mutex.h
 mutex_bench_shared_CPPFLAGS      = -DUSE_WIN_SHARED $(AM_CPPFLAGS)
 mutex_bench_shared_SOURCES       = mutex_bench.c mcs-mutex.c mcs-mutex.h
+mutex_bench_shm_CPPFLAGS         = -DUSE_WIN_ALLOC_SHM $(AM_CPPFLAGS)
+mutex_bench_shm_SOURCES          = mutex_bench.c mcs-mutex.c mcs-mutex.h
 
 linked_list_bench_lock_shr_nocheck_SOURCES  = linked_list_bench_lock_shr.c
 linked_list_bench_lock_shr_nocheck_CPPFLAGS = -DUSE_MODE_NOCHECK $(AM_CPPFLAGS)
diff --git a/test/mpi/rma/mcs-mutex.c b/test/mpi/rma/mcs-mutex.c
index 533fbdb..835bddf 100644
--- a/test/mpi/rma/mcs-mutex.c
+++ b/test/mpi/rma/mcs-mutex.c
@@ -42,9 +42,16 @@ int MCS_Mutex_create(int tail_rank, MPI_Comm comm, MCS_Mutex * hdl_out)
     MPI_Win_allocate_shared(2*sizeof(int), sizeof(int), MPI_INFO_NULL,
                             hdl->comm, &hdl->base, &hdl->window);
 #else
+#ifdef USE_WIN_ALLOC_SHM
+    MPI_Info_create(&hdl->win_info);
+    MPI_Info_set(hdl->win_info, "alloc_shm", "true");
+    MPI_Win_allocate(2*sizeof(int), sizeof(int), hdl->win_info, hdl->comm,
+                     &hdl->base, &hdl->window);
+#else
     MPI_Win_allocate(2*sizeof(int), sizeof(int), MPI_INFO_NULL, hdl->comm,
                      &hdl->base, &hdl->window);
 #endif
+#endif
 
     MPI_Win_lock_all(0, hdl->window);
 
@@ -73,6 +80,9 @@ int MCS_Mutex_free(MCS_Mutex * hdl_ptr)
 
     MPI_Win_free(&hdl->window);
     MPI_Comm_free(&hdl->comm);
+#ifdef USE_WIN_ALLOC_SHM
+    MPI_Info_free(&hdl->win_info);
+#endif
 
     free(hdl);
     hdl_ptr = NULL;
diff --git a/test/mpi/rma/mcs-mutex.h b/test/mpi/rma/mcs-mutex.h
index c8d8843..e67fc5d 100644
--- a/test/mpi/rma/mcs-mutex.h
+++ b/test/mpi/rma/mcs-mutex.h
@@ -22,6 +22,7 @@ struct mcs_mutex_s {
     MPI_Comm comm;
     MPI_Win window;
     int *base;
+    MPI_Info win_info;
 };
 
 typedef struct mcs_mutex_s * MCS_Mutex;
diff --git a/test/mpi/rma/testlist b/test/mpi/rma/testlist
index 4da3c0f..522466c 100644
--- a/test/mpi/rma/testlist
+++ b/test/mpi/rma/testlist
@@ -100,5 +100,6 @@ linked_list_bench_lock_shr 4 mpiversion=3.0
 linked_list_bench_lock_shr_nocheck 4 mpiversion=3.0
 mutex_bench 4 mpiversion=3.0
 mutex_bench_shared 4 mpiversion=3.0
+mutex_bench_shm 4 mpiversion=3.0
 rma-contig 4 mpiversion=3.0 timeLimit=600
 badrma 2 mpiversion=3.0

http://git.mpich.org/mpich.git/commitdiff/5467e5004174421ae581dd08253adfe5c86a0000

commit 5467e5004174421ae581dd08253adfe5c86a0000
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Jul 22 12:59:18 2013 -0500

    do SHM_MUTEX_LOCK/UNLOCK in packet handlers on target.
    
    If "alloc_shm" is set, it may happen that the target process is doing a RMA operation
    from a remote process concurrently with a local process is also doing a RMA operation
    on the same target and on overlapping memory location.
    
    Signed-off-by: Pavan Balaji <balaji at mcs.anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_handle_recv_req.c b/src/mpid/ch3/src/ch3u_handle_recv_req.c
index 872cfb8..3425aa5 100644
--- a/src/mpid/ch3/src/ch3u_handle_recv_req.c
+++ b/src/mpid/ch3/src/ch3u_handle_recv_req.c
@@ -142,16 +142,22 @@ int MPIDI_CH3_ReqHandler_PutAccumRespComplete( MPIDI_VC_t *vc,
         rreq->dev.resp_request_handle = MPI_REQUEST_NULL;
     }
 
+    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
+
     if (MPIDI_Request_get_type(rreq) == MPIDI_REQUEST_TYPE_ACCUM_RESP) {
+
+	if (win_ptr->shm_allocated == TRUE)
+	    MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 	/* accumulate data from tmp_buf into user_buf */
 	mpi_errno = do_accumulate_op(rreq);
+	if (win_ptr->shm_allocated == TRUE)
+	    MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+
 	if (mpi_errno) {
 	    MPIU_ERR_POP(mpi_errno);
 	}
     }
     
-    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
-    
     mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
                                                rreq->dev.source_win_handle);
     if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
@@ -430,7 +436,11 @@ int MPIDI_CH3_ReqHandler_SinglePutAccumComplete( MPIDI_VC_t *vc,
 				       lock_queue_entry->pt_single_op->datatype);
 	}
 	else {
+	    if (win_ptr->shm_allocated == TRUE)
+		MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 	    mpi_errno = do_simple_accumulate(lock_queue_entry->pt_single_op);
+	    if (win_ptr->shm_allocated == TRUE)
+		MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
 	}
 	
 	if (mpi_errno) {
@@ -510,12 +520,18 @@ int MPIDI_CH3_ReqHandler_FOPComplete( MPIDI_VC_t *vc,
         MPIU_Memcpy( resp_req->dev.tmpbuf, rreq->dev.real_user_buf, len );
     }
 
+    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
+
     /* Apply the op */
     if (rreq->dev.op != MPI_NO_OP) {
         uop = MPIR_OP_HDL_TO_FN(rreq->dev.op);
         one = 1;
 
+        if (win_ptr->shm_allocated == TRUE)
+            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
         (*uop)(rreq->dev.user_buf, rreq->dev.real_user_buf, &one, &rreq->dev.datatype);
+        if (win_ptr->shm_allocated == TRUE)
+            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
     }
 
     /* Send back the original data.  We do this here to ensure that the
@@ -552,8 +568,6 @@ int MPIDI_CH3_ReqHandler_FOPComplete( MPIDI_VC_t *vc,
     /* There are additional steps to take if this is a passive 
        target RMA or the last operation from the source */
 
-    MPID_Win_get_ptr(rreq->dev.target_win_handle, win_ptr);
-
     mpi_errno = MPIDI_CH3_Finish_rma_op_target(vc, win_ptr, TRUE, rreq->dev.flags,
                                                rreq->dev.source_win_handle);
     if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
@@ -958,7 +972,11 @@ int MPIDI_CH3I_Release_lock(MPID_Win *win_ptr)
 							   single_op->datatype);
 			    }   
 			    else if (single_op->type == MPIDI_RMA_ACCUMULATE) {
+				if (win_ptr->shm_allocated == TRUE)
+				    MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 				mpi_errno = do_simple_accumulate(single_op);
+				if (win_ptr->shm_allocated == TRUE)
+				    MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
 			    }
 			    else if (single_op->type == MPIDI_RMA_GET) {
 				mpi_errno = do_simple_get(win_ptr, lock_queue);
diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index 4eb0834..df097d0 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -3900,6 +3900,8 @@ int MPIDI_CH3_PktHandler_Accumulate_Immed( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
     }
     else {
 	MPIU_INSTR_DURATION_START(rmapkt_acc_immed_op);
+	if (win_ptr->shm_allocated == TRUE)
+	    MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
 	/* Data is already present */
 	if (accum_pkt->op == MPI_REPLACE) {
 	    /* no datatypes required */
@@ -3920,6 +3922,8 @@ int MPIDI_CH3_PktHandler_Accumulate_Immed( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
 				     "**opnotpredefined %d", accum_pkt->op );
 	    }
 	}
+	if (win_ptr->shm_allocated == TRUE)
+	    MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
 	MPIU_INSTR_DURATION_END(rmapkt_acc_immed_op);
 	
 
@@ -3982,8 +3986,20 @@ int MPIDI_CH3_PktHandler_CAS( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
     /* Copy old value into the response packet */
     MPID_Datatype_get_size_macro(cas_pkt->datatype, len);
     MPIU_Assert(len <= sizeof(MPIDI_CH3_CAS_Immed_u));
+
+    if (win_ptr->shm_allocated == TRUE)
+        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
+
     MPIU_Memcpy( (void *)&cas_resp_pkt->data, cas_pkt->addr, len );
 
+    /* Compare and replace if equal */
+    if (MPIR_Compare_equal(&cas_pkt->compare_data, cas_pkt->addr, cas_pkt->datatype)) {
+        MPIU_Memcpy(cas_pkt->addr, &cas_pkt->origin_data, len);
+    }
+
+    if (win_ptr->shm_allocated == TRUE)
+        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+
     /* Send the response packet */
     MPIU_THREAD_CS_ENTER(CH3COMM,vc);
     mpi_errno = MPIDI_CH3_iStartMsg(vc, cas_resp_pkt, sizeof(*cas_resp_pkt), &req);
@@ -3995,10 +4011,6 @@ int MPIDI_CH3_PktHandler_CAS( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
         MPID_Request_release(req);
     }
 
-    /* Compare and replace if equal */
-    if (MPIR_Compare_equal(&cas_pkt->compare_data, cas_pkt->addr, cas_pkt->datatype)) {
-        MPIU_Memcpy(cas_pkt->addr, &cas_pkt->origin_data, len);
-    }
 
     /* There are additional steps to take if this is a passive 
        target RMA or the last operation from the source */

http://git.mpich.org/mpich.git/commitdiff/447c1d5585135a41fd6341dcb9760ba945261f11

commit 447c1d5585135a41fd6341dcb9760ba945261f11
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Fri Jul 26 19:41:01 2013 -0500

    Delete decrementing ref count in SHM RMA operations, but add conditions in operaiton issue routines.
    
    In RMA operation issue routines, judge if shm_allocate == 1 and target vc
    is local, if so, do not add reference count on datatypes, because they will not
    be referenced by the progress engine, but will be completed directly by origin.
    
    Signed-off-by: Pavan Balaji <balaji at mcs.anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_acc_ops.c b/src/mpid/ch3/src/ch3u_rma_acc_ops.c
index 82b1d2d..0253095 100644
--- a/src/mpid/ch3/src/ch3u_rma_acc_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_acc_ops.c
@@ -71,6 +71,7 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
     else {
         MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
         MPIDI_RMA_Op_t *new_ptr = NULL;
+        MPIDI_VC_t *vc = NULL;
 
         /* Append the operation to the window's RMA ops queue */
         MPIU_INSTR_DURATION_START(rmaqueue_alloc);
@@ -97,19 +98,27 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
         new_ptr->op = op;
         MPIU_INSTR_DURATION_END(rmaqueue_set);
 
-        /* if source or target datatypes are derived, increment their
-           reference counts */
-        if (!origin_predefined) {
-            MPID_Datatype_get_ptr(origin_datatype, dtp);
-            MPID_Datatype_add_ref(dtp);
-        }
-        if (!result_predefined) {
-            MPID_Datatype_get_ptr(result_datatype, dtp);
-            MPID_Datatype_add_ref(dtp);
-        }
-        if (!target_predefined) {
-            MPID_Datatype_get_ptr(target_datatype, dtp);
-            MPID_Datatype_add_ref(dtp);
+	/* check if target is local and shared memory is allocated on window,
+	  if so, we do not need to increment reference counts on datatype. This is
+	  because this operation will be directly done on shared memory region, instead
+	  of sending and receiving through the progress engine, therefore datatype
+	  will not be referenced by the progress engine */
+        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &vc);
+        if (!(win_ptr->shm_allocated == TRUE && vc->ch.is_local)) {
+            /* if source or target datatypes are derived, increment their
+               reference counts */
+            if (!origin_predefined) {
+                MPID_Datatype_get_ptr(origin_datatype, dtp);
+                MPID_Datatype_add_ref(dtp);
+            }
+            if (!result_predefined) {
+                MPID_Datatype_get_ptr(result_datatype, dtp);
+                MPID_Datatype_add_ref(dtp);
+            }
+            if (!target_predefined) {
+                MPID_Datatype_get_ptr(target_datatype, dtp);
+                MPID_Datatype_add_ref(dtp);
+            }
         }
     }
 
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index b57283c..b7d6d49 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -190,6 +190,7 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
     {
         MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
         MPIDI_RMA_Op_t *new_ptr = NULL;
+        MPIDI_VC_t *vc = NULL;
 
 	/* queue it up */
         MPIU_INSTR_DURATION_START(rmaqueue_alloc);
@@ -212,20 +213,28 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
 	new_ptr->target_datatype = target_datatype;
 	MPIU_INSTR_DURATION_END(rmaqueue_set);
 
-	/* if source or target datatypes are derived, increment their
-	   reference counts */ 
-	MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, predefined);
-	if (!predefined)
-	{
-	    MPID_Datatype_get_ptr(origin_datatype, dtp);
-	    MPID_Datatype_add_ref(dtp);
-	}
-	MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, predefined);
-	if (!predefined)
-	{
-	    MPID_Datatype_get_ptr(target_datatype, dtp);
-	    MPID_Datatype_add_ref(dtp);
-	}
+	/* check if target is local and shared memory is allocated on window,
+	  if so, we do not need to increment reference counts on datatype. This is
+	  because this operation will be directly done on shared memory region, instead
+	  of sending and receiving through the progress engine, therefore datatype
+	  will not be referenced by the progress engine */
+	MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &vc);
+	if (!(win_ptr->shm_allocated == TRUE && vc->ch.is_local)) {
+	    /* if source or target datatypes are derived, increment their
+	       reference counts */
+	    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, predefined);
+	    if (!predefined)
+	    {
+	        MPID_Datatype_get_ptr(origin_datatype, dtp);
+	        MPID_Datatype_add_ref(dtp);
+	    }
+	    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, predefined);
+	    if (!predefined)
+	    {
+	        MPID_Datatype_get_ptr(target_datatype, dtp);
+	        MPID_Datatype_add_ref(dtp);
+	    }
+        }
     }
 
   fn_exit:
@@ -288,6 +297,7 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
     {
         MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
         MPIDI_RMA_Op_t *new_ptr = NULL;
+        MPIDI_VC_t *vc = NULL;
 
 	/* queue it up */
         MPIU_INSTR_DURATION_START(rmaqueue_alloc);
@@ -307,20 +317,28 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
 	new_ptr->target_datatype = target_datatype;
 	MPIU_INSTR_DURATION_END(rmaqueue_set);
 	
-	/* if source or target datatypes are derived, increment their
-	   reference counts */ 
-	MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, predefined);
-	if (!predefined)
-	{
-	    MPID_Datatype_get_ptr(origin_datatype, dtp);
-	    MPID_Datatype_add_ref(dtp);
-	}
-	MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, predefined);
-	if (!predefined)
-	{
-	    MPID_Datatype_get_ptr(target_datatype, dtp);
-	    MPID_Datatype_add_ref(dtp);
-	}
+	/* check if target is local and shared memory is allocated on window,
+	  if so, we do not need to increment reference counts on datatype. This is
+	  because this operation will be directly done on shared memory region, instead
+	  of sending and receiving through the progress engine, therefore datatype
+	  will not be referenced by the progress engine */
+	MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &vc);
+	if (!(win_ptr->shm_allocated == TRUE && vc->ch.is_local)) {
+	    /* if source or target datatypes are derived, increment their
+	       reference counts */
+	    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, predefined);
+	    if (!predefined)
+	    {
+	        MPID_Datatype_get_ptr(origin_datatype, dtp);
+	        MPID_Datatype_add_ref(dtp);
+	    }
+	    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, predefined);
+	    if (!predefined)
+	    {
+	        MPID_Datatype_get_ptr(target_datatype, dtp);
+	        MPID_Datatype_add_ref(dtp);
+	    }
+        }
     }
 
   fn_exit:
@@ -389,6 +407,7 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
     {
         MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
         MPIDI_RMA_Op_t *new_ptr = NULL;
+        MPIDI_VC_t *vc = NULL;
 
 	/* queue it up */
         MPIU_INSTR_DURATION_START(rmaqueue_alloc);
@@ -429,18 +448,26 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
 	new_ptr->op = op;
 	MPIU_INSTR_DURATION_END(rmaqueue_set);
 	
-	/* if source or target datatypes are derived, increment their
-	   reference counts */ 
-	if (!origin_predefined)
-	{
-	    MPID_Datatype_get_ptr(origin_datatype, dtp);
-	    MPID_Datatype_add_ref(dtp);
-	}
-	if (!target_predefined)
-	{
-	    MPID_Datatype_get_ptr(target_datatype, dtp);
-	    MPID_Datatype_add_ref(dtp);
-	}
+	/* check if target is local and shared memory is allocated on window,
+	  if so, we do not need to increment reference counts on datatype. This is
+	  because this operation will be directly done on shared memory region, instead
+	  of sending and receiving through the progress engine, therefore datatype
+	  will not be referenced by the progress engine */
+	MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &vc);
+	if (!(win_ptr->shm_allocated == TRUE && vc->ch.is_local)) {
+	    /* if source or target datatypes are derived, increment their
+	       reference counts */
+	    if (!origin_predefined)
+	    {
+	        MPID_Datatype_get_ptr(origin_datatype, dtp);
+	        MPID_Datatype_add_ref(dtp);
+	    }
+	    if (!target_predefined)
+	    {
+	        MPID_Datatype_get_ptr(target_datatype, dtp);
+	        MPID_Datatype_add_ref(dtp);
+	    }
+        }
     }
 
  fn_exit:
diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index 4852465..4eb0834 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -208,27 +208,11 @@ static int create_datatype(const MPIDI_RMA_dtype_info *dtype_info,
     do {                                                                                                                    \
     switch ((op_ptr_)->type)                                                                                                \
     {                                                                                                                       \
-        int predefined_;                                                                                                    \
-        MPID_Datatype *dtp_;                                                                                                \
         case (MPIDI_RMA_PUT):                                                                                               \
             (err_) = MPIDI_CH3I_Shm_put_op((op_ptr_)->origin_addr, (op_ptr_)->origin_count, (op_ptr_)->origin_datatype,     \
                                            (op_ptr_)->target_rank, (op_ptr_)->target_disp, (op_ptr_)->target_count,         \
                                            (op_ptr_)->target_datatype, (win_ptr_));                                         \
             if (err_) {MPIU_ERR_POP(err_);}                                                                                 \
-            /* if source or target datatypes are derived, decrement their                                                   \
-               reference counts because they will not be referenced by progress engine */                                   \
-            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->origin_datatype, predefined_);                                     \
-            if (!predefined_)                                                                                               \
-            {                                                                                                               \
-                MPID_Datatype_get_ptr((op_ptr_)->origin_datatype, dtp_);                                                    \
-                MPID_Datatype_release(dtp_);                                                                                \
-            }                                                                                                               \
-            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->target_datatype, predefined_);                                     \
-            if (!predefined_)                                                                                               \
-            {                                                                                                               \
-                MPID_Datatype_get_ptr((op_ptr_)->target_datatype, dtp_);                                                    \
-                MPID_Datatype_release(dtp_);                                                                                \
-            }                                                                                                               \
             break;                                                                                                          \
         case (MPIDI_RMA_ACCUMULATE):                                                                                        \
         case (MPIDI_RMA_ACC_CONTIG):                                                                                        \
@@ -236,20 +220,6 @@ static int create_datatype(const MPIDI_RMA_dtype_info *dtype_info,
                                            (op_ptr_)->target_rank, (op_ptr_)->target_disp, (op_ptr_)->target_count,         \
                                            (op_ptr_)->target_datatype, (op_ptr_)->op, (win_ptr_));                          \
             if (err_) {MPIU_ERR_POP(err_);}                                                                                 \
-            /* if source or target datatypes are derived, decrement their                                                   \
-               reference counts because they will not be referenced by progress engine */                                   \
-            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->origin_datatype, predefined_);                                     \
-            if (!predefined_)                                                                                               \
-            {                                                                                                               \
-                MPID_Datatype_get_ptr((op_ptr_)->origin_datatype, dtp_);                                                    \
-                MPID_Datatype_release(dtp_);                                                                                \
-            }                                                                                                               \
-            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->target_datatype, predefined_);                                     \
-            if (!predefined_)                                                                                               \
-            {                                                                                                               \
-                MPID_Datatype_get_ptr((op_ptr_)->target_datatype, dtp_);                                                    \
-                MPID_Datatype_release(dtp_);                                                                                \
-            }                                                                                                               \
             break;                                                                                                          \
         case (MPIDI_RMA_GET_ACCUMULATE):                                                                                    \
             (err_) = MPIDI_CH3I_Shm_get_acc_op((op_ptr_)->origin_addr, (op_ptr_)->origin_count, (op_ptr_)->origin_datatype, \
@@ -257,49 +227,12 @@ static int create_datatype(const MPIDI_RMA_dtype_info *dtype_info,
                                                (op_ptr_)->target_rank, (op_ptr_)->target_disp, (op_ptr_)->target_count,     \
                                                (op_ptr_)->target_datatype, (op_ptr_)->op, (win_ptr_));                      \
             if (err_) {MPIU_ERR_POP(err_);}                                                                                 \
-            /* if source or target datatypes are derived, decrement their                                                   \
-               reference counts because they will not be referenced by progress engine */                                   \
-            predefined_ = TRUE;                                                                                             \
-            if ((op_ptr_)->op != MPI_NO_OP) {                                                                               \
-                MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->origin_datatype, predefined_);                                 \
-            }                                                                                                               \
-            if (!predefined_)                                                                                               \
-            {                                                                                                               \
-                MPID_Datatype_get_ptr((op_ptr_)->origin_datatype, dtp_);                                                    \
-                MPID_Datatype_release(dtp_);                                                                                \
-            }                                                                                                               \
-            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->result_datatype, predefined_);                                     \
-            if (!predefined_)                                                                                               \
-            {                                                                                                               \
-                MPID_Datatype_get_ptr((op_ptr_)->result_datatype, dtp_);                                                    \
-                MPID_Datatype_release(dtp_);                                                                                \
-            }                                                                                                               \
-            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->target_datatype, predefined_);                                     \
-            if (!predefined_)                                                                                               \
-            {                                                                                                               \
-                MPID_Datatype_get_ptr((op_ptr_)->target_datatype, dtp_);                                                    \
-                MPID_Datatype_release(dtp_);                                                                                \
-            }                                                                                                               \
             break;                                                                                                          \
         case (MPIDI_RMA_GET):                                                                                               \
             (err_) = MPIDI_CH3I_Shm_get_op((op_ptr_)->origin_addr, (op_ptr_)->origin_count, (op_ptr_)->origin_datatype,     \
                                            (op_ptr_)->target_rank, (op_ptr_)->target_disp, (op_ptr_)->target_count,         \
                                            (op_ptr_)->target_datatype, (win_ptr_));                                         \
             if (err_) {MPIU_ERR_POP(err_);}                                                                                 \
-            /* if source or target datatypes are derived, decrement their                                                   \
-               reference counts because they will not be referenced by progress engine */                                   \
-            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->origin_datatype, predefined_);                                     \
-            if (!predefined_)                                                                                               \
-            {                                                                                                               \
-                MPID_Datatype_get_ptr((op_ptr_)->origin_datatype, dtp_);                                                    \
-                MPID_Datatype_release(dtp_);                                                                                \
-            }                                                                                                               \
-            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->target_datatype, predefined_);                                     \
-            if (!predefined_)                                                                                               \
-            {                                                                                                               \
-                MPID_Datatype_get_ptr((op_ptr_)->target_datatype, dtp_);                                                    \
-                MPID_Datatype_release(dtp_);                                                                                \
-            }                                                                                                               \
             break;                                                                                                          \
         case (MPIDI_RMA_COMPARE_AND_SWAP):                                                                                  \
             (err_) = MPIDI_CH3I_Shm_cas_op((op_ptr_)->origin_addr, (op_ptr_)->compare_addr, (op_ptr_)->result_addr,         \

http://git.mpich.org/mpich.git/commitdiff/34dd7c1ad295d223119dbad3d032e38fa7f94c8e

commit 34dd7c1ad295d223119dbad3d032e38fa7f94c8e
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Fri Jul 26 09:10:32 2013 -0500

    In SHM RMA operations, add condition for "alloc_shm".
    
    If shared memory is allocated for window, and target vc is local,
    do SHM RMA operations.
    
    Signed-off-by: Pavan Balaji <balaji at mcs.anl.gov>

diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index ecb59c3..3221c6a 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -325,13 +325,17 @@ static inline int MPIDI_CH3I_Shm_put_op(const void *origin_addr, int origin_coun
                                         MPID_Win *win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
+    MPIDI_VC_t *vc = NULL;
     void *base = NULL;
     int disp_unit;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_PUT_OP);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_PUT_OP);
 
-    if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+    MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &vc);
+
+    if ((win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) ||
+        (win_ptr->shm_allocated == TRUE && vc->ch.is_local)) {
         base = win_ptr->shm_base_addrs[target_rank];
         disp_unit = win_ptr->disp_units[target_rank];
     }
@@ -369,6 +373,7 @@ static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_coun
     int origin_predefined, target_predefined;
     MPI_User_function *uop = NULL;
     MPID_Datatype *dtp;
+    MPIDI_VC_t *vc = NULL;
     int mpi_errno = MPI_SUCCESS;
     MPIU_CHKLMEM_DECL(2);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_ACC_OP);
@@ -378,7 +383,10 @@ static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_coun
     MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, origin_predefined);
     MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, target_predefined);
 
-    if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+    MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &vc);
+
+    if ((win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) ||
+        (win_ptr->shm_allocated == TRUE && vc->ch.is_local)) {
         shm_op = 1;
         base = win_ptr->shm_base_addrs[target_rank];
         disp_unit = win_ptr->disp_units[target_rank];
@@ -521,6 +529,7 @@ static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_
     MPI_User_function *uop = NULL;
     MPID_Datatype *dtp;
     int origin_predefined, result_predefined, target_predefined;
+    MPIDI_VC_t *vc = NULL;
     int mpi_errno = MPI_SUCCESS;
     MPIU_CHKLMEM_DECL(2);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_GET_ACC_OP);
@@ -534,7 +543,10 @@ static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_
     MPIDI_CH3I_DATATYPE_IS_PREDEFINED(result_datatype, result_predefined);
     MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, target_predefined);
 
-    if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+    MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &vc);
+
+    if ((win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) ||
+        (win_ptr->shm_allocated == TRUE && vc->ch.is_local)) {
         base = win_ptr->shm_base_addrs[target_rank];
         disp_unit = win_ptr->disp_units[target_rank];
         MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
@@ -690,12 +702,16 @@ static inline int MPIDI_CH3I_Shm_get_op(void *origin_addr, int origin_count, MPI
 {
     void *base = NULL;
     int disp_unit;
+    MPIDI_VC_t *vc = NULL;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_GET_OP);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_GET_OP);
 
-    if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+    MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &vc);
+
+    if ((win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) ||
+        (win_ptr->shm_allocated == TRUE && vc->ch.is_local)) {
         base = win_ptr->shm_base_addrs[target_rank];
         disp_unit = win_ptr->disp_units[target_rank];
     }
@@ -730,12 +746,16 @@ static inline int MPIDI_CH3I_Shm_cas_op(const void *origin_addr, const void *com
     void *base = NULL, *dest_addr = NULL;
     int disp_unit;
     int len, shm_locked = 0;
+    MPIDI_VC_t *vc = NULL;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_CAS_OP);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_CAS_OP);
 
-    if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+    MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &vc);
+
+    if ((win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) ||
+        (win_ptr->shm_allocated == TRUE && vc->ch.is_local)) {
         base = win_ptr->shm_base_addrs[target_rank];
         disp_unit = win_ptr->disp_units[target_rank];
 
@@ -786,12 +806,16 @@ static inline int MPIDI_CH3I_Shm_fop_op(const void *origin_addr, void *result_ad
     MPI_User_function *uop = NULL;
     int disp_unit;
     int len, one, shm_locked = 0;
+    MPIDI_VC_t *vc = NULL;
     int mpi_errno = MPI_SUCCESS;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_FOP_OP);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_FOP_OP);
 
-    if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+    MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &vc);
+
+    if ((win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) ||
+        (win_ptr->shm_allocated == TRUE && vc->ch.is_local)) {
         base = win_ptr->shm_base_addrs[target_rank];
         disp_unit = win_ptr->disp_units[target_rank];
 

http://git.mpich.org/mpich.git/commitdiff/55cb0f4c1cdc5c1e7017d487bc2f13871dcf766e

commit 55cb0f4c1cdc5c1e7017d487bc2f13871dcf766e
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Fri Jul 26 08:56:28 2013 -0500

    Call DO_SHM_RMA_OP in fence/unlock/complete.
    
    decrement datatype reference counts in SHM RMA operations.
    because they will not be referenced by the progress engine, but be
    completed directly by origin.
    
    Signed-off-by: Pavan Balaji <balaji at mcs.anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_rma_sync.c b/src/mpid/ch3/src/ch3u_rma_sync.c
index f05a5bb..4852465 100644
--- a/src/mpid/ch3/src/ch3u_rma_sync.c
+++ b/src/mpid/ch3/src/ch3u_rma_sync.c
@@ -202,6 +202,122 @@ static int create_datatype(const MPIDI_RMA_dtype_info *dtype_info,
     } while (0)
 
 
+/* Perform RMA operation asynchronously if window of processes on the same node
+   is allocated on shared memory */
+#define MPIDI_CH3I_DO_SHM_OP(op_ptr_, win_ptr_, err_)                                                                       \
+    do {                                                                                                                    \
+    switch ((op_ptr_)->type)                                                                                                \
+    {                                                                                                                       \
+        int predefined_;                                                                                                    \
+        MPID_Datatype *dtp_;                                                                                                \
+        case (MPIDI_RMA_PUT):                                                                                               \
+            (err_) = MPIDI_CH3I_Shm_put_op((op_ptr_)->origin_addr, (op_ptr_)->origin_count, (op_ptr_)->origin_datatype,     \
+                                           (op_ptr_)->target_rank, (op_ptr_)->target_disp, (op_ptr_)->target_count,         \
+                                           (op_ptr_)->target_datatype, (win_ptr_));                                         \
+            if (err_) {MPIU_ERR_POP(err_);}                                                                                 \
+            /* if source or target datatypes are derived, decrement their                                                   \
+               reference counts because they will not be referenced by progress engine */                                   \
+            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->origin_datatype, predefined_);                                     \
+            if (!predefined_)                                                                                               \
+            {                                                                                                               \
+                MPID_Datatype_get_ptr((op_ptr_)->origin_datatype, dtp_);                                                    \
+                MPID_Datatype_release(dtp_);                                                                                \
+            }                                                                                                               \
+            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->target_datatype, predefined_);                                     \
+            if (!predefined_)                                                                                               \
+            {                                                                                                               \
+                MPID_Datatype_get_ptr((op_ptr_)->target_datatype, dtp_);                                                    \
+                MPID_Datatype_release(dtp_);                                                                                \
+            }                                                                                                               \
+            break;                                                                                                          \
+        case (MPIDI_RMA_ACCUMULATE):                                                                                        \
+        case (MPIDI_RMA_ACC_CONTIG):                                                                                        \
+            (err_) = MPIDI_CH3I_Shm_acc_op((op_ptr_)->origin_addr, (op_ptr_)->origin_count, (op_ptr_)->origin_datatype,     \
+                                           (op_ptr_)->target_rank, (op_ptr_)->target_disp, (op_ptr_)->target_count,         \
+                                           (op_ptr_)->target_datatype, (op_ptr_)->op, (win_ptr_));                          \
+            if (err_) {MPIU_ERR_POP(err_);}                                                                                 \
+            /* if source or target datatypes are derived, decrement their                                                   \
+               reference counts because they will not be referenced by progress engine */                                   \
+            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->origin_datatype, predefined_);                                     \
+            if (!predefined_)                                                                                               \
+            {                                                                                                               \
+                MPID_Datatype_get_ptr((op_ptr_)->origin_datatype, dtp_);                                                    \
+                MPID_Datatype_release(dtp_);                                                                                \
+            }                                                                                                               \
+            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->target_datatype, predefined_);                                     \
+            if (!predefined_)                                                                                               \
+            {                                                                                                               \
+                MPID_Datatype_get_ptr((op_ptr_)->target_datatype, dtp_);                                                    \
+                MPID_Datatype_release(dtp_);                                                                                \
+            }                                                                                                               \
+            break;                                                                                                          \
+        case (MPIDI_RMA_GET_ACCUMULATE):                                                                                    \
+            (err_) = MPIDI_CH3I_Shm_get_acc_op((op_ptr_)->origin_addr, (op_ptr_)->origin_count, (op_ptr_)->origin_datatype, \
+                                               (op_ptr_)->result_addr, (op_ptr_)->result_count, (op_ptr_)->result_datatype, \
+                                               (op_ptr_)->target_rank, (op_ptr_)->target_disp, (op_ptr_)->target_count,     \
+                                               (op_ptr_)->target_datatype, (op_ptr_)->op, (win_ptr_));                      \
+            if (err_) {MPIU_ERR_POP(err_);}                                                                                 \
+            /* if source or target datatypes are derived, decrement their                                                   \
+               reference counts because they will not be referenced by progress engine */                                   \
+            predefined_ = TRUE;                                                                                             \
+            if ((op_ptr_)->op != MPI_NO_OP) {                                                                               \
+                MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->origin_datatype, predefined_);                                 \
+            }                                                                                                               \
+            if (!predefined_)                                                                                               \
+            {                                                                                                               \
+                MPID_Datatype_get_ptr((op_ptr_)->origin_datatype, dtp_);                                                    \
+                MPID_Datatype_release(dtp_);                                                                                \
+            }                                                                                                               \
+            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->result_datatype, predefined_);                                     \
+            if (!predefined_)                                                                                               \
+            {                                                                                                               \
+                MPID_Datatype_get_ptr((op_ptr_)->result_datatype, dtp_);                                                    \
+                MPID_Datatype_release(dtp_);                                                                                \
+            }                                                                                                               \
+            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->target_datatype, predefined_);                                     \
+            if (!predefined_)                                                                                               \
+            {                                                                                                               \
+                MPID_Datatype_get_ptr((op_ptr_)->target_datatype, dtp_);                                                    \
+                MPID_Datatype_release(dtp_);                                                                                \
+            }                                                                                                               \
+            break;                                                                                                          \
+        case (MPIDI_RMA_GET):                                                                                               \
+            (err_) = MPIDI_CH3I_Shm_get_op((op_ptr_)->origin_addr, (op_ptr_)->origin_count, (op_ptr_)->origin_datatype,     \
+                                           (op_ptr_)->target_rank, (op_ptr_)->target_disp, (op_ptr_)->target_count,         \
+                                           (op_ptr_)->target_datatype, (win_ptr_));                                         \
+            if (err_) {MPIU_ERR_POP(err_);}                                                                                 \
+            /* if source or target datatypes are derived, decrement their                                                   \
+               reference counts because they will not be referenced by progress engine */                                   \
+            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->origin_datatype, predefined_);                                     \
+            if (!predefined_)                                                                                               \
+            {                                                                                                               \
+                MPID_Datatype_get_ptr((op_ptr_)->origin_datatype, dtp_);                                                    \
+                MPID_Datatype_release(dtp_);                                                                                \
+            }                                                                                                               \
+            MPIDI_CH3I_DATATYPE_IS_PREDEFINED((op_ptr_)->target_datatype, predefined_);                                     \
+            if (!predefined_)                                                                                               \
+            {                                                                                                               \
+                MPID_Datatype_get_ptr((op_ptr_)->target_datatype, dtp_);                                                    \
+                MPID_Datatype_release(dtp_);                                                                                \
+            }                                                                                                               \
+            break;                                                                                                          \
+        case (MPIDI_RMA_COMPARE_AND_SWAP):                                                                                  \
+            (err_) = MPIDI_CH3I_Shm_cas_op((op_ptr_)->origin_addr, (op_ptr_)->compare_addr, (op_ptr_)->result_addr,         \
+                                           (op_ptr_)->origin_datatype, (op_ptr_)->target_rank, (op_ptr_)->target_disp,      \
+                                           (win_ptr_));                                                                     \
+            if (err_) {MPIU_ERR_POP(err_);}                                                                                 \
+            break;                                                                                                          \
+        case (MPIDI_RMA_FETCH_AND_OP):                                                                                      \
+            (err_) = MPIDI_CH3I_Shm_fop_op((op_ptr_)->origin_addr, (op_ptr_)->result_addr, (op_ptr_)->origin_datatype,      \
+                                           (op_ptr_)->target_rank, (op_ptr_)->target_disp, (op_ptr_)->op, (win_ptr_));      \
+            if (err_) {MPIU_ERR_POP(err_);}                                                                                 \
+            break;                                                                                                          \
+        default:                                                                                                            \
+            MPIU_ERR_SETANDJUMP(err_,MPI_ERR_OTHER,"**winInvalidOp");                                                       \
+    }                                                                                                                       \
+    } while (0)
+
+
 #undef FUNCNAME
 #define FUNCNAME MPIDI_Win_fence
 #undef FCNAME
@@ -309,9 +425,13 @@ int MPIDI_Win_fence(int assert, MPID_Win *win_ptr)
         curr_ptr = MPIDI_CH3I_RMA_Ops_head(ops_list);
 	while (curr_ptr != NULL)
 	{
-	    total_op_count++;
-	    rma_target_proc[curr_ptr->target_rank] = 1;
-	    nops_to_proc[curr_ptr->target_rank]++;
+	    MPIDI_VC_t *vc = NULL;
+	    MPIDI_Comm_get_vc(win_ptr->comm_ptr, curr_ptr->target_rank, &vc);
+	    if (!(win_ptr->shm_allocated == TRUE && vc->ch.is_local)) {
+		total_op_count++;
+		rma_target_proc[curr_ptr->target_rank] = 1;
+		nops_to_proc[curr_ptr->target_rank]++;
+	    }
 	    curr_ptr = curr_ptr->next;
 	}
 	
@@ -347,6 +467,14 @@ int MPIDI_Win_fence(int assert, MPID_Win *win_ptr)
         curr_ptr = MPIDI_CH3I_RMA_Ops_head(ops_list);
 	while (curr_ptr != NULL)
 	{
+          MPIDI_VC_t *vc = NULL;
+          MPIDI_Comm_get_vc(win_ptr->comm_ptr, curr_ptr->target_rank, &vc);
+          if (win_ptr->shm_allocated == TRUE && vc->ch.is_local) {
+            MPIDI_CH3I_DO_SHM_OP(curr_ptr, win_ptr, mpi_errno);
+            MPIDI_CH3I_RMA_Ops_free_and_next(ops_list, &curr_ptr);
+          }
+          else {
+
             MPIDI_CH3_Pkt_flags_t flags = MPIDI_CH3_PKT_FLAG_NONE;
 
 	    /* The completion counter at the target is decremented only on 
@@ -388,6 +516,7 @@ int MPIDI_Win_fence(int assert, MPID_Win *win_ptr)
 		    nRequestNew = nRequest;
 		}
 	    }
+	  } /* end of else */
 	}
 	MPIU_INSTR_DURATION_END(winfence_issue);
 
@@ -1628,8 +1757,12 @@ int MPIDI_Win_complete(MPID_Win *win_ptr)
     curr_ptr = MPIDI_CH3I_RMA_Ops_head(ops_list);
     while (curr_ptr != NULL)
     {
-	nops_to_proc[curr_ptr->target_rank]++;
-	total_op_count++;
+	MPIDI_VC_t *vc = NULL;
+	MPIDI_Comm_get_vc(win_ptr->comm_ptr, curr_ptr->target_rank, &vc);
+	if (!(win_ptr->shm_allocated == TRUE && vc->ch.is_local)) {
+	    nops_to_proc[curr_ptr->target_rank]++;
+	    total_op_count++;
+	}
 	curr_ptr = curr_ptr->next;
     }
 
@@ -1648,6 +1781,14 @@ int MPIDI_Win_complete(MPID_Win *win_ptr)
     curr_ptr = MPIDI_CH3I_RMA_Ops_head(ops_list);
     while (curr_ptr != NULL)
     {
+      MPIDI_VC_t *vc = NULL;
+      MPIDI_Comm_get_vc(win_ptr->comm_ptr, curr_ptr->target_rank, &vc);
+      if (win_ptr->shm_allocated == TRUE && vc->ch.is_local) {
+        MPIDI_CH3I_DO_SHM_OP(curr_ptr, win_ptr, mpi_errno);
+        MPIDI_CH3I_RMA_Ops_free_and_next(ops_list, &curr_ptr);
+      }
+      else {
+
         MPIDI_CH3_Pkt_flags_t flags = MPIDI_CH3_PKT_FLAG_NONE;
 
 	/* The completion counter at the target is decremented only on 
@@ -1683,6 +1824,7 @@ int MPIDI_Win_complete(MPID_Win *win_ptr)
 		nRequestNew = nRequest;
 	    }
 	}
+      }  /* end of else */
     }
     MPIU_INSTR_DURATION_END(wincomplete_issue);
         
@@ -2509,6 +2651,7 @@ static int MPIDI_CH3I_Do_passive_target_rma(MPID_Win *win_ptr, int target_rank,
     MPIDI_RMA_Op_t *curr_ptr;
     MPI_Win source_win_handle = MPI_WIN_NULL, target_win_handle = MPI_WIN_NULL;
     int nRequest=0, nRequestNew=0;
+    MPIDI_VC_t *vc = NULL;
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_DO_PASSIVE_TARGET_RMA);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_DO_PASSIVE_TARGET_RMA);
@@ -2518,6 +2661,20 @@ static int MPIDI_CH3I_Do_passive_target_rma(MPID_Win *win_ptr, int target_rank,
                 (win_ptr->targets[target_rank].remote_lock_state == MPIDI_CH3_WIN_LOCK_CALLED &&
                  win_ptr->targets[target_rank].remote_lock_assert & MPI_MODE_NOCHECK));
 
+    /* if alloc_shm is enabled and target process is on the same node,
+       directly perform RMA operations at the origin side and remove them
+       from passive RMA operation list */
+
+    MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &vc);
+    if (win_ptr->shm_allocated == TRUE && vc->ch.is_local) {
+        curr_ptr = MPIDI_CH3I_RMA_Ops_head(&win_ptr->targets[target_rank].rma_ops_list);
+        while (curr_ptr != NULL) {
+            MPIU_Assert(curr_ptr->target_rank == target_rank);
+            MPIDI_CH3I_DO_SHM_OP(curr_ptr, win_ptr, mpi_errno);
+            MPIDI_CH3I_RMA_Ops_free_and_next(&win_ptr->targets[target_rank].rma_ops_list, &curr_ptr);
+        }
+    }
+
     if (win_ptr->targets[target_rank].remote_lock_mode == MPI_LOCK_EXCLUSIVE &&
         win_ptr->targets[target_rank].remote_lock_state != MPIDI_CH3_WIN_LOCK_CALLED) {
         /* Exclusive lock already held -- no need to wait for rma done pkt at

http://git.mpich.org/mpich.git/commitdiff/b668bc1ed3168790d8361ff14e1976581349cb81

commit b668bc1ed3168790d8361ff14e1976581349cb81
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Jul 22 10:18:29 2013 -0500

    add "alloc_shm" info to MPIDI_Win_get_info.
    
    Signed-off-by: Pavan Balaji <balaji at mcs.anl.gov>

diff --git a/src/mpid/ch3/src/ch3u_win_fns.c b/src/mpid/ch3/src/ch3u_win_fns.c
index b44bae2..ef7d84c 100644
--- a/src/mpid/ch3/src/ch3u_win_fns.c
+++ b/src/mpid/ch3/src/ch3u_win_fns.c
@@ -382,6 +382,13 @@ int MPIDI_Win_get_info(MPID_Win *win, MPID_Info **info_used)
 
     if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
 
+    if (win->info_args.alloc_shm == TRUE)
+        mpi_errno = MPIR_Info_set_impl(*info_used, "alloc_shm", "true");
+    else
+        mpi_errno = MPIR_Info_set_impl(*info_used, "alloc_shm", "false");
+
+    if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
+
     if (win->create_flavor == MPI_WIN_FLAVOR_SHARED) {
         if (win->info_args.alloc_shared_noncontig)
             mpi_errno = MPIR_Info_set_impl(*info_used, "alloc_shared_noncontig", "true");

http://git.mpich.org/mpich.git/commitdiff/384d96b7b7f193963a178e67ebb64d750751f3cc

commit 384d96b7b7f193963a178e67ebb64d750751f3cc
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Jul 22 10:17:30 2013 -0500

    Add "alloc_shm" info to MPI_Win_allocate.
    
    Add "alloc_shm" to window's info arguments and initialize it to FALSE.
    In MPID_Win_allocate, if "alloc_shm" is set to true, call ALLOCATE_SHARED,
    otherwise call ALLOCATE.
    
    Free window memory only when SHM region is not allocated, therwise it is
    already freed in MPIDI_CH3I_SHM_Win_free.
    
    Signed-off-by: Pavan Balaji <balaji at mcs.anl.gov>

diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h
index 35ddac4..8210c97 100644
--- a/src/mpid/ch3/include/mpidpre.h
+++ b/src/mpid/ch3/include/mpidpre.h
@@ -244,6 +244,7 @@ struct MPIDI_Win_info_args {
     int accumulate_ops;
     int same_size;              /* valid flavor = allocate */
     int alloc_shared_noncontig; /* valid flavor = allocate shared */
+    int alloc_shm;              /* valid flavor = allocate */
 };
 
 struct MPIDI_RMA_op;            /* forward decl from mpidrma.h */
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 5f6fb4c..b57283c 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -67,7 +67,8 @@ int MPIDI_Win_free(MPID_Win **win_ptr)
 
     /* Free the attached buffer for windows created with MPI_Win_allocate() */
     if ((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE && (*win_ptr)->size > 0) {
-      MPIU_Free((*win_ptr)->base);
+        if ((*win_ptr)->shm_allocated != TRUE)
+            MPIU_Free((*win_ptr)->base);
     }
 
     MPIU_Object_release_ref(*win_ptr, &in_use);
diff --git a/src/mpid/ch3/src/mpid_rma.c b/src/mpid/ch3/src/mpid_rma.c
index 48c27e8..d330933 100644
--- a/src/mpid/ch3/src/mpid_rma.c
+++ b/src/mpid/ch3/src/mpid_rma.c
@@ -109,8 +109,22 @@ int MPID_Win_allocate(MPI_Aint size, int disp_unit, MPID_Info *info,
     mpi_errno = win_init(size, disp_unit, MPI_WIN_FLAVOR_ALLOCATE, MPI_WIN_UNIFIED, comm_ptr, win_ptr);
     if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
 
-    mpi_errno = MPIDI_CH3U_Win_fns.allocate(size, disp_unit, info, comm_ptr, baseptr, win_ptr);
-    if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
+    if (info != NULL) {
+        int alloc_shm_flag = 0;
+        char shm_alloc_value[MPI_MAX_INFO_VAL+1];
+        MPIR_Info_get_impl(info, "alloc_shm", MPI_MAX_INFO_VAL, shm_alloc_value, &alloc_shm_flag);
+        if ((alloc_shm_flag == 1) && (!strncmp(shm_alloc_value, "true", sizeof("true"))))
+            (*win_ptr)->info_args.alloc_shm = TRUE;
+    }
+
+    if((*win_ptr)->info_args.alloc_shm == TRUE) {
+        mpi_errno = MPIDI_CH3U_Win_fns.allocate_shared(size, disp_unit, info, comm_ptr, baseptr, win_ptr);
+        if (mpi_errno != MPI_SUCCESS) MPIU_ERR_POP(mpi_errno);
+    }
+    else {
+        mpi_errno = MPIDI_CH3U_Win_fns.allocate(size, disp_unit, info, comm_ptr, baseptr, win_ptr);
+        if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
+    }
 
  fn_fail:
     MPIDI_FUNC_EXIT(MPID_STATE_MPID_WIN_ALLOCATE);
@@ -298,6 +312,7 @@ static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
     (*win_ptr)->info_args.accumulate_ops      = MPIDI_ACC_OPS_SAME_OP_NO_OP;
     (*win_ptr)->info_args.same_size           = 0;
     (*win_ptr)->info_args.alloc_shared_noncontig = 0;
+    (*win_ptr)->info_args.alloc_shm = FALSE;
 
     MPID_WIN_FTABLE_SET_DEFAULTS(win_ptr);
 

http://git.mpich.org/mpich.git/commitdiff/44d9c98dc934631af208c62113703c135bd86064

commit 44d9c98dc934631af208c62113703c135bd86064
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Jul 22 10:08:58 2013 -0500

    Refactoring the code of SHM RMA operations from operation routines to inline functions.
    
    The code in inline functions is moved from operation routines in
    ch3u_rma_ops.c and ch3u_rma_acc_ops.c. By moving them in inline
    functions, both operation routines and synchronization routines can call them.
    
    Signed-off-by: Pavan Balaji <balaji at mcs.anl.gov>

diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index 8abc78d..ecb59c3 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -7,6 +7,7 @@
 #define MPICH_MPIDRMA_H_INCLUDED
 
 #include "mpl_utlist.h"
+#include "mpidi_ch3_impl.h"
 
 #ifdef USE_MPIU_INSTR
 MPIU_INSTR_DURATION_EXTERN_DECL(wincreate_allgather);
@@ -308,6 +309,528 @@ static inline MPIDI_RMA_Ops_list_t *MPIDI_CH3I_RMA_Get_ops_list(MPID_Win *win_pt
 }
 
 
+/* ------------------------------------------------------------------------ */
+/*
+ * Followings are new routines for origin completion for RMA operations.
+ */
+/* ------------------------------------------------------------------------ */
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Shm_put_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_Shm_put_op(const void *origin_addr, int origin_count, MPI_Datatype
+                                        origin_datatype, int target_rank, MPI_Aint target_disp,
+                                        int target_count, MPI_Datatype target_datatype,
+                                        MPID_Win *win_ptr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    void *base = NULL;
+    int disp_unit;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_PUT_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_PUT_OP);
+
+    if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+        base = win_ptr->shm_base_addrs[target_rank];
+        disp_unit = win_ptr->disp_units[target_rank];
+    }
+    else {
+        base = win_ptr->base;
+        disp_unit = win_ptr->disp_unit;
+    }
+
+    mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype,
+                               (char *) base + disp_unit * target_disp,
+                               target_count, target_datatype);
+    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+
+ fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SHM_PUT_OP);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+ fn_fail:
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Shm_acc_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_count, MPI_Datatype
+                                        origin_datatype, int target_rank, MPI_Aint target_disp,
+                                        int target_count, MPI_Datatype target_datatype, MPI_Op op,
+                                        MPID_Win *win_ptr)
+{
+    void *base = NULL;
+    int disp_unit, shm_op = 0;
+    int origin_predefined, target_predefined;
+    MPI_User_function *uop = NULL;
+    MPID_Datatype *dtp;
+    int mpi_errno = MPI_SUCCESS;
+    MPIU_CHKLMEM_DECL(2);
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_ACC_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_ACC_OP);
+
+    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, origin_predefined);
+    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, target_predefined);
+
+    if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+        shm_op = 1;
+        base = win_ptr->shm_base_addrs[target_rank];
+        disp_unit = win_ptr->disp_units[target_rank];
+    }
+    else {
+        base = win_ptr->base;
+        disp_unit = win_ptr->disp_unit;
+    }
+
+    if (op == MPI_REPLACE)
+    {
+        if (shm_op) MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
+        mpi_errno = MPIR_Localcopy(origin_addr, origin_count,
+                                   origin_datatype,
+                                   (char *) base + disp_unit * target_disp,
+                                   target_count, target_datatype);
+        if (shm_op) MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+        goto fn_exit;
+    }
+
+    MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN),
+                         mpi_errno, MPI_ERR_OP, "**opnotpredefined",
+                         "**opnotpredefined %d", op );
+
+    /* get the function by indexing into the op table */
+    uop = MPIR_OP_HDL_TO_FN(op);
+
+    if (origin_predefined && target_predefined)
+    {
+        /* Cast away const'ness for origin_address in order to
+         * avoid changing the prototype for MPI_User_function */
+        if (shm_op) MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
+        (*uop)((void *) origin_addr, (char *) base + disp_unit*target_disp,
+               &target_count, &target_datatype);
+        if (shm_op) MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+    }
+    else
+    {
+        /* derived datatype */
+
+        MPID_Segment *segp;
+        DLOOP_VECTOR *dloop_vec;
+        MPI_Aint first, last;
+        int vec_len, i, type_size, count;
+        MPI_Datatype type;
+        MPI_Aint true_lb, true_extent, extent;
+        void *tmp_buf=NULL, *target_buf;
+        const void *source_buf;
+
+        if (origin_datatype != target_datatype)
+        {
+            /* first copy the data into a temporary buffer with
+               the same datatype as the target. Then do the
+               accumulate operation. */
+
+            MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent);
+            MPID_Datatype_get_extent_macro(target_datatype, extent);
+
+            MPIU_CHKLMEM_MALLOC(tmp_buf, void *,
+                                target_count * (MPIR_MAX(extent,true_extent)),
+                                mpi_errno, "temporary buffer");
+            /* adjust for potential negative lower bound in datatype */
+            tmp_buf = (void *)((char*)tmp_buf - true_lb);
+
+            mpi_errno = MPIR_Localcopy(origin_addr, origin_count,
+                                       origin_datatype, tmp_buf,
+                                       target_count, target_datatype);
+            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+        }
+
+        if (target_predefined) {
+            /* target predefined type, origin derived datatype */
+
+            if (shm_op) MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
+            (*uop)(tmp_buf, (char *) base + disp_unit * target_disp,
+                   &target_count, &target_datatype);
+            if (shm_op) MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+        }
+        else {
+
+            segp = MPID_Segment_alloc();
+            MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER,
+                                 "**nomem","**nomem %s","MPID_Segment_alloc");
+            MPID_Segment_init(NULL, target_count, target_datatype, segp, 0);
+            first = 0;
+            last  = SEGMENT_IGNORE_LAST;
+
+            MPID_Datatype_get_ptr(target_datatype, dtp);
+            vec_len = dtp->max_contig_blocks * target_count + 1;
+            /* +1 needed because Rob says so */
+            MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *,
+                                vec_len * sizeof(DLOOP_VECTOR),
+                                mpi_errno, "dloop vector");
+
+            MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);
+
+            source_buf = (tmp_buf != NULL) ? tmp_buf : origin_addr;
+            target_buf = (char *) base + disp_unit * target_disp;
+            type = dtp->eltype;
+            type_size = MPID_Datatype_get_basic_size(type);
+            if (shm_op) MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
+            for (i=0; i<vec_len; i++)
+            {
+                count = (dloop_vec[i].DLOOP_VECTOR_LEN)/type_size;
+                (*uop)((char *)source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
+                       (char *)target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
+                       &count, &type);
+            }
+            if (shm_op) MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+
+            MPID_Segment_free(segp);
+        }
+    }
+
+ fn_exit:
+    MPIU_CHKLMEM_FREEALL();
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SHM_ACC_OP);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+ fn_fail:
+    if (shm_op) MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Shm_get_acc_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_count, MPI_Datatype
+                                            origin_datatype, void *result_addr, int result_count,
+                                            MPI_Datatype result_datatype, int target_rank, MPI_Aint
+                                            target_disp, int target_count, MPI_Datatype target_datatype,
+                                            MPI_Op op, MPID_Win *win_ptr)
+{
+    int disp_unit, shm_locked = 0;
+    void *base = NULL;
+    MPI_User_function *uop = NULL;
+    MPID_Datatype *dtp;
+    int origin_predefined, result_predefined, target_predefined;
+    int mpi_errno = MPI_SUCCESS;
+    MPIU_CHKLMEM_DECL(2);
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_GET_ACC_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_GET_ACC_OP);
+
+    origin_predefined = TRUE; /* quiet uninitialized warnings (b/c goto) */
+    if (op != MPI_NO_OP) {
+        MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, origin_predefined);
+    }
+    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(result_datatype, result_predefined);
+    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, target_predefined);
+
+    if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+        base = win_ptr->shm_base_addrs[target_rank];
+        disp_unit = win_ptr->disp_units[target_rank];
+        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
+        shm_locked = 1;
+    }
+    else {
+        base = win_ptr->base;
+        disp_unit = win_ptr->disp_unit;
+    }
+
+    /* Perform the local get first, then the accumulate */
+    mpi_errno = MPIR_Localcopy((char *) base + disp_unit * target_disp,
+                               target_count, target_datatype,
+                               result_addr, result_count, result_datatype);
+    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+
+    /* NO_OP: Don't perform the accumulate */
+    if (op == MPI_NO_OP) {
+        if (shm_locked) {
+            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+            shm_locked = 0;
+        }
+
+        goto fn_exit;
+    }
+
+    if (op == MPI_REPLACE) {
+        mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype,
+                                   (char *) base + disp_unit * target_disp,
+                                   target_count, target_datatype);
+
+        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+
+        if (shm_locked) {
+            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+            shm_locked = 0;
+        }
+
+        goto fn_exit;
+    }
+
+    MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN),
+                          mpi_errno, MPI_ERR_OP, "**opnotpredefined",
+                          "**opnotpredefined %d", op );
+
+    /* get the function by indexing into the op table */
+    uop = MPIR_OP_HDL_TO_FN(op);
+
+    if (origin_predefined && target_predefined) {
+        /* Cast away const'ness for origin_address in order to
+         * avoid changing the prototype for MPI_User_function */
+        (*uop)((void *) origin_addr, (char *) base + disp_unit*target_disp,
+               &target_count, &target_datatype);
+    }
+    else {
+        /* derived datatype */
+
+        MPID_Segment *segp;
+        DLOOP_VECTOR *dloop_vec;
+        MPI_Aint first, last;
+        int vec_len, i, type_size, count;
+        MPI_Datatype type;
+        MPI_Aint true_lb, true_extent, extent;
+        void *tmp_buf=NULL, *target_buf;
+        const void *source_buf;
+
+        if (origin_datatype != target_datatype) {
+            /* first copy the data into a temporary buffer with
+               the same datatype as the target. Then do the
+               accumulate operation. */
+
+            MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent);
+            MPID_Datatype_get_extent_macro(target_datatype, extent);
+
+            MPIU_CHKLMEM_MALLOC(tmp_buf, void *,
+                                target_count * (MPIR_MAX(extent,true_extent)),
+                                mpi_errno, "temporary buffer");
+            /* adjust for potential negative lower bound in datatype */
+            tmp_buf = (void *)((char*)tmp_buf - true_lb);
+
+            mpi_errno = MPIR_Localcopy(origin_addr, origin_count,
+                                       origin_datatype, tmp_buf,
+                                       target_count, target_datatype);
+            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+        }
+
+        if (target_predefined) {
+            /* target predefined type, origin derived datatype */
+
+            (*uop)(tmp_buf, (char *) base + disp_unit * target_disp,
+                   &target_count, &target_datatype);
+        }
+        else {
+
+            segp = MPID_Segment_alloc();
+            MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER,
+                                 "**nomem","**nomem %s","MPID_Segment_alloc");
+            MPID_Segment_init(NULL, target_count, target_datatype, segp, 0);
+            first = 0;
+            last  = SEGMENT_IGNORE_LAST;
+
+            MPID_Datatype_get_ptr(target_datatype, dtp);
+            vec_len = dtp->max_contig_blocks * target_count + 1;
+            /* +1 needed because Rob says so */
+            MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *,
+                                vec_len * sizeof(DLOOP_VECTOR),
+                                mpi_errno, "dloop vector");
+
+            MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);
+
+            source_buf = (tmp_buf != NULL) ? tmp_buf : origin_addr;
+            target_buf = (char *) base + disp_unit * target_disp;
+            type = dtp->eltype;
+            type_size = MPID_Datatype_get_basic_size(type);
+
+            for (i=0; i<vec_len; i++) {
+                count = (dloop_vec[i].DLOOP_VECTOR_LEN)/type_size;
+                (*uop)((char *)source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
+                       (char *)target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
+                       &count, &type);
+            }
+
+            MPID_Segment_free(segp);
+        }
+    }
+
+    if (shm_locked) {
+        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+        shm_locked = 0;
+    }
+
+ fn_exit:
+    MPIU_CHKLMEM_FREEALL();
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SHM_GET_ACC_OP);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+ fn_fail:
+    if (shm_locked) {
+        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+    }
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Shm_get_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_Shm_get_op(void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+                                        int target_rank, MPI_Aint target_disp, int target_count,
+                                        MPI_Datatype target_datatype, MPID_Win *win_ptr)
+{
+    void *base = NULL;
+    int disp_unit;
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_GET_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_GET_OP);
+
+    if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+        base = win_ptr->shm_base_addrs[target_rank];
+        disp_unit = win_ptr->disp_units[target_rank];
+    }
+    else {
+        base = win_ptr->base;
+        disp_unit = win_ptr->disp_unit;
+    }
+
+    mpi_errno = MPIR_Localcopy((char *) base + disp_unit * target_disp,
+                               target_count, target_datatype, origin_addr,
+                               origin_count, origin_datatype);
+    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+
+ fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SHM_GET_OP);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+ fn_fail:
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Shm_cas_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_Shm_cas_op(const void *origin_addr, const void *compare_addr,
+                                        void *result_addr, MPI_Datatype datatype, int target_rank,
+                                        MPI_Aint target_disp, MPID_Win *win_ptr)
+{
+    void *base = NULL, *dest_addr = NULL;
+    int disp_unit;
+    int len, shm_locked = 0;
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_CAS_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_CAS_OP);
+
+    if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+        base = win_ptr->shm_base_addrs[target_rank];
+        disp_unit = win_ptr->disp_units[target_rank];
+
+        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
+        shm_locked = 1;
+    }
+    else {
+        base = win_ptr->base;
+        disp_unit = win_ptr->disp_unit;
+    }
+
+    dest_addr = (char *) base + disp_unit * target_disp;
+
+    MPID_Datatype_get_size_macro(datatype, len);
+    MPIU_Memcpy(result_addr, dest_addr, len);
+
+    if (MPIR_Compare_equal(compare_addr, dest_addr, datatype)) {
+        MPIU_Memcpy(dest_addr, origin_addr, len);
+    }
+
+    if (shm_locked) {
+        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+        shm_locked = 0;
+    }
+
+ fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SHM_CAS_OP);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+ fn_fail:
+    if (shm_locked) {
+        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+    }
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Shm_fop_op
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_Shm_fop_op(const void *origin_addr, void *result_addr,
+                                        MPI_Datatype datatype, int target_rank,
+                                        MPI_Aint target_disp, MPI_Op op, MPID_Win *win_ptr)
+{
+    void *base = NULL, *dest_addr = NULL;
+    MPI_User_function *uop = NULL;
+    int disp_unit;
+    int len, one, shm_locked = 0;
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_FOP_OP);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_FOP_OP);
+
+    if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
+        base = win_ptr->shm_base_addrs[target_rank];
+        disp_unit = win_ptr->disp_units[target_rank];
+
+        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
+        shm_locked = 1;
+    }
+    else {
+        base = win_ptr->base;
+        disp_unit = win_ptr->disp_unit;
+    }
+
+    dest_addr = (char *) base + disp_unit * target_disp;
+
+    MPID_Datatype_get_size_macro(datatype, len);
+    MPIU_Memcpy(result_addr, dest_addr, len);
+
+    uop = MPIR_OP_HDL_TO_FN(op);
+    one = 1;
+
+    (*uop)((void *) origin_addr, dest_addr, &one, &datatype);
+
+    if (shm_locked) {
+        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+        shm_locked = 0;
+    }
+
+ fn_exit:
+    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SHM_FOP_OP);
+    return mpi_errno;
+    /* --BEGIN ERROR HANDLING-- */
+ fn_fail:
+    if (shm_locked) {
+        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
+    }
+    goto fn_exit;
+    /* --END ERROR HANDLING-- */
+}
+
+
 #undef FUNCNAME
 #define FUNCNAME MPIDI_CH3I_Wait_for_pt_ops_finish
 #undef FCNAME
diff --git a/src/mpid/ch3/src/ch3u_rma_acc_ops.c b/src/mpid/ch3/src/ch3u_rma_acc_ops.c
index f50e6b1..82b1d2d 100644
--- a/src/mpid/ch3/src/ch3u_rma_acc_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_acc_ops.c
@@ -4,7 +4,6 @@
  *      See COPYRIGHT in top-level directory.
  */
 
-#include "mpidi_ch3_impl.h"
 #include "mpidrma.h"
 
 #ifdef USE_MPIU_INSTR
@@ -25,7 +24,6 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
     int mpi_errno = MPI_SUCCESS;
     MPIDI_msg_sz_t data_sz;
     int rank, origin_predefined, result_predefined, target_predefined;
-    int shm_locked = 0;
     int dt_contig ATTRIBUTE((unused));
     MPI_Aint dt_true_lb ATTRIBUTE((unused));
     MPID_Datatype *dtp;
@@ -64,141 +62,11 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
     /* Do =! rank first (most likely branch?) */
     if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED)
     {
-        MPI_User_function *uop;
-        void *base;
-        int disp_unit;
-
-        if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
-            base = win_ptr->shm_base_addrs[target_rank];
-            disp_unit = win_ptr->disp_units[target_rank];
-            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-            shm_locked = 1;
-        }
-        else {
-            base = win_ptr->base;
-            disp_unit = win_ptr->disp_unit;
-        }
-
-        /* Perform the local get first, then the accumulate */
-        mpi_errno = MPIR_Localcopy((char *) base + disp_unit * target_disp,
-                                   target_count, target_datatype,
-                                   result_addr, result_count, result_datatype);
-        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-
-        /* NO_OP: Don't perform the accumulate */
-        if (op == MPI_NO_OP) {
-            if (shm_locked) {
-                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-                shm_locked = 0;
-            }
-
-            goto fn_exit;
-        }
-
-        if (op == MPI_REPLACE) {
-            mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype,
-                                (char *) base + disp_unit * target_disp,
-                                target_count, target_datatype);
-
-            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-
-            if (shm_locked) {
-                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-                shm_locked = 0;
-            }
-
-            goto fn_exit;
-        }
-
-        MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN),
-                             mpi_errno, MPI_ERR_OP, "**opnotpredefined",
-                             "**opnotpredefined %d", op );
-
-        /* get the function by indexing into the op table */
-        uop = MPIR_OP_HDL_TO_FN(op);
-
-        if (origin_predefined && target_predefined) {
-            /* Cast away const'ness for origin_address in order to
-             * avoid changing the prototype for MPI_User_function */
-            (*uop)((void *) origin_addr, (char *) base + disp_unit*target_disp,
-                   &target_count, &target_datatype);
-        }
-        else {
-            /* derived datatype */
-
-            MPID_Segment *segp;
-            DLOOP_VECTOR *dloop_vec;
-            MPI_Aint first, last;
-            int vec_len, i, type_size, count;
-            MPI_Datatype type;
-            MPI_Aint true_lb, true_extent, extent;
-            void *tmp_buf=NULL, *target_buf;
-            const void *source_buf;
-
-            if (origin_datatype != target_datatype) {
-                /* first copy the data into a temporary buffer with
-                   the same datatype as the target. Then do the
-                   accumulate operation. */
-
-                MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent);
-                MPID_Datatype_get_extent_macro(target_datatype, extent);
-
-                MPIU_CHKLMEM_MALLOC(tmp_buf, void *,
-                                    target_count * (MPIR_MAX(extent,true_extent)),
-                                    mpi_errno, "temporary buffer");
-                /* adjust for potential negative lower bound in datatype */
-                tmp_buf = (void *)((char*)tmp_buf - true_lb);
-
-                mpi_errno = MPIR_Localcopy(origin_addr, origin_count,
-                                           origin_datatype, tmp_buf,
-                                           target_count, target_datatype);
-                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-            }
-
-            if (target_predefined) {
-                /* target predefined type, origin derived datatype */
-
-                (*uop)(tmp_buf, (char *) base + disp_unit * target_disp,
-                       &target_count, &target_datatype);
-            }
-            else {
-
-                segp = MPID_Segment_alloc();
-                MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER,
-                                     "**nomem","**nomem %s","MPID_Segment_alloc");
-                MPID_Segment_init(NULL, target_count, target_datatype, segp, 0);
-                first = 0;
-                last  = SEGMENT_IGNORE_LAST;
-
-                MPID_Datatype_get_ptr(target_datatype, dtp);
-                vec_len = dtp->max_contig_blocks * target_count + 1;
-                /* +1 needed because Rob says so */
-                MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *,
-                                    vec_len * sizeof(DLOOP_VECTOR),
-                                    mpi_errno, "dloop vector");
-
-                MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);
-
-                source_buf = (tmp_buf != NULL) ? tmp_buf : origin_addr;
-                target_buf = (char *) base + disp_unit * target_disp;
-                type = dtp->eltype;
-                type_size = MPID_Datatype_get_basic_size(type);
-
-                for (i=0; i<vec_len; i++) {
-                    count = (dloop_vec[i].DLOOP_VECTOR_LEN)/type_size;
-                    (*uop)((char *)source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
-                           (char *)target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
-                           &count, &type);
-                }
-
-                MPID_Segment_free(segp);
-            }
-        }
-
-        if (shm_locked) {
-            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-            shm_locked = 0;
-        }
+        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
+                                              result_addr, result_count, result_datatype,
+                                              target_rank, target_disp, target_count, target_datatype,
+                                              op, win_ptr);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     }
     else {
         MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
@@ -252,9 +120,6 @@ int MPIDI_Get_accumulate(const void *origin_addr, int origin_count,
 
     /* --BEGIN ERROR HANDLING-- */
   fn_fail:
-    if (shm_locked) {
-        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-    }
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -269,7 +134,6 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
                           MPI_Aint target_disp, MPID_Win *win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
-    int shm_locked = 0;
     int rank;
 
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_COMPARE_AND_SWAP);
@@ -296,35 +160,9 @@ int MPIDI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
      * that uses a processor atomic operation. */
     if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED)
     {
-        void *base, *dest_addr;
-        int disp_unit;
-        int len;
-
-        if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
-            base = win_ptr->shm_base_addrs[target_rank];
-            disp_unit = win_ptr->disp_units[target_rank];
-
-            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-            shm_locked = 1;
-        }
-        else {
-            base = win_ptr->base;
-            disp_unit = win_ptr->disp_unit;
-        }
-
-        dest_addr = (char *) base + disp_unit * target_disp;
-
-        MPID_Datatype_get_size_macro(datatype, len);
-        MPIU_Memcpy(result_addr, dest_addr, len);
-
-        if (MPIR_Compare_equal(compare_addr, dest_addr, datatype)) {
-            MPIU_Memcpy(dest_addr, origin_addr, len);
-        }
-
-        if (shm_locked) {
-            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-            shm_locked = 0;
-        }
+        mpi_errno = MPIDI_CH3I_Shm_cas_op(origin_addr, compare_addr, result_addr,
+                                          datatype, target_rank, target_disp, win_ptr);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     }
     else {
         MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
@@ -359,9 +197,6 @@ fn_exit:
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
 fn_fail:
-    if (shm_locked) {
-        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-    }
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
@@ -376,7 +211,6 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
                        MPI_Aint target_disp, MPI_Op op, MPID_Win *win_ptr)
 {
     int mpi_errno = MPI_SUCCESS;
-    int shm_locked = 0;
     int rank;
 
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_FETCH_AND_OP);
@@ -402,37 +236,9 @@ int MPIDI_Fetch_and_op(const void *origin_addr, void *result_addr,
      * that uses a processor atomic operation. */
     if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED)
     {
-        MPI_User_function *uop;
-        void *base, *dest_addr;
-        int disp_unit;
-        int len, one;
-
-        if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
-            base = win_ptr->shm_base_addrs[target_rank];
-            disp_unit = win_ptr->disp_units[target_rank];
-
-            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-            shm_locked = 1;
-        }
-        else {
-            base = win_ptr->base;
-            disp_unit = win_ptr->disp_unit;
-        }
-
-        dest_addr = (char *) base + disp_unit * target_disp;
-
-        MPID_Datatype_get_size_macro(datatype, len);
-        MPIU_Memcpy(result_addr, dest_addr, len);
-
-        uop = MPIR_OP_HDL_TO_FN(op);
-        one = 1;
-
-        (*uop)((void *) origin_addr, dest_addr, &one, &datatype);
-
-        if (shm_locked) {
-            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-            shm_locked = 0;
-        }
+        mpi_errno = MPIDI_CH3I_Shm_fop_op(origin_addr, result_addr, datatype,
+                                          target_rank, target_disp, op, win_ptr);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     }
     else {
         MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
@@ -465,9 +271,6 @@ fn_exit:
     return mpi_errno;
     /* --BEGIN ERROR HANDLING-- */
 fn_fail:
-    if (shm_locked) {
-        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-    }
     goto fn_exit;
     /* --END ERROR HANDLING-- */
 }
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 7189513..5f6fb4c 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -4,7 +4,6 @@
  *      See COPYRIGHT in top-level directory.
  */
 
-#include "mpidi_ch3_impl.h"
 #include "mpidrma.h"
 
 static int enableShortACC=1;
@@ -182,22 +181,9 @@ int MPIDI_Put(const void *origin_addr, int origin_count, MPI_Datatype
     /* If the put is a local operation, do it here */
     if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED)
     {
-        void *base;
-        int disp_unit;
-
-        if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
-            base = win_ptr->shm_base_addrs[target_rank];
-            disp_unit = win_ptr->disp_units[target_rank];
-        }
-        else {
-            base = win_ptr->base;
-            disp_unit = win_ptr->disp_unit;
-        }
-
-        mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype,
-                                   (char *) base + disp_unit * target_disp,
-                                   target_count, target_datatype);
-        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
+                                          target_disp, target_count, target_datatype, win_ptr);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     }
     else
     {
@@ -293,22 +279,9 @@ int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype
     /* If the get is a local operation, do it here */
     if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED)
     {
-        void *base;
-        int disp_unit;
-
-        if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
-            base = win_ptr->shm_base_addrs[target_rank];
-            disp_unit = win_ptr->disp_units[target_rank];
-        }
-        else {
-            base = win_ptr->base;
-            disp_unit = win_ptr->disp_unit;
-        }
-
-        mpi_errno = MPIR_Localcopy((char *) base + disp_unit * target_disp,
-                                   target_count, target_datatype, origin_addr,
-                                   origin_count, origin_datatype);
-        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
+                                          target_disp, target_count, target_datatype, win_ptr);
+        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     }
     else
     {
@@ -406,125 +379,10 @@ int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
     /* Do =! rank first (most likely branch?) */
     if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED)
     {
-        MPI_User_function *uop;
-        void *base;
-        int disp_unit, shm_op = 0;
-
-        if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
-            shm_op = 1;
-            base = win_ptr->shm_base_addrs[target_rank];
-            disp_unit = win_ptr->disp_units[target_rank];
-        }
-        else {
-            base = win_ptr->base;
-            disp_unit = win_ptr->disp_unit;
-        }
-	
-	if (op == MPI_REPLACE)
-	{
-            if (shm_op) MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-            mpi_errno = MPIR_Localcopy(origin_addr, origin_count,
-                                       origin_datatype,
-                                       (char *) base + disp_unit * target_disp,
-                                       target_count, target_datatype);
-            if (shm_op) MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-	    goto fn_exit;
-	}
-	
-	MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN), 
-			     mpi_errno, MPI_ERR_OP, "**opnotpredefined",
-			     "**opnotpredefined %d", op );
-	
-	/* get the function by indexing into the op table */
-	uop = MPIR_OP_HDL_TO_FN(op);
-	
-	if (origin_predefined && target_predefined)
-	{    
-            /* Cast away const'ness for origin_address in order to
-             * avoid changing the prototype for MPI_User_function */
-            if (shm_op) MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-            (*uop)((void *) origin_addr, (char *) base + disp_unit*target_disp,
-                   &target_count, &target_datatype);
-            if (shm_op) MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-	}
-	else
-	{
-	    /* derived datatype */
-	    
-	    MPID_Segment *segp;
-	    DLOOP_VECTOR *dloop_vec;
-	    MPI_Aint first, last;
-	    int vec_len, i, type_size, count;
-	    MPI_Datatype type;
-	    MPI_Aint true_lb, true_extent, extent;
-	    void *tmp_buf=NULL, *target_buf;
-            const void *source_buf;
-	    
-	    if (origin_datatype != target_datatype)
-	    {
-		/* first copy the data into a temporary buffer with
-		   the same datatype as the target. Then do the
-		   accumulate operation. */
-		
-		MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent);
-		MPID_Datatype_get_extent_macro(target_datatype, extent); 
-		
-		MPIU_CHKLMEM_MALLOC(tmp_buf, void *, 
-			target_count * (MPIR_MAX(extent,true_extent)), 
-			mpi_errno, "temporary buffer");
-		/* adjust for potential negative lower bound in datatype */
-		tmp_buf = (void *)((char*)tmp_buf - true_lb);
-		
-		mpi_errno = MPIR_Localcopy(origin_addr, origin_count,
-					   origin_datatype, tmp_buf,
-					   target_count, target_datatype);  
-		if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-	    }
-
-	    if (target_predefined) { 
-		/* target predefined type, origin derived datatype */
-
-                if (shm_op) MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-                (*uop)(tmp_buf, (char *) base + disp_unit * target_disp,
-                       &target_count, &target_datatype);
-                if (shm_op) MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-	    }
-	    else {
-	    
-		segp = MPID_Segment_alloc();
-		MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER, 
-				    "**nomem","**nomem %s","MPID_Segment_alloc"); 
-		MPID_Segment_init(NULL, target_count, target_datatype, segp, 0);
-		first = 0;
-		last  = SEGMENT_IGNORE_LAST;
-		
-		MPID_Datatype_get_ptr(target_datatype, dtp);
-		vec_len = dtp->max_contig_blocks * target_count + 1; 
-		/* +1 needed because Rob says so */
-		MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *, 
-				    vec_len * sizeof(DLOOP_VECTOR), 
-				    mpi_errno, "dloop vector");
-		
-		MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);
-		
-		source_buf = (tmp_buf != NULL) ? tmp_buf : origin_addr;
-		target_buf = (char *) base + disp_unit * target_disp;
-		type = dtp->eltype;
-		type_size = MPID_Datatype_get_basic_size(type);
-                if (shm_op) MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
-		for (i=0; i<vec_len; i++)
-		{
-		    count = (dloop_vec[i].DLOOP_VECTOR_LEN)/type_size;
-		    (*uop)((char *)source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
-			   (char *)target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF),
-			   &count, &type);
-		}
-                if (shm_op) MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
-		
-		MPID_Segment_free(segp);
-	    }
-	}
+	mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
+					  target_rank, target_disp, target_count, target_datatype,
+					  op, win_ptr);
+	if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     }
     else
     {

http://git.mpich.org/mpich.git/commitdiff/382bdfb2a2d35025b906033bc63fea98e3c551e3

commit 382bdfb2a2d35025b906033bc63fea98e3c551e3
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Mon Jul 22 10:03:49 2013 -0500

    modify SHM_Win_allocate_shared and SHM_Win_free to accomodate global comm.
    
    MPIDI_CH3I_Win_allocate_shared can be called by both MPI_Win_allocate_shared
    and MPI_Win_allocate. If it is called by MPI_Win_allocate, we need node_comm,
    node_sizes and node_shm_base_addrs to allocate shm segment region, hence we need
    to copy from win_ptr->sizes to node_sizes at beginning and copy from node_shm_base_addrs
    to win_ptr->shm_base_addrs at last. If it is called by MPI_Win_allocate_shared,
    these copies can be eliminated.
    
    If there is only one process on this node, node_comm is NULL, we use comm_self instead.
    
    Signed-off-by: Pavan Balaji <balaji at mcs.anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c b/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
index 9e3f7ef..6045f54 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
@@ -84,8 +84,19 @@ int MPIDI_CH3_SHM_Win_free(MPID_Win **win_ptr)
 
     /* Free shared process mutex memory region */
     if ((*win_ptr)->shm_mutex && (*win_ptr)->shm_segment_len > 0) {
-
-        if ((*win_ptr)->comm_ptr->rank == 0) {
+        MPID_Comm *node_comm_ptr = NULL;
+
+        /* When allocating shared memory region segment, we need comm of processes
+           that are on the same node as this process (node_comm).
+           If node_comm == NULL, this process is the only one on this node, therefore
+           we use comm_self as node comm. */
+        if ((*win_ptr)->comm_ptr->node_comm != NULL)
+            node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
+        else
+            node_comm_ptr = MPIR_Process.comm_self;
+        MPIU_Assert(node_comm_ptr != NULL);
+
+        if (node_comm_ptr->rank == 0) {
             MPIDI_CH3I_SHM_MUTEX_DESTROY(*win_ptr);
         }
 
diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
index 8942ed2..6b4fdce 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c
@@ -49,11 +49,15 @@ static int MPIDI_CH3I_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Inf
 {
     int mpi_errno = MPI_SUCCESS;
     int i, k, comm_size, rank;
+    int  node_size, node_rank;
+    MPID_Comm *node_comm_ptr;
+    MPI_Aint *node_sizes;
+    void **node_shm_base_addrs;
     MPI_Aint *tmp_buf;
     int errflag = FALSE;
     int noncontig = FALSE;
     MPIU_CHKPMEM_DECL(6);
-    MPIU_CHKLMEM_DECL(1);
+    MPIU_CHKLMEM_DECL(3);
     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHARED);
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHARED);
@@ -77,6 +81,18 @@ static int MPIDI_CH3I_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Inf
     comm_size = (*win_ptr)->comm_ptr->local_size;
     rank      = (*win_ptr)->comm_ptr->rank;
 
+    /* When allocating shared memory region segment, we need comm of processes
+       that are on the same node as this process (node_comm).
+       If node_comm == NULL, this process is the only one on this node, therefore
+       we use comm_self as node comm. */
+    if ((*win_ptr)->comm_ptr->node_comm != NULL)
+        node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
+    else
+        node_comm_ptr = MPIR_Process.comm_self;
+    MPIU_Assert(node_comm_ptr != NULL);
+    node_size = node_comm_ptr->local_size;
+    node_rank = node_comm_ptr->rank;
+
     MPIU_INSTR_DURATION_START(wincreate_allgather);
     /* allocate memory for the base addresses, disp_units, and
        completion counters of all processes */
@@ -119,6 +135,14 @@ static int MPIDI_CH3I_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Inf
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
     MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
+    if ((*win_ptr)->create_flavor != MPI_WIN_FLAVOR_SHARED) {
+        MPIU_CHKLMEM_MALLOC(node_sizes, MPI_Aint *, node_size*sizeof(MPI_Aint), mpi_errno, "node_sizes");
+        for (i = 0; i < node_size; i++) node_sizes[i] = 0;
+    }
+    else {
+        node_sizes = (*win_ptr)->sizes;
+    }
+
     (*win_ptr)->shm_segment_len = 0;
     k = 0;
     for (i = 0; i < comm_size; ++i) {
@@ -126,11 +150,24 @@ static int MPIDI_CH3I_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Inf
         (*win_ptr)->disp_units[i]      = (int) tmp_buf[k++];
         (*win_ptr)->all_win_handles[i] = (MPI_Win) tmp_buf[k++];
 
+        if ((*win_ptr)->create_flavor != MPI_WIN_FLAVOR_SHARED) {
+            /* If create flavor is not MPI_WIN_FLAVOR_SHARED, all processes on this
+               window may not be on the same node. Because we only need the sizes of local
+               processes (in order), we copy their sizes to a seperate array and keep them
+               in order, fur purpose of future use of calculating shm_base_addrs. */
+            if ((*win_ptr)->comm_ptr->intranode_table[i] >= 0) {
+                MPIU_Assert((*win_ptr)->comm_ptr->intranode_table[i] < node_size);
+                node_sizes[(*win_ptr)->comm_ptr->intranode_table[i]] = (*win_ptr)->sizes[i];
+            }
+        }
+    }
+
+    for (i = 0; i < node_size; i++) {
         if (noncontig)
             /* Round up to next page size */
-            (*win_ptr)->shm_segment_len += MPIDI_CH3_ROUND_UP_PAGESIZE((*win_ptr)->sizes[i]);
+            (*win_ptr)->shm_segment_len += MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[i]);
         else
-            (*win_ptr)->shm_segment_len += (*win_ptr)->sizes[i];
+            (*win_ptr)->shm_segment_len += node_sizes[i];
     }
 
     if ((*win_ptr)->shm_segment_len == 0) {
@@ -141,7 +178,7 @@ static int MPIDI_CH3I_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Inf
     mpi_errno = MPIU_SHMW_Hnd_init(&(*win_ptr)->shm_segment_handle);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
-    if (rank == 0) {
+    if (node_rank == 0) {
         char *serialized_hnd_ptr = NULL;
 
         /* create shared memory region for all processes in win and map */
@@ -153,12 +190,12 @@ static int MPIDI_CH3I_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Inf
         mpi_errno = MPIU_SHMW_Hnd_get_serialized_by_ref((*win_ptr)->shm_segment_handle, &serialized_hnd_ptr);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
-        mpi_errno = MPIR_Bcast_impl(serialized_hnd_ptr, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, (*win_ptr)->comm_ptr, &errflag);
+        mpi_errno = MPIR_Bcast_impl(serialized_hnd_ptr, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr, &errflag);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
         /* wait for other processes to attach to win */
-        mpi_errno = MPIR_Barrier_impl((*win_ptr)->comm_ptr, &errflag);
+        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
@@ -170,7 +207,7 @@ static int MPIDI_CH3I_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Inf
         char serialized_hnd[MPIU_SHMW_GHND_SZ] = {0};
 
         /* get serialized handle from rank 0 and deserialize it */
-        mpi_errno = MPIR_Bcast_impl(serialized_hnd, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, (*win_ptr)->comm_ptr, &errflag);
+        mpi_errno = MPIR_Bcast_impl(serialized_hnd, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr, &errflag);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
@@ -182,7 +219,7 @@ static int MPIDI_CH3I_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Inf
                                          (char **)&(*win_ptr)->shm_base_addr, 0);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
-        mpi_errno = MPIR_Barrier_impl((*win_ptr)->comm_ptr, &errflag);
+        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
     }
@@ -191,7 +228,7 @@ static int MPIDI_CH3I_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Inf
     mpi_errno = MPIU_SHMW_Hnd_init(&(*win_ptr)->shm_mutex_segment_handle);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
-    if (rank == 0) {
+    if (node_rank == 0) {
         char *serialized_hnd_ptr = NULL;
 
         /* create shared memory region for all processes in win and map */
@@ -205,12 +242,12 @@ static int MPIDI_CH3I_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Inf
         mpi_errno = MPIU_SHMW_Hnd_get_serialized_by_ref((*win_ptr)->shm_mutex_segment_handle, &serialized_hnd_ptr);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
-        mpi_errno = MPIR_Bcast_impl(serialized_hnd_ptr, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, (*win_ptr)->comm_ptr, &errflag);
+        mpi_errno = MPIR_Bcast_impl(serialized_hnd_ptr, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr, &errflag);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
         /* wait for other processes to attach to win */
-        mpi_errno = MPIR_Barrier_impl((*win_ptr)->comm_ptr, &errflag);
+        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
@@ -221,7 +258,7 @@ static int MPIDI_CH3I_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Inf
         char serialized_hnd[MPIU_SHMW_GHND_SZ] = {0};
 
         /* get serialized handle from rank 0 and deserialize it */
-        mpi_errno = MPIR_Bcast_impl(serialized_hnd, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, (*win_ptr)->comm_ptr, &errflag);
+        mpi_errno = MPIR_Bcast_impl(serialized_hnd, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr, &errflag);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
 
@@ -233,25 +270,51 @@ static int MPIDI_CH3I_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Inf
                                          (char **)&(*win_ptr)->shm_mutex, 0);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 
-        mpi_errno = MPIR_Barrier_impl((*win_ptr)->comm_ptr, &errflag);
+        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
         MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
     }
 
     /* compute the base addresses of each process within the shared memory segment */
     {
+        if ((*win_ptr)->create_flavor != MPI_WIN_FLAVOR_SHARED) {
+            /* If create flavor is not MPI_WIN_FLAVOR_SHARED, all processes on this
+               window may not be on the same node. Because we only need to calculate
+               local processes' shm_base_addrs using local processes's sizes,
+               we allocate a temporary array to place results and copy results
+               back to shm_base_addrs on the window at last. */
+            MPIU_CHKLMEM_MALLOC(node_shm_base_addrs, void **, node_size*sizeof(void*),
+                                mpi_errno, "node_shm_base_addrs");
+        }
+        else {
+            node_shm_base_addrs = (*win_ptr)->shm_base_addrs;
+        }
+
         char *cur_base = (*win_ptr)->shm_base_addr;
-        (*win_ptr)->shm_base_addrs[0] = (*win_ptr)->shm_base_addr;
-        for (i = 1; i < comm_size; ++i) {
-            if ((*win_ptr)->sizes[i]) {
+        node_shm_base_addrs[0] = (*win_ptr)->shm_base_addr;
+        for (i = 1; i < node_size; ++i) {
+            if (node_sizes[i]) {
                 if (noncontig) {
-                    (*win_ptr)->shm_base_addrs[i] = cur_base + MPIDI_CH3_ROUND_UP_PAGESIZE((*win_ptr)->sizes[i-1]);
+                    node_shm_base_addrs[i] = cur_base + MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[i-1]);
                 } else {
-                    (*win_ptr)->shm_base_addrs[i] = cur_base + (*win_ptr)->sizes[i-1];
+                    node_shm_base_addrs[i] = cur_base + node_sizes[i-1];
                 }
-                cur_base = (*win_ptr)->shm_base_addrs[i];
+                cur_base = node_shm_base_addrs[i];
             } else {
-                (*win_ptr)->shm_base_addrs[i] = NULL; /* FIXME: Is this right? */
+                node_shm_base_addrs[i] = NULL; /* FIXME: Is this right? */
+            }
+        }
+
+        if ((*win_ptr)->create_flavor != MPI_WIN_FLAVOR_SHARED) {
+            /* if MPI_WIN_FLAVOR_SHARED is not set, copy from node_shm_base_addrs to
+               (*win_ptr)->shm_base_addrs */
+            for (i = 0; i < comm_size; i++) {
+                if ((*win_ptr)->comm_ptr->intranode_table[i] >= 0) {
+                    MPIU_Assert((*win_ptr)->comm_ptr->intranode_table[i] < node_size);
+                    (*win_ptr)->shm_base_addrs[i] = node_shm_base_addrs[(*win_ptr)->comm_ptr->intranode_table[i]];
+                }
+                else
+                    (*win_ptr)->shm_base_addrs[i] = NULL;
             }
         }
     }

http://git.mpich.org/mpich.git/commitdiff/e397148092dc2377b0671ebb22238a1a3456ea90

commit e397148092dc2377b0671ebb22238a1a3456ea90
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Fri Jul 26 15:42:25 2013 -0500

    Waiting for passive RMA operations to finish in MPIDI_CH3_SHM_Win_free.
    
    This is for the optimization of allocating shared memory region in
    MPI_Win_allocate. In this case MPIDI_CH3_SHM_Win_free must first wait
    for passive RMA operations to finish before free the shared memory region.
    
    Note that because MPIDI_CH3_SHM_Win_free calls MPIDI_Win_free at last,
    and MPIDI_Win_free will also call inline function of waiting for passive
    RMA operation to finish, in this case the inline function will be called
    twice.
    
    Signed-off-by: Pavan Balaji <balaji at mcs.anl.gov>

diff --git a/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c b/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
index 08c6a1d..9e3f7ef 100644
--- a/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
+++ b/src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c
@@ -64,6 +64,9 @@ int MPIDI_CH3_SHM_Win_free(MPID_Win **win_ptr)
 
     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3_SHM_WIN_FREE);
 
+    mpi_errno = MPIDI_CH3I_Wait_for_pt_ops_finish(*win_ptr);
+    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
+
     /* Free shared memory region */
     if ((*win_ptr)->shm_allocated) {
         /* free shm_base_addrs that's only used for shared memory windows */

http://git.mpich.org/mpich.git/commitdiff/5c6b971af446d9ffaafdb35da449c78b82b92d8e

commit 5c6b971af446d9ffaafdb35da449c78b82b92d8e
Author: Xin Zhao <xinzhao3 at illinois.edu>
Date:   Fri Jul 26 15:40:58 2013 -0500

    Refactoring code of waiting passive RMA operations.
    
    Moving code of waiting finish of passive RMA operations from MPIDI_Win_free
    to an inline function in mpidrma.h.
    
    Signed-off-by: Pavan Balaji <balaji at mcs.anl.gov>

diff --git a/src/mpid/ch3/include/mpidrma.h b/src/mpid/ch3/include/mpidrma.h
index 26d1b17..8abc78d 100644
--- a/src/mpid/ch3/include/mpidrma.h
+++ b/src/mpid/ch3/include/mpidrma.h
@@ -8,6 +8,12 @@
 
 #include "mpl_utlist.h"
 
+#ifdef USE_MPIU_INSTR
+MPIU_INSTR_DURATION_EXTERN_DECL(wincreate_allgather);
+MPIU_INSTR_DURATION_EXTERN_DECL(winfree_rs);
+MPIU_INSTR_DURATION_EXTERN_DECL(winfree_complete);
+#endif
+
 typedef enum MPIDI_RMA_Op_type {
     MPIDI_RMA_PUT               = 23,
     MPIDI_RMA_GET               = 24,
@@ -301,6 +307,59 @@ static inline MPIDI_RMA_Ops_list_t *MPIDI_CH3I_RMA_Get_ops_list(MPID_Win *win_pt
     }
 }
 
+
+#undef FUNCNAME
+#define FUNCNAME MPIDI_CH3I_Wait_for_pt_ops_finish
+#undef FCNAME
+#define FCNAME MPIDI_QUOTE(FUNCNAME)
+static inline int MPIDI_CH3I_Wait_for_pt_ops_finish(MPID_Win *win_ptr)
+{
+    int mpi_errno = MPI_SUCCESS, total_pt_rma_puts_accs;
+    MPID_Comm *comm_ptr;
+    int errflag = FALSE;
+    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_WAIT_FOR_PT_OPS_FINISH);
+
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_WAIT_FOR_PT_OPS_FINISH);
+
+    comm_ptr = win_ptr->comm_ptr;
+    MPIU_INSTR_DURATION_START(winfree_rs);
+    mpi_errno = MPIR_Reduce_scatter_block_impl(win_ptr->pt_rma_puts_accs,
+                                               &total_pt_rma_puts_accs, 1,
+                                               MPI_INT, MPI_SUM, comm_ptr, &errflag);
+    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
+    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
+    MPIU_INSTR_DURATION_END(winfree_rs);
+
+    if (total_pt_rma_puts_accs != win_ptr->my_pt_rma_puts_accs)
+    {
+	MPID_Progress_state progress_state;
+
+	/* poke the progress engine until the two are equal */
+	MPIU_INSTR_DURATION_START(winfree_complete);
+	MPID_Progress_start(&progress_state);
+	while (total_pt_rma_puts_accs != win_ptr->my_pt_rma_puts_accs)
+	{
+	    mpi_errno = MPID_Progress_wait(&progress_state);
+	    /* --BEGIN ERROR HANDLING-- */
+	    if (mpi_errno != MPI_SUCCESS)
+	    {
+		MPID_Progress_end(&progress_state);
+		MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
+	    }
+	    /* --END ERROR HANDLING-- */
+	}
+	MPID_Progress_end(&progress_state);
+	MPIU_INSTR_DURATION_END(winfree_complete);
+    }
+
+ fn_exit:
+    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_WAIT_FOR_PT_OPS_FINISH);
+    return mpi_errno;
+ fn_fail:
+    goto fn_exit;
+}
+
+
 #undef FUNCNAME
 #undef FCNAME
 
diff --git a/src/mpid/ch3/src/ch3u_rma_ops.c b/src/mpid/ch3/src/ch3u_rma_ops.c
index 95d47b9..7189513 100644
--- a/src/mpid/ch3/src/ch3u_rma_ops.c
+++ b/src/mpid/ch3/src/ch3u_rma_ops.c
@@ -10,9 +10,6 @@
 static int enableShortACC=1;
 
 #ifdef USE_MPIU_INSTR
-MPIU_INSTR_DURATION_EXTERN_DECL(wincreate_allgather);
-MPIU_INSTR_DURATION_EXTERN_DECL(winfree_rs);
-MPIU_INSTR_DURATION_EXTERN_DECL(winfree_complete);
 MPIU_INSTR_DURATION_EXTERN_DECL(rmaqueue_alloc);
 MPIU_INSTR_DURATION_EXTERN_DECL(rmaqueue_set);
 extern void MPIDI_CH3_RMA_InitInstr(void);
@@ -55,37 +52,10 @@ int MPIDI_Win_free(MPID_Win **win_ptr)
     MPIU_ERR_CHKANDJUMP((*win_ptr)->epoch_state != MPIDI_EPOCH_NONE,
                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
 
-    comm_ptr = (*win_ptr)->comm_ptr;
-    MPIU_INSTR_DURATION_START(winfree_rs);
-    mpi_errno = MPIR_Reduce_scatter_block_impl((*win_ptr)->pt_rma_puts_accs, 
-                                               &total_pt_rma_puts_accs, 1, 
-                                               MPI_INT, MPI_SUM, comm_ptr, &errflag);
-    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
-    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
-    MPIU_INSTR_DURATION_END(winfree_rs);
-
-    if (total_pt_rma_puts_accs != (*win_ptr)->my_pt_rma_puts_accs)
-    {
-	MPID_Progress_state progress_state;
-            
-	/* poke the progress engine until the two are equal */
-	MPIU_INSTR_DURATION_START(winfree_complete);
-	MPID_Progress_start(&progress_state);
-	while (total_pt_rma_puts_accs != (*win_ptr)->my_pt_rma_puts_accs)
-	{
-	    mpi_errno = MPID_Progress_wait(&progress_state);
-	    /* --BEGIN ERROR HANDLING-- */
-	    if (mpi_errno != MPI_SUCCESS)
-	    {
-		MPID_Progress_end(&progress_state);
-		MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
-	    }
-	    /* --END ERROR HANDLING-- */
-	}
-	MPID_Progress_end(&progress_state);
-	MPIU_INSTR_DURATION_END(winfree_complete);
-    }
+    mpi_errno = MPIDI_CH3I_Wait_for_pt_ops_finish(*win_ptr);
+    if(mpi_errno) MPIU_ERR_POP(mpi_errno);
 
+    comm_ptr = (*win_ptr)->comm_ptr;
     mpi_errno = MPIR_Comm_free_impl(comm_ptr);
     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
 

-----------------------------------------------------------------------

Summary of changes:
 src/mpid/ch3/channels/nemesis/src/ch3_rma_shm.c |   18 +-
 src/mpid/ch3/channels/nemesis/src/ch3_win_fns.c |  116 ++++-
 src/mpid/ch3/include/mpidpre.h                  |    1 +
 src/mpid/ch3/include/mpidrma.h                  |  606 +++++++++++++++++++++++
 src/mpid/ch3/src/ch3u_handle_recv_req.c         |   26 +-
 src/mpid/ch3/src/ch3u_rma_acc_ops.c             |  254 ++---------
 src/mpid/ch3/src/ch3u_rma_ops.c                 |  308 +++---------
 src/mpid/ch3/src/ch3u_rma_sync.c                |  120 +++++-
 src/mpid/ch3/src/ch3u_win_fns.c                 |    7 +
 src/mpid/ch3/src/mpid_rma.c                     |   19 +-
 test/mpi/rma/Makefile.am                        |    7 +
 test/mpi/rma/mcs-mutex.c                        |   10 +
 test/mpi/rma/mcs-mutex.h                        |    1 +
 test/mpi/rma/req_example.c                      |   13 +-
 test/mpi/rma/testlist                           |    2 +
 test/mpi/rma/win_info.c                         |    3 +
 16 files changed, 1023 insertions(+), 488 deletions(-)


hooks/post-receive
-- 
MPICH primary repository


More information about the commits mailing list